# Re-identification and De-identification

In [1]:
import pandas as pd

In [3]:
"""
Useful display function for dataframe
"""
def display_df(df, nrows=10, ncols=None):
    """
    Render a dataframe with temporary row/column display limits.

    df: the object handed to `display`.
    nrows: value used for pandas' 'display.max_rows' while rendering.
    ncols: value used for pandas' 'display.max_columns' while rendering.

    The options are applied through pd.option_context, so they only hold
    for the duration of this call.
    """
    display_limits = ('display.max_rows', nrows, 'display.max_columns', ncols)
    with pd.option_context(*display_limits):
        # `display` is not imported in this file; it is expected to be
        # provided by the notebook environment.
        display(df)

## Import data

In [2]:
# Whole, unaltered dataset; every later frame in this notebook is derived from it.
# NOTE(review): the truncated output under this cell looks like the tail of a
# warning emitted during the read (possibly a mixed-dtype warning) — confirm,
# and consider passing explicit dtypes if so.
df_raw = pd.read_csv("../mid_sample_set.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Drop Unnecessary Fields and Clean NaNs

In [25]:
"""
Reads a configuration file, a list of strings separated by new lines, and returns a list
"""
def read_config(file):
    """
    Read a configuration file and return its entries as a list of strings.

    file: path of a text file holding one entry per line.

    Each line is stripped of surrounding whitespace and blank lines are
    skipped. The file is split per line rather than on arbitrary whitespace,
    so an entry that contains spaces is kept as a single item.
    """
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(file) as f:
        return [entry for entry in (line.strip() for line in f) if entry]

In [26]:
# Load the list of quasi-identifier column names from the config file.
qis = read_config('config.txt')

In [27]:
# Show the entries that were read from the config file.
qis

['user_id',
 'cc_by_ip',
 'countryLabel',
 'continent',
 'city',
 'region',
 'subdivision',
 'postalCode',
 'LoE',
 'YoB',
 'gender',
 'nforum_posts',
 'nforum_votes',
 'nforum_endorsed',
 'nforum_threads',
 'nforum_comments',
 'nforum_pinned',
 'nforum_events']

We only need to keep the quasi-identifiers. Everything else can be dropped.

In [21]:
# Keep only the quasi-identifier columns. Take an explicit copy so df_qis is
# an independent frame: later in-place cleaning (e.g. filling NaNs) then
# modifies df_qis without pandas treating it as a slice of df_raw.
df_qis = df_raw[qis].copy()

In [28]:
# Preview the quasi-identifier subset using the default display limits.
display_df(df_qis)

Unnamed: 0,user_id,cc_by_ip,countryLabel,continent,city,region,subdivision,postalCode,LoE,YoB,gender,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,nforum_events
0,29940,US,United States,Americas,Austin,TX,Texas,78713,,,,,,,,,,0.0
1,37095,BD,Bangladesh,Asia,Dhaka,13,Dhaka,,b,1991.0,m,,,,,,,0.0
2,45634,CO,Colombia,Americas,Medellín,ANT,Antioquia,,m,1982.0,m,,,,,,,0.0
3,52234,SE,Sweden,Europe,Skanör,M,Skåne,,p,1988.0,m,,,,,,,0.0
4,52238,MX,Mexico,Americas,León,GUA,Guanajuato,,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,15291085,AU,Australia,Oceania,Silverdale,NSW,New South Wales,2752,jhs,2002.0,,,,,,,,0.0
199995,15292716,RU,Russian Federation,Europe,Yekaterinburg,SVE,Sverdlovskaya Oblast',620000,,,,,,,,,,0.0
199996,15295130,TR,Turkey,Asia,Istanbul,34,Istanbul,,b,1996.0,f,,,,,,,0.0
199997,15296396,US,United States,Americas,Marshfield,MA,Massachusetts,02050,,2000.0,,1.0,0.0,0.0,0.0,1.0,0.0,0.0


Many of the fields contain NaNs when they actually should contain 0. We will replace those values.

In [23]:
"""
Takes list of fields with NaNs and fills NaN values with fill_val
"""
def replace_NaNs(df, labels, fill_val):
    """
    Fill the NaN values of the given columns of df with fill_val, in place.

    df: DataFrame to modify; it is mutated and nothing is returned.
    labels: iterable of column labels whose NaNs should be replaced.
    fill_val: value written wherever a listed column holds NaN.
    """
    for label in labels:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df[label]: the in-place call acts on an
        # intermediate Series and does not reliably reach df (it is a no-op
        # when pandas Copy-on-Write is active).
        df[label] = df[label].fillna(fill_val)
"""
Gets ratio of NaNs for each column
"""
def stats_NaN(df):
    """
    Compute the ratio of NaN values for each column of df.

    df: DataFrame to inspect; it is not modified.

    Returns a DataFrame indexed by the column labels of df, with a single
    float column "NaN Ratio" (NaN count divided by the number of rows),
    sorted in ascending order of that ratio.
    """
    # The mean of the boolean NaN mask is the per-column NaN ratio. Computing
    # it in one vectorised step avoids chained item assignment, which stops
    # updating the target when pandas Copy-on-Write is active.
    nan_ratio = df.isna().mean()
    return nan_ratio.to_frame(name="NaN Ratio").sort_values(by=['NaN Ratio'])

In [24]:
# Per-column NaN ratios of the quasi-identifier subset, lowest first.
stats_NaN(df_qis)

Unnamed: 0,NaN Ratio
user_id,0.0
continent,0.110371
countryLabel,0.111971
cc_by_ip,0.112171
gender,0.131326
LoE,0.139956
YoB,0.150226
nforum_events,0.184851
city,0.225491
subdivision,0.242641
