# Re-identification and De-identification

In [50]:
import pandas as pd

In [51]:
"""
Useful display function for dataframe
"""
def display_df(df, nrows=10, ncols=None):
    with pd.option_context('display.max_rows', nrows, 'display.max_columns', ncols):
        display (df)

## Import data

In [2]:
#whole unaltered dataset
df_raw = pd.read_csv("../mid_sample_set.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Drop Unnecessary Fields and Clean NaNs

In [65]:
"""
Reads configuration file, a list of strings seperated by new lines, and returns a list
"""
def read_config(file):
    with open(file) as f:
        config_list = [(l) for l in f.read().split()]
    f.close()
    return config_list

In [66]:
qis = read_config('config.txt')

In [67]:
qis

['cc_by_ip',
 'countryLabel',
 'continent',
 'city',
 'region',
 'subdivision',
 'postalCode',
 'LoE',
 'YoB',
 'gender',
 'nforum_posts',
 'nforum_votes',
 'nforum_endorsed',
 'nforum_threads',
 'nforum_comments',
 'nforum_pinned',
 'nforum_events']

We only need to keep the 'user_id' as a key, the quasi-identifiers, and the 'completed' field to find the completion rate. Everything else can be dropped.

In [68]:
df_qis = df_raw[['user_id'] + qis + ['completed']]

Many of the fields contain NaNs when they actually should contain 0. We will replace those values.

In [69]:
"""
Takes list of fields with NaNs and fills NaN values with fill_val. Does this inplace.
"""
def replace_NaNs(df, labels, fill_val):
    for label in labels:
        df[label].fillna(fill_val, inplace=True)
"""
Gets ratio of NaNs for each column
"""
def stats_NaN(df):
    df_stats = pd.DataFrame(index=[df.columns], columns=["NaN Ratio"])
    for col in df.columns:
        df_stats["NaN Ratio"][col] = df[col].isna().sum()/len(df) #NaN ratio
    return df_stats.sort_values(by=['NaN Ratio'])

In [70]:
stats_NaN(df_qis)

Unnamed: 0,NaN Ratio
user_id,0.0
completed,0.0
continent,0.110371
countryLabel,0.111971
cc_by_ip,0.112171
gender,0.131326
LoE,0.139956
YoB,0.150226
nforum_events,0.184851
city,0.225491


In [71]:
NaN_to_0_fields = ['YoB', 'postalCode', 'nforum_posts', 'nforum_votes', 'nforum_endorsed', 
                   'nforum_threads', 'nforum_comments', 'nforum_pinned', 'nforum_events']
replace_NaNs(df_qis, NaN_to_0_fields, 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [72]:
#NaN_to_empty = set(df_qis.columns) - set(NaN_to_0_fields) - set(['user_id'])
#replace_NaNs(df_qis, NaN_to_empty, "")

In [73]:
df_qis

Unnamed: 0,user_id,cc_by_ip,countryLabel,continent,city,region,subdivision,postalCode,LoE,YoB,gender,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,nforum_events,completed
0,29940,US,United States,Americas,Austin,TX,Texas,78713,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
1,37095,BD,Bangladesh,Asia,Dhaka,13,Dhaka,0,b,1991.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,45634,CO,Colombia,Americas,Medellín,ANT,Antioquia,0,m,1982.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,52234,SE,Sweden,Europe,Skanör,M,Skåne,0,p,1988.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,52238,MX,Mexico,Americas,León,GUA,Guanajuato,0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,79526,US,United States,Americas,Hoboken,NJ,New Jersey,07030,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
6,85566,TR,Turkey,Asia,Ankara,06,Ankara,0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
7,86941,IN,India,Asia,,,,0,b,1991.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
8,99599,SE,Sweden,Europe,Kista,AB,Stockholm,16432,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
9,115862,US,United States,Americas,Vallejo,CA,California,94591,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [None]:
k_anon = pd.DataFrame(df_qis.groupby(qis).size()

In [81]:
display_df(pd.DataFrame(df_qis.groupby(qis+['user_id']).size()),nrows = 1000)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,0
cc_by_ip,countryLabel,continent,city,region,subdivision,postalCode,LoE,YoB,gender,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,nforum_events,user_id,Unnamed: 18_level_1
AD,Andorra,Europe,Andorra La Vella,07,Andorra la Vella,0,m,1972.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2747910,1
AD,Andorra,Europe,Engordany,08,Escaldes-Engordany,0,a,1973.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5386336,1
AD,Andorra,Europe,Engordany,08,Escaldes-Engordany,0,m,1984.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,580526,1
AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,a,1988.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8877854,1
AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,a,1992.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15195614,1
AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,b,1954.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1342281,1
AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,b,1966.0,m,27.0,3.0,0.0,24.0,3.0,0.0,0.0,14249604,1
AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,b,1967.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9844893,1
AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,b,1967.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10749635,1
AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,b,1969.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7512819,1


In [80]:
df_qis[(df_qis['YoB']==1967) & (df_qis['city']=='Abu Dhabi') & (df_qis['gender']=='f')]

Unnamed: 0,user_id,cc_by_ip,countryLabel,continent,city,region,subdivision,postalCode,LoE,YoB,gender,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,nforum_events,completed
158529,10749635,AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,b,1967.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
165164,9844893,AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,b,1967.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [82]:
df_qis[df_qis['user_id']==7094933]

Unnamed: 0,user_id,cc_by_ip,countryLabel,continent,city,region,subdivision,postalCode,LoE,YoB,gender,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,nforum_events,completed
12524,7094933,AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,hs,1997.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
132543,7094933,AE,United Arab Emirates,Asia,Abu Dhabi,AZ,Abu Dhabi,0,hs,1997.0,f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
