# Re-identification and De-identification

In [50]:
import pandas as pd

In [51]:
"""
Useful display function for dataframe
"""
def display_df(df, nrows=10, ncols=None):
    with pd.option_context('display.max_rows', nrows, 'display.max_columns', ncols):
        display (df)

## Import data

In [2]:
#whole unaltered dataset
df_raw = pd.read_csv("../mid_sample_set.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Drop Unnecessary Fields and Clean NaNs

In [65]:
"""
Reads configuration file, a list of strings seperated by new lines, and returns a list
"""
def read_config(file):
    with open(file) as f:
        config_list = [(l) for l in f.read().split()]
    f.close()
    return config_list

In [66]:
qis = read_config('config.txt')

In [67]:
qis

['cc_by_ip',
 'countryLabel',
 'continent',
 'city',
 'region',
 'subdivision',
 'postalCode',
 'LoE',
 'YoB',
 'gender',
 'nforum_posts',
 'nforum_votes',
 'nforum_endorsed',
 'nforum_threads',
 'nforum_comments',
 'nforum_pinned',
 'nforum_events']

We only need to keep the `user_id` as a key, the quasi-identifiers, and the `completed` field to find the completion rate. Everything else can be dropped. Then we can clean the dataset.

In [87]:
df_clean = df_raw[['user_id'] + qis + ['completed']]

Many of the fields contain NaNs when they actually should contain 0. We will replace those values.

In [88]:
"""
Takes list of fields with NaNs and fills NaN values with fill_val. Does this inplace.
"""
def replace_NaNs(df, labels, fill_val):
    for label in labels:
        df[label].fillna(fill_val, inplace=True)
"""
Gets ratio of NaNs for each column
"""
def stats_NaN(df):
    df_stats = pd.DataFrame(index=[df.columns], columns=["NaN Ratio"])
    for col in df.columns:
        df_stats["NaN Ratio"][col] = df[col].isna().sum()/len(df) #NaN ratio
    return df_stats.sort_values(by=['NaN Ratio'])

In [89]:
stats_NaN(df_clean)

Unnamed: 0,NaN Ratio
user_id,0.0
completed,0.0
continent,0.110371
countryLabel,0.111971
cc_by_ip,0.112171
gender,0.131326
LoE,0.139956
YoB,0.150226
nforum_events,0.184851
city,0.225491


In [90]:
NaN_to_0_fields = ['YoB', 'postalCode', 'nforum_posts', 'nforum_votes', 'nforum_endorsed', 
                   'nforum_threads', 'nforum_comments', 'nforum_pinned', 'nforum_events']
replace_NaNs(df_clean, NaN_to_0_fields, 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [72]:
#NaN_to_empty = set(df_qis.columns) - set(NaN_to_0_fields) - set(['user_id'])
#replace_NaNs(df_clean, NaN_to_empty, "")

## Add Useful Statistical Fields and Drop Duplicates

In [104]:
df_clean.sort_values('user_id', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


For the sake of k-anonymity analysis, we will remove duplicates from the dataset. However, we must first preserve valuable statistics like the completion rate, which means we must create new columns `nStarted` and `nCompleted` which keeps track of the amount of classes started and completed respectively by a given user.

In [130]:
df_clean = df_clean.join(pd.DataFrame(df_clean.groupby('user_id').size(), 
                                      columns=['nStarted']),
                         on='user_id')

In [133]:
df_clean = df_clean.join(pd.DataFrame(df_clean[df_clean['completed']==True].groupby(['user_id']).size(), 
                                      columns=['nCompleted']),
                   on='user_id')

In [136]:
#Fix NaNs in nCompleted row
replace_NaNs(df_clean, ['nCompleted'], 0)

In [138]:
df_clean.drop(columns=['completed'], inplace=True)

In [139]:
display_df(df_clean,nrows=1000)

Unnamed: 0,user_id,cc_by_ip,countryLabel,continent,city,region,subdivision,postalCode,LoE,YoB,gender,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,nforum_events,nStarted,nCompleted
194141,1,US,United States,Americas,Cambridge,MA,Massachusetts,02138,p,0.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
41519,8,US,United States,Americas,Cambridge,MA,Massachusetts,02139,p,1959.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
20131,11,US,United States,Americas,Monson,MA,Massachusetts,01057,,1980.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
172418,30,US,United States,North America,,,,0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
84114,64,US,United States,Americas,Stanford,CA,California,94305,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
34681,79,US,United States,Americas,Kirkland,WA,Washington,98033,,1974.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
107797,117,US,United States,Americas,Minneapolis,MN,Minnesota,55414,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
163100,120,US,United States,North America,,,,0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
8266,194,US,United States,Americas,Santa Rosa,CA,California,95401,,0.0,,3.0,0.0,0.0,2.0,1.0,0.0,167.0,1,1.0
14526,210,MX,Mexico,Americas,Pachuca,HID,Hidalgo,42080,b,1985.0,m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
