# BONUS 🎁 Anonymisation des microdonnées

In [None]:
# sqlite3 ships with the Python standard library, so there is nothing to install:
# `pip install sqlite3` cannot succeed and only produces an error in this cell.

In [None]:
import pandas as pd
import sqlite3

# Open the SQLite database file that stores the `persons` and `haircuts` tables.
con = sqlite3.connect('./datasets/persons.db')

# Fetch every person together with the matching haircut row (when one exists),
# joined on the national number, then display the resulting frame.
query = 'SELECT * FROM persons LEFT JOIN haircuts ON persons.national_number = haircuts.national_number'
df = pd.read_sql(query, con)
df

In [None]:
# Count how many persons share each (haircut, gender) combination.
group_sizes = df.groupby(['haircut', 'gender']).size()
pivot_table = group_sizes.reset_index(name='count')

# Display the groups ordered from the rarest to the most common.
pivot_table.sort_values(by='count', ascending=True, ignore_index=True)

![image](./anonymisation.drawio.png)

In [None]:
# ------------------
# anonymisation functions
# ------------------

def check_if_need_to_be_anonymised(df, nMax):
    """Tell whether at least one group is still below the size threshold.

    Args:
        df: DataFrame holding a ``count`` column.
        nMax: Threshold compared against the smallest ``count`` value.

    Returns:
        ``True`` when the smallest ``count`` is strictly lower than ``nMax``,
        ``False`` otherwise.
    """
    smallest_group = df['count'].min()
    return bool(smallest_group < nMax)

def get_similarity_score(df):
    """Rank every group by how closely it resembles the smallest group.

    The row with the lowest ``count`` is taken as the reference. Each row
    receives a ``score`` equal to the fraction of feature columns (every
    column except ``count`` and ``score``) whose value equals the reference
    row's value. The input frame is not modified.

    Args:
        df: DataFrame with a ``count`` column and at least one feature column.

    Returns:
        A new DataFrame with a ``score`` column, sorted by descending score
        then ascending count, with a fresh 0..n-1 index. The reference row
        scores 1.0 and therefore comes first, followed by its most similar
        group.

    Raises:
        IndexError: if ``df`` has no rows.
        ZeroDivisionError: if ``df`` has rows but no feature column.
    """
    df = df.sort_values(by=['count'], ascending=True)

    # Reference row: the group with the fewest persons.
    first_line = df.iloc[0]

    # Every column takes part in the comparison except the bookkeeping ones.
    feature_columns = [
        column for column in df.columns if column not in ('count', 'score')
    ]

    # Scores are collected in row order and assigned positionally. Writing
    # them one by one with ``df.loc[index, 'score']`` would overwrite every
    # row sharing the same index label when the index is not unique.
    scores = []
    for _, row in df.iterrows():
        matches = sum(
            1 for column in feature_columns
            if row[column] == first_line[column]
        )
        scores.append(matches / len(feature_columns))
    df['score'] = scores

    df = df.sort_values(by=['score', 'count'], ascending=[False, True])
    df = df.reset_index(drop=True)

    return df

def combine_similar_rows(df, count_column='count'):
    """Merge the first two rows of ``df`` into a single group.

    The counts of the two rows are added. For every other column, when the
    two values differ they are replaced by the sorted, de-duplicated list of
    their labels joined with ``" + "`` (so ``"a + b"`` merged with ``"b"``
    stays ``"a + b"``). The input frame is left untouched.

    Args:
        df: DataFrame whose first two rows are the groups to merge. A
            ``score`` column, if present, is discarded before merging.
        count_column: Name of the column holding the group sizes.

    Returns:
        A new DataFrame without the ``score`` column and with one row less,
        sorted by ascending ``count_column`` with a fresh 0..n-1 index.

    Raises:
        IndexError: if ``df`` has fewer than two rows.
    """
    # Drop the helper column first so it is never merged like a feature
    # (which would write a string such as "0.5 + 1.0" into a float column),
    # and rebuild a positional index so label 0 / 1 are the first two rows.
    # Both calls return new objects, so the caller's frame is not mutated.
    df = df.drop(columns=['score'], errors='ignore').reset_index(drop=True)

    first_row = df.iloc[0]
    second_row = df.iloc[1]

    for column in df.columns:
        if column == count_column:
            # Sum the sizes of the two groups.
            df.at[0, column] = first_row[column] + second_row[column]
        elif first_row[column] != second_row[column]:
            # Values are handled through their string form; splitting on
            # " + " lets previously merged labels be de-duplicated.
            labels = f"{first_row[column]} + {second_row[column]}".split(" + ")
            # Make sure the column can hold the merged string label even if
            # it was numeric so far.
            df[column] = df[column].astype(object)
            df.at[0, column] = " + ".join(sorted(set(labels)))

    # The second row is now absorbed into the first one.
    df = df.drop(index=1)

    df = df.sort_values(by=[count_column], ascending=True)
    df = df.reset_index(drop=True)

    return df

In [None]:
def anonymise_microdata(df, nMax):
    """Merge the smallest groups until every group reaches ``nMax`` persons.

    At each iteration the group with the lowest count is merged with the
    group that resembles it most, and the iteration number is printed.

    Args:
        df: DataFrame of groups with a ``count`` column plus feature columns.
        nMax: Minimum count every group must reach.

    Returns:
        The DataFrame of merged groups. If the sum of all counts is below
        ``nMax`` the result is a single group that is still too small; a
        ``UserWarning`` is emitted in that case.
    """
    import warnings

    iteration = 0
    # Stop when a single group is left: there is nothing to merge it with,
    # and trying would fail with an IndexError in combine_similar_rows.
    while len(df) > 1 and check_if_need_to_be_anonymised(df, nMax):
        df_with_score = get_similarity_score(df)
        df = combine_similar_rows(df_with_score)
        print(f'Iteration {iteration}')
        iteration += 1

    if check_if_need_to_be_anonymised(df, nMax):
        warnings.warn(
            f'Could not reach {nMax} persons per group: the remaining group '
            'is still below the threshold.',
            stacklevel=2,
        )

    return df

In [None]:
# Anonymise the grouped counts with a threshold of 120 persons per group,
# then display the result.
min_group_size = 120
df_anon = anonymise_microdata(pivot_table, min_group_size)
df_anon