<h1>Task 2: Data Anonymisation</h1>

In [21]:
import pandas as pd
import hashlib
from pathlib import Path

In [22]:
# Locate the customer workbook inside the local "data" folder and load it.
# NOTE: despite its name, csv_path points at an Excel (.xlsx) file, which is
# why it is read with pd.read_excel rather than pd.read_csv.
data_dir = Path("data")
csv_path = data_dir.joinpath("mobile_customers.xlsx")
df = pd.read_excel(io=csv_path)

In [23]:
# List the column labels of the loaded frame to see which fields are present.
df.columns

Index(['Unnamed: 0', 'customer_id', 'date_registered', 'username', 'name',
       'gender', 'address', 'email', 'birthdate', 'current_location',
       'residence', 'employer', 'job', 'age', 'salary', 'credit_card_provider',
       'credit_card_number', 'credit_card_security_code',
       'credit_card_expire'],
      dtype='object')

In [24]:
# Show the dtype pandas assigned to each column when the workbook was read.
df.dtypes

Unnamed: 0                            int64
customer_id                          object
date_registered              datetime64[ns]
username                             object
name                                 object
gender                               object
address                              object
email                                object
birthdate                    datetime64[ns]
current_location                     object
residence                            object
employer                             object
job                                  object
age                                   int64
salary                                int64
credit_card_provider                 object
credit_card_number                    int64
credit_card_security_code             int64
credit_card_expire                   object
dtype: object

In [25]:
# Remove columns that don't provide helpful information for analysis
# (the spreadsheet's leftover index column plus personal / payment details).
columns_to_drop = [
    "Unnamed: 0",
    "username",
    "name",
    "email",
    "birthdate",
    "residence",
    "address",
    "credit_card_provider",
    "credit_card_number",
    "credit_card_security_code",
    "credit_card_expire",
    "current_location",
]
df = df.drop(columns=columns_to_drop)

# Anonymisation: replace each employer / job value with the SHA-256 hex digest
# of its string form, so equal inputs still map to equal outputs.
# NOTE(review): the digest is unsalted; if the underlying values are guessable
# (e.g. common job titles) they could be recovered by hashing candidates and
# comparing. Consider a keyed hash if that matters for this dataset.
for hashed_column in ("employer", "job"):
    df[hashed_column] = df[hashed_column].apply(
        lambda value: hashlib.sha256(str(value).encode()).hexdigest()
    )

# grouping the values
def age_groups(input: int) -> str:
    """Map an exact age to the label of its ten-year band.

    The band is computed arithmetically instead of from the first character
    of the number's string form, so single-digit ages (7 -> "0 - 9") and
    ages of 100 or more (105 -> "100 - 109") land in the correct band. For
    two-digit ages the label is unchanged (42 -> "40 - 49").

    Args:
        input: Age in whole years; the dataset is expected to satisfy
            18 <= age <= 65. The name shadows the ``input`` builtin but is
            kept so existing keyword callers keep working.

    Returns:
        A label of the form "<low> - <high>" spanning ten consecutive ages.
    """
    # int() normalises numpy integer scalars handed over by Series.apply.
    band_start = (int(input) // 10) * 10
    return f"{band_start} - {band_start + 9}"
# Replace each exact age with the band label returned by age_groups.
df["age"] = df["age"].apply(age_groups)

# Coarsen salaries to the nearest thousand. Python's round() on an int with a
# negative ndigits rounds halves to the even multiple (2500 -> 2000,
# 3500 -> 4000).
df["salary"] = df["salary"].apply(lambda amount: round(int(amount), -3))

# Display the transformed frame as the cell's output.
df

Unnamed: 0,customer_id,date_registered,gender,employer,job,age,salary
0,24c9d2d0-d0d3-4a90-9a3a-e00e4aac99bd,2021-09-29,M,a2e5631b9eb4ab110d98fff28178c4bd20e9c2dea0d822...,847953c7c1bef61e2ea6d7cb7d8061e3cfc7890239d2aa...,40 - 49,54000
1,7b2bc220-0296-4914-ba46-d6cc6a55a62a,2019-08-17,F,a8a4a9f4669c14da68479a0cb5505d4115c9d6cee096ef...,5124dd1227dcf3135500059a76ed9a2bb37c80f8341fde...,40 - 49,82000
2,06febdf9-07fb-4a1b-87d7-a5f97d9a5faf,2019-11-01,M,ce5b9ed5f6fddafa761843a9d1235c312fbafb8516b7fd...,8d4277a3e9386a478273e969d4e24c2cbce451677fd49e...,40 - 49,205000
3,23df88e5-5dd3-46af-ac0d-0c6bd92e4b96,2021-12-31,F,8f037308147ed902c152496c8c02f2569bf5cd75b75b60...,1aa7059579295f5d0d837fa8f8c01f0206a9207e0daa7d...,30 - 39,116000
4,6069c2d7-7905-4993-a155-64f6aba143b1,2020-08-09,F,ecdd87961fc686b931267ace5a440882bf64c3c80c2973...,f04c1c7e30210f09d47fca93074fed043943bf351b0337...,50 - 59,108000
...,...,...,...,...,...,...,...
9995,d6424be2-4351-4535-8bca-34775cdcc411,2021-09-12,F,6278b322cfb9a42b082554be7aa86c98c726d76a92f9a0...,892166d95492472cb1b103ddfe2f6fb8ba2975fe943901...,50 - 59,241000
9996,d9ac8491-5e97-46ba-a7f3-d12491e9487c,2020-08-15,F,6d9783f1d33672854f3c0a1eb3ef8e1b05f10f907ff1b7...,34437c3150c914c0a6b25c085599eab69d27f71f91ecb4...,30 - 39,26000
9997,1d5e94c5-bfc9-439d-84f6-6bf6c613cb16,2019-10-24,F,d23477b7868ee40fbfbd339a9a4dd2878b691d227ae86d...,29f178456b5671da2a773a24bd59e0ea80bd11968950f8...,20 - 29,245000
9998,15549c9b-caa9-413f-9dd5-84b43648ca23,2020-08-14,M,ac0a1342c56b85609e095e82e5876986b280a0b61b0dba...,568fcbb621f83801e5bd734dd893247db66c715a22661b...,40 - 49,233000


In [26]:
# Write the anonymised frame to CSV in the working directory, leaving out the
# pandas row index so it does not reappear as an extra column.
output_path = Path("anonymised_mobile_customers.csv")
df.to_csv(output_path, index=False)