# Data cleaning project

In [1]:
import pandas as pd
import numpy as np

## Exploring

In [5]:
#Open the excel file - import it
# note : you need the openpyxl module in the environment you are currently using.
df=pd.read_excel('5_New_markets.xlsx')
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,458996,female,Yes,69,No,,0.0,Low,1.0,Cat_6
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6
...,...,...,...,...,...,...,...,...,...,...
2622,467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6
2623,467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6
2624,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6
2625,467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4


In [7]:
#Check if there are full duplicates : answer is no 
# We therefore have 2627 rows (individuals)
len(df.drop_duplicates())

2627

In [13]:
# The table has 10 columns : ID, gender (F/M), ever_married (Y/N), age (num in yrs), graduated (Yes/No), profession (text), 
# work_experience (in years), family_size (number of individuals), spending_score (Low/Med/High)
df.describe()
# Numeric values : 
# AGE mean is 43, median 41, range is from 18 to 89 (adults), with 75% over 30, so mainly active population
# WORK XP ranges from 0 to 14 years, with a mean of 2.5 years and 75% with under 4 years of experience 
# Work XP is a weird series because older people should have more than 14 years of experience 
# FAMILY SIZE is 1 to 9 individuals (range 8), mean of 2.8 and median of 2.  


Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,2627.0,2627.0,2358.0,2514.0
mean,463433.918919,43.649791,2.552587,2.825378
std,2618.245698,16.967015,3.341094,1.551906
min,458989.0,18.0,0.0,1.0
25%,461162.5,30.0,0.0,2.0
50%,463379.0,41.0,1.0,2.0
75%,465696.0,53.0,4.0,4.0
max,467968.0,89.0,14.0,9.0


In [35]:
df.describe(include='O')
#Categorical variables : 
# GENDER : male or female (with renaming necessary): 54% male 
# EVER MARRIED (Y/N) : 59% yes
# GRADUATED (Y/N) : 61% yes
# PROFESSION : 
    # note : some missing values 
    # Artists : 30%
    # Heathcare : 16%
    # Entertainment : 12%
    # Doctor : 9%
    # Engineer : 9%
    # Executive : 7%
    # Marketing : 4%
    # Homemaker : 3%
# SPENDING SCORE
    # Low : 61%
    # Average : 24%
    # High : 15% 
# CATEGORY 
    # Category 6 = 64%
    # Category 4 = 15%
    # Category 3 = 10%
    # Category 2 = 5%
    # other categories < 3% 



Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1
count,2627,2577,2603,2589,2627,2595
unique,2,2,2,9,3,7
top,M,Yes,Yes,Artist,Low,Cat_6
freq,1424,1520,1602,802,1616,1672


In [40]:
df["Var_1"].value_counts(normalize=True)
# using normalize returns percentage 

Cat_6    0.644316
Cat_4    0.148748
Cat_3    0.102890
Cat_2    0.054335
Cat_7    0.025434
Cat_1    0.013102
Cat_5    0.011175
Name: Var_1, dtype: float64

In [16]:
df['Gender']=df["Gender"].map({'Male':'M', 'Female':'F', 'female':'F'})
list(df["Gender"].unique())

['F', 'M']

In [9]:
# Find out if there are missing values
df.isna().sum()
# 

ID                   0
Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
dtype: int64

In [27]:
df.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [42]:
#trying to find a correlation 
# only works with numeric values
# so not very interesting to understand spending score
# This one doesn't work : df.corrwith(df["Var_1"]).value_counts()
df.corr(method='pearson', min_periods=1, numeric_only=True)

Unnamed: 0,ID,Age,Work_Experience,Family_Size
ID,1.0,-0.02294,-0.025445,0.02083
Age,-0.02294,1.0,-0.186238,-0.285237
Work_Experience,-0.025445,-0.186238,1.0,-0.071253
Family_Size,0.02083,-0.285237,-0.071253,1.0


In [53]:
#pd.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False)
pd.crosstab([df["Spending_Score"]], [df["Gender"], df["Ever_Married"], df["Graduated"]], normalize=True)
# df["Ever_Married"] df["Profession"] df["Graduated"]
# This is really cool to explore categorical variables
# however it doesn't allow to regroup numeric values - we would need to do this 

Gender,F,F,F,F,M,M,M,M
Ever_Married,No,No,Yes,Yes,No,No,Yes,Yes
Graduated,No,Yes,No,Yes,No,Yes,No,Yes
Spending_Score,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Average,0.0,0.0,0.024276,0.0787,0.0,0.0,0.038763,0.094753
High,0.0,0.0,0.021143,0.038763,0.0,0.0,0.031715,0.054033
Low,0.091229,0.122944,0.024276,0.054816,0.114722,0.081832,0.036413,0.091621


In [68]:
pd.crosstab([df["Gender"]], [df["Profession"]], normalize=True)

Profession,Artist,Doctor,Engineer,Entertainment,Executive,Healthcare,Homemaker,Lawyer,Marketing
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
F,0.153727,0.044805,0.07107,0.032831,0.00309,0.062572,0.025879,0.043646,0.022789
M,0.156045,0.048667,0.020085,0.08343,0.06489,0.09888,0.005794,0.041715,0.020085


In [58]:
df_high=df[df["Spending_Score"].isin(["High"])]
df_high

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6
5,459003,M,Yes,47,Yes,Doctor,0.0,High,5.0,Cat_4
20,459056,M,Yes,63,No,Executive,,High,3.0,Cat_6
21,459057,M,Yes,69,No,Lawyer,,High,,Cat_6
23,459059,M,Yes,79,No,Executive,,High,2.0,Cat_6
...,...,...,...,...,...,...,...,...,...,...
2591,467868,F,Yes,66,Yes,Entertainment,0.0,High,2.0,Cat_6
2592,467876,F,Yes,50,Yes,Artist,9.0,High,2.0,Cat_6
2593,467879,F,Yes,51,Yes,Artist,,High,4.0,Cat_6
2605,467905,M,Yes,37,Yes,Executive,0.0,High,3.0,Cat_6


In [62]:
pd.crosstab([df_high["Spending_Score"]], [df_high["Gender"], df_high["Ever_Married"], df_high["Graduated"], normalize=True)

df_high["Var_1"]]

# In the high spending category, we only have married people
# 37% men + graduated
# 27% women + graduated 
# 22% men - not graduated 
# 15% women - not graduated 
# So at the moment, we know we should focus on married + graduated
# Gender : not really representative because same proportion as original population 

Gender,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
Ever_Married,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
Graduated,No,No,No,No,No,Yes,Yes,Yes,Yes,Yes,...,No,No,No,No,No,Yes,Yes,Yes,Yes,Yes
Var_1,Cat_1,Cat_3,Cat_4,Cat_6,Cat_7,Cat_2,Cat_3,Cat_4,Cat_5,Cat_6,...,Cat_2,Cat_3,Cat_4,Cat_6,Cat_7,Cat_2,Cat_3,Cat_4,Cat_6,Cat_7
Spending_Score,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
High,0.002717,0.008152,0.021739,0.105978,0.008152,0.008152,0.013587,0.013587,0.002717,0.220109,...,0.002717,0.019022,0.029891,0.157609,0.008152,0.01087,0.016304,0.029891,0.309783,0.005435


In [63]:
df_high["Profession"].value_counts(normalize=True)
# Top spending professions : lawyers (31%), executives (30%), and then artists (20%).
# Which sort of repeats the graduate argument, but also pushes towards considering men as key targets (executive is mainly male)


Lawyer           0.307087
Executive        0.304462
Artist           0.196850
Entertainment    0.041995
Engineer         0.041995
Marketing        0.031496
Doctor           0.028871
Healthcare       0.026247
Homemaker        0.020997
Name: Profession, dtype: float64

In [70]:
df.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,2627.0,2627.0,2358.0,2514.0
mean,463433.918919,43.649791,2.552587,2.825378
std,2618.245698,16.967015,3.341094,1.551906
min,458989.0,18.0,0.0,1.0
25%,461162.5,30.0,0.0,2.0
50%,463379.0,41.0,1.0,2.0
75%,465696.0,53.0,4.0,4.0
max,467968.0,89.0,14.0,9.0


In [98]:
list_quart=list (df["Age"].quantile([0.25, 0.5, 0.75]))
list_quart

[30.0, 41.0, 53.0]

In [115]:
df["Age_Category"]="None"
df.loc[df["Age"] <= list_quart[0] , "Age_Category"] = "Under 30"
df.loc[df["Age"] > list_quart[0] , "Age_Category"] = "Between 30 and 41"
df.loc[df["Age"] > list_quart[1] , "Age_Category"] = "Between 41 and 53"
df.loc[df["Age"] >= list_quart[2] , "Age_Category"] = "Over 53"
df 


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Age_Category
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,Between 30 and 41
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,Between 30 and 41
2,458996,F,Yes,69,No,,0.0,Low,1.0,Cat_6,Over 53
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,Over 53
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,Under 30
...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,Under 30
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,Between 30 and 41
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,Over 53
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,Between 41 and 53


In [116]:
df["Age_Category"].value_counts()

Between 30 and 41    688
Under 30             675
Over 53              661
Between 41 and 53    603
Name: Age_Category, dtype: int64

In [118]:
df["Family_Size_Category"] = df["Family_Size"].fillna(2)
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Age_Category,Family_Size_Category
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,Between 30 and 41,1.0
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,Between 30 and 41,4.0
2,458996,F,Yes,69,No,,0.0,Low,1.0,Cat_6,Over 53,1.0
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,Over 53,2.0
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,Under 30,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,Under 30,4.0
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,Between 30 and 41,1.0
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,Over 53,2.0
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,Between 41 and 53,5.0


In [122]:
df["Family_Size"].value_counts()

2.0    768
1.0    512
3.0    455
4.0    444
5.0    200
6.0     78
7.0     26
9.0     16
8.0     15
Name: Family_Size, dtype: int64

In [123]:
print(881-768)

113


In [119]:
df["Family_Size_Category"].value_counts()

2.0    881
1.0    512
3.0    455
4.0    444
5.0    200
6.0     78
7.0     26
9.0     16
8.0     15
Name: Family_Size_Category, dtype: int64

In [127]:
df["Family_Size_Category_cat"]= "None"
df.loc[df["Family_Size_Category"] == 1, "Family_Size_Category_cat"] = "One person"
df.loc[df["Family_Size_Category"] == 2, "Family_Size_Category_cat"] = "Two people"
df.loc[(df["Family_Size_Category"] == 3)|(df["Family_Size_Category"] == 4), "Family_Size_Category_cat"] = "Three or four people"
df.loc[df["Family_Size_Category"] >= 5, "Family_Size_Category_cat"] = "5 or more people"
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Age_Category,Family_Size_Category,Family_Size_Category_cat
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,Between 30 and 41,1.0,One person
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,Between 30 and 41,4.0,Three or four people
2,458996,F,Yes,69,No,,0.0,Low,1.0,Cat_6,Over 53,1.0,One person
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,Over 53,2.0,Two people
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,Under 30,4.0,Three or four people
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,Under 30,4.0,Three or four people
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,Between 30 and 41,1.0,One person
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,Over 53,2.0,Two people
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,Between 41 and 53,5.0,5 or more people


In [128]:
df["Family_Size_Category_cat"].value_counts()

Three or four people    899
Two people              881
One person              512
5 or more people        335
Name: Family_Size_Category_cat, dtype: int64

In [130]:
pd.crosstab([df["Family_Size_Category_cat"]], [df["Spending_Score"]], normalize=True)

Spending_Score,Average,High,Low
Family_Size_Category_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5 or more people,0.027788,0.016749,0.082984
One person,0.000761,0.002284,0.191854
Three or four people,0.105443,0.048725,0.188047
Two people,0.103921,0.079178,0.152265


In [None]:
Family_Size_Category_cat