# Data cleaning project with Pandas

In [2]:
import pandas as pd

## Exploring the data 

In [3]:
#Import the excel file (note : you need the openpyxl module in the environment you are currently using). 
df=pd.read_excel('5_New_markets.xlsx')

In [4]:
#Check if there are full duplicates : answer is no 
len(df.drop_duplicates())

2627

In [5]:
df
# The table has 2627 rows (individuals)
# The table has 10 columns : ID, gender (F/M), ever_married (Y/N), age (num in yrs), graduated (Yes/No), profession (text), 
# work_experience (in years), family_size (number of individuals), spending_score (Low/Med/High)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,458996,female,Yes,69,No,,0.0,Low,1.0,Cat_6
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6
...,...,...,...,...,...,...,...,...,...,...
2622,467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6
2623,467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6
2624,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6
2625,467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4


In [6]:
df.describe()
# We use describe to find out more about our numeric values : 
# AGE mean is 43, median 41, range is from 18 to 89 (adults), with 75% over 30, so mainly active population
# FAMILY SIZE is 1 to 9 individuals (range 8), mean of 2.8 and median of 2.  
# WORK XP ranges from 0 to 14 years, with a mean of 2.5 years and 75% with under 4 years of experience 
    # Work XP is a weird series because older people should have more than 14 years of experience. 
    # For the moment, we are choosing to not use this column. 
    # This can be justified by the fact that age and work_experience are normally redundant. 
    # If we choose to analyse it later, the work XP column could refer the time in the current occupation. 
    # BONUS

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,2627.0,2627.0,2358.0,2514.0
mean,463433.918919,43.649791,2.552587,2.825378
std,2618.245698,16.967015,3.341094,1.551906
min,458989.0,18.0,0.0,1.0
25%,461162.5,30.0,0.0,2.0
50%,463379.0,41.0,1.0,2.0
75%,465696.0,53.0,4.0,4.0
max,467968.0,89.0,14.0,9.0


In [7]:
df.describe(include='O')
# We can also use describe to look into our categorical variables : 
# This is useful to see how the data is organized, but also proportions in the whole population. 
# GENDER : male or female (after renaming): 54% male 
# EVER MARRIED (Y/N) : 59% yes
# GRADUATED (Y/N) : 61% yes
# PROFESSION : 
    # note : some missing values 
    # Artists : 30%
    # Heathcare : 16%
    # Entertainment : 12%
    # Doctor : 9%
    # Engineer : 9%
    # Executive : 7%
    # Marketing : 4%
    # Homemaker : 3%
# SPENDING SCORE
    # Low : 61%
    # Average : 24%
    # High : 15% 
# CATEGORY 
    # Category 6 = 64%
    # Category 4 = 15%
    # Category 3 = 10%
    # Category 2 = 5%
    # other categories < 3% 
    # This Var_1 variable is quite weird, we don't really know what it refers to. 
    # For the moment, we are not considering it. If we have time, we'll get back to it to check its correlations with other variables. 
    # BONUS
# Note : these detailed statistics were gathered using value counts for each of the variables.  
        # df["Var_1"].value_counts(normalize=True)
# Note 2: using "normalize" returns a percentage instead of a simple count. 
# We did all this before choosing how to impute missing values 

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1
count,2627,2577,2603,2589,2627,2595
unique,3,2,2,9,3,7
top,Male,Yes,Yes,Artist,Low,Cat_6
freq,1424,1520,1602,802,1616,1672


In [8]:
df["Var_1"].value_counts(normalize=True)
# Note that value counts can also be used to get a cross table 

Cat_6    0.644316
Cat_4    0.148748
Cat_3    0.102890
Cat_2    0.054335
Cat_7    0.025434
Cat_1    0.013102
Cat_5    0.011175
Name: Var_1, dtype: float64

In [9]:
# We noticed there are 3 unique values in gender. When listing them, we notice it is only a upper/lower case issue. 
# So we fixed that. 
df['Gender']=df["Gender"].map({'Male':'M', 'Female':'F', 'female':'F'})
list(df["Gender"].unique())

['F', 'M']

In [10]:
df["Gender"].value_counts(normalize=True)

M    0.542063
F    0.457937
Name: Gender, dtype: float64

In [11]:
# Before we perform calculations, we checked the data types for the different columns. 
# One thing we can notice is that we mainly have categorical variables (objects). 
df.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [12]:
# We looked to see if there are missing values
df.isna().sum()
# We notice there are no missing values for gender, age, and spending score, which will be our key variables. 
# There are a few missing variables for ever_married, graduated, and profession (<50 missing values)
# There are a little more missing variables for family size (113/2627 rows)
# And still a little more for work_experience (269/2627 rows) - but we are not considering this column yet. 
# We decided against forcing imputation at this point. 

ID                   0
Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
dtype: int64

## Correlations exploration 

In [13]:
# Our initial idea was to try and find a correlation between the different variables and "spending_score" 
# Indeed, we believe this is what the table is trying to show : what are the characteristics of the people with a high spending score ? 
# Answering this question would allow to know which market share (population) the company should focus on for new markets. 

# So we thought of using the correlation method. However, it only works with numeric values. 
# The variable we want to explain, however (spending score) is categorical. 
# So this doesn't work : df.corrwith(df["Var_1"]).value_counts()
# In addition, the numeric values we have don't tell us much about our population 
df.corr(method='pearson', min_periods=1, numeric_only=True)

Unnamed: 0,ID,Age,Work_Experience,Family_Size
ID,1.0,-0.02294,-0.025445,0.02083
Age,-0.02294,1.0,-0.186238,-0.285237
Work_Experience,-0.025445,-0.186238,1.0,-0.071253
Family_Size,0.02083,-0.285237,-0.071253,1.0


In [14]:
# Our lovely TA, Berkay, introduces us to the "crosstab" method (https://pandas.pydata.org/docs/reference/api/pandas.crosstab.html) 
# This allows us to check, for some variables, how the population is distributed based on another characteristic. 
# DETAIL OF THE FUNCTION PARAMETERS : pd.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, 
# margins=False, margins_name='All', dropna=True, normalize=False)
# For instance, spending score per gender (normalized) will return a table which shows that in the total population: 
# 10% are women are in spending score average, 32% are men in spending score low. 

pd.crosstab([df["Spending_Score"]], [df["Gender"]], normalize=True)

Gender,F,M
Spending_Score,Unnamed: 1_level_1,Unnamed: 2_level_1
Average,0.10354,0.134374
High,0.060525,0.08641
Low,0.293871,0.321279


In [16]:
df.groupby('Spending_Score', as_index=True).mean()
# This allows us to notice the difference in Age between the three categories 

  df.groupby('Spending_Score', as_index=True).mean()


Unnamed: 0_level_0,ID,Age,Work_Experience,Family_Size
Spending_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Average,463558.8848,47.3072,2.253097,3.066993
High,463320.34715,59.489637,2.136364,2.931217
Low,463412.715347,38.451733,2.762133,2.7021


In [19]:
# We can also check how individuals are distributed in the different spending bins 
df.groupby('Spending_Score', as_index=True).count()

Unnamed: 0_level_0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Family_Size,Var_1
Spending_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Average,625,625,610,625,619,615,565,612,617
High,386,386,375,386,382,381,330,378,381
Low,1616,1616,1592,1616,1602,1593,1463,1524,1597


In [20]:
# This first crosstab was an exploration of data before we started imputing - to have the original correlations. 
a = pd.crosstab([df["Spending_Score"]], [df["Gender"], df["Ever_Married"], df["Graduated"]], margins=True, normalize=True)
a 
# if we do it with our currently functional categorical variables
# i.e. df["Gender"], df["Ever_Married"] and df["Graduated"]
# we notice that one key variable is "Ever_Married" : there are no never_married women and no never_married men in the "high" - or even average - score. 
# Note : we tried including df["Profession"], but the results were hard to read for the moment. 
# In addition, profession and graduated may be redundant. 
# BONUS 

Gender,F,F,F,F,M,M,M,M,All
Ever_Married,No,No,Yes,Yes,No,No,Yes,Yes,Unnamed: 9_level_1
Graduated,No,Yes,No,Yes,No,Yes,No,Yes,Unnamed: 9_level_2
Spending_Score,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Average,0.0,0.0,0.024276,0.0787,0.0,0.0,0.038763,0.094753,0.236492
High,0.0,0.0,0.021143,0.038763,0.0,0.0,0.031715,0.054033,0.145654
Low,0.091229,0.122944,0.024276,0.054816,0.114722,0.081832,0.036413,0.091621,0.617854
All,0.091229,0.122944,0.069695,0.172279,0.114722,0.081832,0.106891,0.240407,1.0


In [21]:
a.to_excel('crosstab_v1.xlsx')

## Exploring the high-spending category 

In [23]:
# We are here changing the order of our exploration. 
# For our first try, we built a database with only the "high" spending score individuals 
# and we explored it to see who was in there (gender, ever married, graduated, and professions)

# We noticed that : 
# In the high spending category, we only have married people
# 37% men + graduated
# 27% women + graduated 
# 22% men - not graduated 
# 15% women - not graduated 
# So at the moment, we know we should focus on married + graduated
# Gender : not really representative because same proportion as original population 

# Top spending professions : lawyers (31%), executives (30%), and then artists (20%).
# Which sort of repeats the graduate argument, but also pushes towards considering men as key targets (executive is mainly male)
pd.crosstab([df["Profession"]], [df["Gender"]])

Gender,F,M
Profession,Unnamed: 1_level_1,Unnamed: 2_level_1
Artist,398,404
Doctor,116,126
Engineer,184,52
Entertainment,85,216
Executive,8,168
Healthcare,162,256
Homemaker,67,15
Lawyer,113,108
Marketing,59,52


In [43]:
#Below is the code we used to explore the "high_spending" category
df_high=df[df["Spending_Score"].isin(["High"])]
df_high

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6
5,459003,M,Yes,47,Yes,Doctor,0.0,High,5.0,Cat_4
20,459056,M,Yes,63,No,Executive,,High,3.0,Cat_6
21,459057,M,Yes,69,No,Lawyer,,High,,Cat_6
23,459059,M,Yes,79,No,Executive,,High,2.0,Cat_6
...,...,...,...,...,...,...,...,...,...,...
2591,467868,F,Yes,66,Yes,Entertainment,0.0,High,2.0,Cat_6
2592,467876,F,Yes,50,Yes,Artist,9.0,High,2.0,Cat_6
2593,467879,F,Yes,51,Yes,Artist,,High,4.0,Cat_6
2605,467905,M,Yes,37,Yes,Executive,0.0,High,3.0,Cat_6


In [44]:
#b = pd.crosstab([df_high["Spending_Score"]], [df_high["Gender"], df_high["Graduated"]], normalize=True, margins=True)
#b = pd.crosstab([df_high["Gender"]], [df_high["Graduated"]], normalize=True, margins=True)
# Inside the high spending category, 100% are married, 63% are graduates, 59% are men. 
# 3/4 of women are graduates
# 3/5 of men are graduates
b = pd.crosstab([df_high["Profession"]], [df_high["Graduated"]], normalize=True, margins=True)
# We have a lot of non-graduated even in the high-spending category. Are we looking at students of a specific school that allow to train during the entire life ? 
b

Graduated,No,Yes,All
Profession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Artist,0.02122,0.177719,0.198939
Doctor,0.013263,0.013263,0.026525
Engineer,0.018568,0.023873,0.04244
Entertainment,0.013263,0.026525,0.039788
Executive,0.124668,0.180371,0.30504
Healthcare,0.005305,0.02122,0.026525
Homemaker,0.005305,0.015915,0.02122
Lawyer,0.145889,0.161804,0.307692
Marketing,0.018568,0.013263,0.03183
All,0.366048,0.633952,1.0


In [45]:
df_high["Profession"].value_counts(normalize=True)
# overrepresentation of lawyers and executives - underrep of artists, healthcare, and entertainment 

Lawyer           0.307087
Executive        0.304462
Artist           0.196850
Entertainment    0.041995
Engineer         0.041995
Marketing        0.031496
Doctor           0.028871
Healthcare       0.026247
Homemaker        0.020997
Name: Profession, dtype: float64

In [46]:
df["Profession"].value_counts(normalize=True)

Artist           0.309772
Healthcare       0.161452
Entertainment    0.116261
Doctor           0.093472
Engineer         0.091155
Lawyer           0.085361
Executive        0.067980
Marketing        0.042874
Homemaker        0.031672
Name: Profession, dtype: float64

In [49]:
df_high.to_csv("high_spending_v1.csv", sep= ";")

In [50]:
#After doing this, we felt we could go further by adding the age and family size variables to our cross examination
#To do this, however, we needed to change these numeric variables into categorical ones. 
#So here, we are going to do this but in the original dataframe, before recreating a dataframe with only the high spending scores. 

## Imputation for new file

In [51]:
# So going back to our original dataframe.
df.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,2627.0,2627.0,2358.0,2514.0
mean,463433.918919,43.649791,2.552587,2.825378
std,2618.245698,16.967015,3.341094,1.551906
min,458989.0,18.0,0.0,1.0
25%,461162.5,30.0,0.0,2.0
50%,463379.0,41.0,1.0,2.0
75%,465696.0,53.0,4.0,4.0
max,467968.0,89.0,14.0,9.0


In [52]:
# Note : age did not have missing values, but imputation was needed for family size.
# However, family size is oddly distributed, with a very high concentration at 2 people families
# We therefore chose to use the mode (2) instead of the mean to fill the missing values. 
# This means, however, that our "bins" will not have the same size in the end. 


In [53]:
# We chose to fill the family size missing values with 2 (our mode) - we do it in a new column to check. 
df["Family_Size_Imputed"] = df["Family_Size"].fillna(2)
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Family_Size_Imputed
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,1.0
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,4.0
2,458996,F,Yes,69,No,,0.0,Low,1.0,Cat_6,1.0
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,2.0
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,4.0
...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,4.0
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.0
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.0
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,5.0


In [54]:
# We check the value counts and notice, as expected, that our "2 people family" category is important. 
df["Family_Size_Imputed"].value_counts()

2.0    881
1.0    512
3.0    455
4.0    444
5.0    200
6.0     78
7.0     26
9.0     16
8.0     15
Name: Family_Size_Imputed, dtype: int64

In [55]:
# For profession (38 missing values)
df["Profession"].value_counts(normalize=True)

Artist           0.309772
Healthcare       0.161452
Entertainment    0.116261
Doctor           0.093472
Engineer         0.091155
Lawyer           0.085361
Executive        0.067980
Marketing        0.042874
Homemaker        0.031672
Name: Profession, dtype: float64

In [56]:
# For profession, fill with Unknown 
df["Profession"]= df["Profession"].fillna("Unknown")
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Family_Size_Imputed
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,1.0
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,4.0
2,458996,F,Yes,69,No,Unknown,0.0,Low,1.0,Cat_6,1.0
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,2.0
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,4.0
...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,4.0
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.0
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.0
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,5.0


In [57]:
# We calculate the quartiles in the age category 
list_quart=list (df["Age"].quantile([0.25, 0.5, 0.75]))
list_quart

[30.0, 41.0, 53.0]

In [58]:
# And use it to create a new age_category variable 
df["Age_Category"]="None"
df.loc[df["Age"] <= list_quart[0] , "Age_Category"] = "Under 30"
df.loc[df["Age"] > list_quart[0] , "Age_Category"] = "Between 30 and 41"
df.loc[df["Age"] > list_quart[1] , "Age_Category"] = "Between 41 and 53"
df.loc[df["Age"] >= list_quart[2] , "Age_Category"] = "Over 53"
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Family_Size_Imputed,Age_Category
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,1.0,Between 30 and 41
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,4.0,Between 30 and 41
2,458996,F,Yes,69,No,Unknown,0.0,Low,1.0,Cat_6,1.0,Over 53
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,2.0,Over 53
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,4.0,Under 30
...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,4.0,Under 30
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.0,Between 30 and 41
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.0,Over 53
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,5.0,Between 41 and 53


In [59]:
# Checking the value counts is useful to see if our categories are relatively similar, and if all rows are categorizes
df["Age_Category"].value_counts()

Between 30 and 41    688
Under 30             675
Over 53              661
Between 41 and 53    603
Name: Age_Category, dtype: int64

In [61]:
pd.crosstab([df["Spending_Score"]], [df["Age_Category"]], normalize=True, margins=True)

Age_Category,Between 30 and 41,Between 41 and 53,Over 53,Under 30,All
Spending_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Average,0.066616,0.090598,0.067377,0.013323,0.237914
High,0.020936,0.032737,0.085649,0.007613,0.146936
Low,0.174343,0.106205,0.098592,0.236011,0.61515
All,0.261896,0.229539,0.251618,0.256947,1.0


In [62]:
# Then we proceed to create our categorical variable for family size. 
# We discussed A LOT, and came to the conclusion that 
# single-person families should be separated from the other families 
# two-person families, as a mode, should be a category 
# we can merge three and four person families in one category 
# and then we create a 5 and more people category
# Note : it is unclear whether the "people" considered are adults or not.
        # More precisely : 2 people families can be 2 adults, or a monoparental family.
        # So we shouldn't overinterpret this variable. 
df["Family_Size_Cat"]= "None"
df.loc[df["Family_Size_Imputed"] == 1, "Family_Size_Cat"] = "1 person"
df.loc[df["Family_Size_Imputed"] == 2, "Family_Size_Cat"] = "2 people"
df.loc[(df["Family_Size_Imputed"] == 3)|(df["Family_Size_Imputed"] == 4), "Family_Size_Cat"] = "3 or 4 people"
df.loc[df["Family_Size_Imputed"] >= 5, "Family_Size_Cat"] = "5 or more people"
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Family_Size_Imputed,Age_Category,Family_Size_Cat
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,1.0,Between 30 and 41,1 person
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,4.0,Between 30 and 41,3 or 4 people
2,458996,F,Yes,69,No,Unknown,0.0,Low,1.0,Cat_6,1.0,Over 53,1 person
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,2.0,Over 53,2 people
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,4.0,Under 30,3 or 4 people
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,4.0,Under 30,3 or 4 people
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.0,Between 30 and 41,1 person
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.0,Over 53,2 people
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,5.0,Between 41 and 53,5 or more people


In [63]:
df["Family_Size_Cat"].value_counts()
# again value counts

3 or 4 people       899
2 people            881
1 person            512
5 or more people    335
Name: Family_Size_Cat, dtype: int64

In [64]:
pd.crosstab([df["Family_Size_Cat"]], [df["Spending_Score"]], normalize=True)
# and here we are checking whether family size and spending score are related 
# This table is harder to read, but we can see that the distribution in the high category based on family size 
# doesn't seem much different than the general distribution in the high category. 

Spending_Score,Average,High,Low
Family_Size_Cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1 person,0.000761,0.002284,0.191854
2 people,0.103921,0.079178,0.152265
3 or 4 people,0.105443,0.048725,0.188047
5 or more people,0.027788,0.016749,0.082984


In [68]:
df.isna().sum()
# We still have to fix ever_married (we are droping work xp and var_1, as well as the original family size variable)

ID                       0
Gender                   0
Ever_Married            50
Age                      0
Graduated               24
Profession               0
Work_Experience        269
Spending_Score           0
Family_Size            113
Var_1                   32
Family_Size_Imputed      0
Age_Category             0
Family_Size_Cat          0
dtype: int64

In [67]:
from sklearn.impute import SimpleImputer, KNNImputer

In [102]:
# We are going to use a simple imputer to fill the na values in ever_Married
# We ran the following lines 
# marriage = pd.DataFrame(df.loc[df["Ever_Married"].isna()])
# marriage["Family_Size"].value_counts()
# which show that 38 out of 50 individuals for which we don't have the married value are in a family of 2 or more. 
df["Ever_Married"] = df["Ever_Married"].fillna("Yes")
# to be precise, we should have filled "no" for the ones in 1 person families, but we can't really know this 
# age was not a good indicator either 
# considering the relatively low amount of NAs (50 out of more than 2600 rows), we think it is acceptable 

In [106]:
# We are going to use a simple imputer to fill the na values in graduated
# We ran the following lines 
#graduation = pd.DataFrame(df.loc[df["Graduated"].isna()])
#graduation["Profession"].value_counts()
# which show that 23 out of 24 have a stated profession, so we will imput graduated for all. 
# considering the relatively low amount of NAs (24 out of more than 2600 rows), we think it is acceptable 
df["Graduated"] = df["Graduated"].fillna("Yes")

In [107]:
# At this point we feel we may want to move to SQL to create more readable tables. 
# We are simply exporting to csv
# BONUS would be to get to use sqlachemy. 

# Creating a table with our original database - then dropping the unnecessary columns 
new_markets_clean = df.copy()


In [108]:
new_markets_clean

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Family_Size_Imputed,Age_Category,Family_Size_Cat
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,1.0,Between 30 and 41,1 person
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,4.0,Between 30 and 41,3 or 4 people
2,458996,F,Yes,69,No,Unknown,0.0,Low,1.0,Cat_6,1.0,Over 53,1 person
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,2.0,Over 53,2 people
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,4.0,Under 30,3 or 4 people
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,4.0,Under 30,3 or 4 people
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.0,Between 30 and 41,1 person
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.0,Over 53,2 people
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,5.0,Between 41 and 53,5 or more people


In [109]:
new_markets_clean.isna().sum()

ID                       0
Gender                   0
Ever_Married             0
Age                      0
Graduated                0
Profession               0
Work_Experience        269
Spending_Score           0
Family_Size            113
Var_1                   32
Family_Size_Imputed      0
Age_Category             0
Family_Size_Cat          0
dtype: int64

In [110]:
new_markets_clean.drop(columns=["Work_Experience","Family_Size", "Var_1"], inplace=True)


In [111]:
new_markets_clean.to_csv('new_markets_clean.csv', sep=';', index=False)

In [112]:
df_high2=new_markets_clean[new_markets_clean["Spending_Score"].isin(["High"])]
df_high2.drop(columns=["Spending_Score"], inplace=True)
df_high2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_high2.drop(columns=["Spending_Score"], inplace=True)


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Var_1,Family_Size_Imputed,Age_Category,Family_Size_Cat
3,459000,M,Yes,59,No,Executive,Cat_6,2.0,Over 53,2 people
5,459003,M,Yes,47,Yes,Doctor,Cat_4,5.0,Between 41 and 53,5 or more people
20,459056,M,Yes,63,No,Executive,Cat_6,3.0,Over 53,3 or 4 people
21,459057,M,Yes,69,No,Lawyer,Cat_6,2.0,Over 53,2 people
23,459059,M,Yes,79,No,Executive,Cat_6,2.0,Over 53,2 people
...,...,...,...,...,...,...,...,...,...,...
2591,467868,F,Yes,66,Yes,Entertainment,Cat_6,2.0,Over 53,2 people
2592,467876,F,Yes,50,Yes,Artist,Cat_6,2.0,Between 41 and 53,2 people
2593,467879,F,Yes,51,Yes,Artist,Cat_6,4.0,Between 41 and 53,3 or 4 people
2605,467905,M,Yes,37,Yes,Executive,Cat_6,3.0,Between 30 and 41,3 or 4 people


In [113]:
df_high2.to_csv("high_spending_v2.csv", sep=";", index=False)

In [118]:
# We can recreate the original crosstab - and of course play around with other crosstabs. 
c = pd.crosstab([new_markets_clean["Spending_Score"]], [new_markets_clean["Gender"], new_markets_clean["Ever_Married"], new_markets_clean["Graduated"]], margins=True, normalize=True)
# pd.crosstab([new_markets_clean["Spending_Score"]],[new_markets_clean["Gender"], new_markets_clean["Ever_Married"], new_markets_clean["Graduated"], new_markets_clean["Age_Category"], new_markets_clean["Family_Size_Cat"], new_markets_clean["Profession"]], normalize=True)
c

Gender,F,F,F,F,M,M,M,M,All
Ever_Married,No,No,Yes,Yes,No,No,Yes,Yes,Unnamed: 9_level_1
Graduated,No,Yes,No,Yes,No,Yes,No,Yes,Unnamed: 9_level_2
Spending_Score,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Average,0.0,0.0,0.024743,0.078797,0.0,0.0,0.038828,0.095546,0.237914
High,0.0,0.0,0.020936,0.039589,0.0,0.0,0.032356,0.054054,0.146936
Low,0.088694,0.12067,0.026646,0.057861,0.111534,0.081462,0.037305,0.090978,0.61515
All,0.088694,0.12067,0.072326,0.176247,0.111534,0.081462,0.108489,0.240579,1.0


In [119]:
c.to_csv("crosstab2.csv", sep=";", index=True)