# New Markets

In [49]:
import pandas as pd

## Import the data using Pandas

In [50]:
# Read the raw customer workbook into a DataFrame and display it.
SOURCE_WORKBOOK = "5_New_markets.xlsx"
df = pd.read_excel(SOURCE_WORKBOOK)
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,458996,female,Yes,69,No,,0.0,Low,1.0,Cat_6
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6
...,...,...,...,...,...,...,...,...,...,...
2622,467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6
2623,467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6
2624,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6
2625,467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4


## Examine the data for potential issues (missing data, data inconsistency, outliers, duplicates etc)

In [51]:
# Number of rows left after dropping fully identical rows.
df.drop_duplicates().shape[0]

2627

In [52]:
# Number of rows left after dropping rows that repeat an ID.
df.drop_duplicates(subset=['ID']).shape[0]

2627

In [53]:
# Count of missing values per column.
df.isnull().sum()

ID                   0
Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
dtype: int64

In [54]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,2627.0,2627.0,2358.0,2514.0
mean,463433.918919,43.649791,2.552587,2.825378
std,2618.245698,16.967015,3.341094,1.551906
min,458989.0,18.0,0.0,1.0
25%,461162.5,30.0,0.0,2.0
50%,463379.0,41.0,1.0,2.0
75%,465696.0,53.0,4.0,4.0
max,467968.0,89.0,14.0,9.0


In [55]:
# Summary (count, unique, top, freq) for the text/object columns.
df.describe(include=["object"])

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1
count,2627,2577,2603,2589,2627,2595
unique,3,2,2,9,3,7
top,Male,Yes,Yes,Artist,Low,Cat_6
freq,1424,1520,1602,802,1616,1672


## Apply the different cleaning and manipulation techniques you have learned

In [56]:
from sklearn.impute import SimpleImputer, KNNImputer

In [57]:
# Distinct values of Profession (NaN included) as a plain list.
df['Profession'].unique().tolist()

['Engineer',
 'Healthcare',
 nan,
 'Executive',
 'Marketing',
 'Doctor',
 'Artist',
 'Lawyer',
 'Entertainment',
 'Homemaker']

### cleaning the column "Gender"

In [58]:
# Distinct spellings found in Gender.
df['Gender'].unique().tolist()

['Female', 'Male', 'female']

In [59]:
# Collapse every spelling of a gender label ('Female', 'female', 'Male') to its
# upper-cased first letter ('F' / 'M').
# Unlike a fixed lookup dict, this is idempotent: re-running the cell leaves
# 'M'/'F' unchanged instead of mapping them to NaN, and an unexpected spelling
# (e.g. 'male') is normalised rather than silently becoming missing.
df['Gender'] = df['Gender'].str.strip().str[0].str.upper()

In [60]:
# Distinct values of Ever_Married (NaN included).
df['Ever_Married'].unique().tolist()

['Yes', 'No', nan]

In [61]:
# Distinct values of Graduated (NaN included).
df['Graduated'].unique().tolist()

['Yes', 'No', nan]

In [62]:
# Distinct values of Spending_Score.
df['Spending_Score'].unique().tolist()

['Low', 'Average', 'High']

In [63]:
# Frequency of each Spending_Score level, most common first.
df.loc[:, 'Spending_Score'].value_counts()

Low        1616
Average     625
High        386
Name: Spending_Score, dtype: int64

In [64]:
# Frequency of each Var_1 category, most common first.
df.loc[:, 'Var_1'].value_counts()

Cat_6    1672
Cat_4     386
Cat_3     267
Cat_2     141
Cat_7      66
Cat_1      34
Cat_5      29
Name: Var_1, dtype: int64

In [65]:
# First ten rows, to check the recoded Gender column.
df.iloc[:10]

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,458996,F,Yes,69,No,,0.0,Low,1.0,Cat_6
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6
5,459003,M,Yes,47,Yes,Doctor,0.0,High,5.0,Cat_4
6,459005,M,Yes,61,Yes,Doctor,5.0,Low,3.0,Cat_6
7,459008,F,Yes,47,Yes,Artist,1.0,Average,3.0,Cat_6
8,459013,M,Yes,50,Yes,Artist,2.0,Average,4.0,Cat_6
9,459014,M,No,19,No,Healthcare,0.0,Low,4.0,Cat_6


### imputing the column "Work_Experience" by the mean

In [66]:
# The original statement read 'Work_Experience_Imputed' before that column existed
# (KeyError on a fresh kernel) because the assignment direction was reversed.
# Start the imputed column as a copy of the raw one; the raw column stays untouched
# so the effect of the imputation can still be compared.
df['Work_Experience_Imputed'] = df['Work_Experience']

In [67]:
# Mean of the observed Work_Experience values (pandas skips NaN by default);
# this is the fill value used for the missing entries.
work_experience_mean = df['Work_Experience'].mean()
work_experience_mean

2.552586938083121

In [68]:
# Mean-impute into a separate column: observed values are kept, missing ones
# receive the column mean; the raw Work_Experience column is left as is.
df['Work_Experience_Imputed'] = df['Work_Experience'].fillna(work_experience_mean)

In [69]:
# Display the frame to check the new Work_Experience_Imputed column.
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Work_Experience_Imputed
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,0.000000
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,8.000000
2,458996,F,Yes,69,No,,0.0,Low,1.0,Cat_6,0.000000
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,11.000000
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,2.552587
...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,9.000000
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.000000
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.552587
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,1.000000


### imputing the column "Family_Size" by the median (instead of by the mean)

In [70]:
# The original statement read 'Family_Size_Imputed' before that column existed
# (KeyError on a fresh kernel) because the assignment direction was reversed.
# Start the imputed column as a copy of the raw one; the raw column stays untouched.
df['Family_Size_Imputed'] = df['Family_Size']

In [71]:
# Mean family size rounded to a whole number, computed for comparison with the
# median; the imputation in this section fills with the median, not this value.
family_size_mean = round(df['Family_Size'].mean())
family_size_mean

3

In [72]:
# Median of the observed Family_Size values; this is the fill value used for
# the missing entries.
family_size_median = df['Family_Size'].median()
family_size_median

2.0

In [73]:
# Distribution of the observed family sizes (missing values are not counted).
df.loc[:, 'Family_Size'].value_counts()

2.0    768
1.0    512
3.0    455
4.0    444
5.0    200
6.0     78
7.0     26
9.0     16
8.0     15
Name: Family_Size, dtype: int64

In [74]:
# Unrounded mean of the observed Family_Size values, for comparison with the median.
df['Family_Size'].mean()

2.8253778838504378

In [75]:
# Median-impute into a separate column: observed values are kept, missing ones
# receive the column median; the raw Family_Size column is left as is.
df['Family_Size_Imputed'] = df['Family_Size'].fillna(family_size_median)

In [76]:
# Distribution after imputation; only the count for the median value should grow.
df.loc[:, 'Family_Size_Imputed'].value_counts()

2.0    881
1.0    512
3.0    455
4.0    444
5.0    200
6.0     78
7.0     26
9.0     16
8.0     15
Name: Family_Size_Imputed, dtype: int64

In [77]:
# Display the frame to check the new Family_Size_Imputed column.
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Work_Experience_Imputed,Family_Size_Imputed
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,0.000000,1.0
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,8.000000,4.0
2,458996,F,Yes,69,No,,0.0,Low,1.0,Cat_6,0.000000,1.0
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,11.000000,2.0
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,2.552587,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,9.000000,4.0
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.000000,1.0
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.552587,2.0
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,1.000000,5.0


### replacing missing values in column "Profession" by "Unknown"

In [84]:
# Label missing professions explicitly as 'Unknown'.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form is chained assignment, which emits a FutureWarning
# in pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['Profession'] = df['Profession'].fillna('Unknown')

In [85]:
# First twenty Profession values, to check the 'Unknown' fill.
df['Profession'].iloc[:20]

0       Engineer
1     Healthcare
2        Unknown
3      Executive
4      Marketing
5         Doctor
6         Doctor
7         Artist
8         Artist
9     Healthcare
10    Healthcare
11    Healthcare
12        Artist
13    Healthcare
14        Doctor
15        Artist
16        Lawyer
17        Artist
18        Artist
19        Lawyer
Name: Profession, dtype: object

In [80]:
# Frequency of each profession, most common first.
df.loc[:, 'Profession'].value_counts()

Artist           802
Healthcare       418
Entertainment    301
Doctor           242
Engineer         236
Lawyer           221
Executive        176
Marketing        111
Homemaker         82
Unknown           38
Name: Profession, dtype: int64

### for the columns of "Ever_Married" and "Graduated"

In [81]:
# For the columns "Ever_Married" and "Graduated", I first planned to use this imputation method:
# calculate the proportion of the two values (Yes or No), then assign one of the two values to
# the rows with missing values, so that the proportion of the assigned values is
# roughly equal to the proportion among the existing values. In the end, we decided to
# leave the columns as they are and simply ignore the missing values when using
# them, since they make up only a small share of the rows (50 and 24 out of 2,627).

### finding the correlation between "Spending_Score" and "Family_Size"

In [82]:
# Display the current state of the frame before the correlation analysis.
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Work_Experience_Imputed,Family_Size_Imputed
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,0.000000,1.0
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,8.000000,4.0
2,458996,F,Yes,69,No,Unknown,0.0,Low,1.0,Cat_6,0.000000,1.0
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,11.000000,2.0
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,2.552587,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,9.000000,4.0
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.000000,1.0
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.552587,2.0
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,1.000000,5.0


In [83]:
# The original statement referenced 'amount_spent', a column this dataset does not
# have (KeyError), and did not compute any correlation.
# Spending_Score is an ordered category, so encode it as ranks and use Spearman's
# rank correlation against the raw (un-imputed) Family_Size. DataFrame.corr excludes
# rows with a missing value pairwise, so imputed values do not bias the result.
spending_rank = df['Spending_Score'].map({'Low': 0, 'Average': 1, 'High': 2})
pd.DataFrame(
    {'Spending_Rank': spending_rank, 'Family_Size': df['Family_Size']}
).corr(method='spearman')

### Export clean version of your data into CSV file using Pandas.

In [86]:
# Display the frame one last time before exporting it.
# NOTE(review): the raw Work_Experience / Family_Size columns (still containing NaN)
# are exported alongside their *_Imputed counterparts — confirm this is intended.
df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Work_Experience_Imputed,Family_Size_Imputed
0,458989,F,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,0.000000,1.0
1,458994,M,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,8.000000,4.0
2,458996,F,Yes,69,No,Unknown,0.0,Low,1.0,Cat_6,0.000000,1.0
3,459000,M,Yes,59,No,Executive,11.0,High,2.0,Cat_6,11.000000,2.0
4,459001,F,No,19,No,Marketing,,Low,4.0,Cat_6,2.552587,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,M,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,9.000000,4.0
2623,467958,F,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,1.000000,1.0
2624,467960,F,No,53,Yes,Entertainment,,Low,2.0,Cat_6,2.552587,2.0
2625,467961,M,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,1.000000,5.0


In [90]:
# Comma-separated export without the index column, intended for loading into SQL.
sql_export_path = 'NewMarkets_SQL.csv'
df.to_csv(sql_export_path, index=False, sep=',')

In [89]:
# Semicolon-separated export without the index column, intended for opening in Excel.
excel_export_path = 'NewMarkets_Excel.csv'
df.to_csv(excel_export_path, index=False, sep=';')