In [118]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler
from category_encoders import BinaryEncoder
from sklearn.model_selection import train_test_split

In [62]:
df = pd.read_csv('black_friday.csv')

In [63]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


## Get quick info : 

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


## Get the Shape : 

In [65]:
df.shape

(550068, 12)

## Statistical infos : 

In [66]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


## Check the duplicated data : 

In [67]:
df.duplicated().sum()

0

## Check missing values :

In [68]:
df.isnull().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

### filing the missing data with 0.0 values:
* like we did in the analyze notebook `analyze_black_friday_kaggle.ipynb`

In [69]:
df['Product_Category_2'].fillna(0, inplace=True)

In [70]:
df.isnull().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2                 0
Product_Category_3            383247
Purchase                           0
dtype: int64

In [71]:
df['Product_Category_3'].fillna(0, inplace=True)

In [72]:
df.isnull().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

## Droping the `User_ID` and `Product_ID` columns : 

In [73]:
df.drop(['User_ID', 'Product_ID'], axis=1, inplace=True)

In [74]:
df

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,F,0-17,10,A,2,0,3,0.0,0.0,8370
1,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,F,0-17,10,A,2,0,12,0.0,0.0,1422
3,F,0-17,10,A,2,0,12,14.0,0.0,1057
4,M,55+,16,C,4+,0,8,0.0,0.0,7969
...,...,...,...,...,...,...,...,...,...,...
550063,M,51-55,13,B,1,1,20,0.0,0.0,368
550064,F,26-35,1,C,3,0,20,0.0,0.0,371
550065,F,26-35,15,B,4+,1,20,0.0,0.0,137
550066,F,55+,1,C,2,0,20,0.0,0.0,365


## Now Data is Ready for some preprocessing actions :

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      550068 non-null  object 
 1   Age                         550068 non-null  object 
 2   Occupation                  550068 non-null  int64  
 3   City_Category               550068 non-null  object 
 4   Stay_In_Current_City_Years  550068 non-null  object 
 5   Marital_Status              550068 non-null  int64  
 6   Product_Category_1          550068 non-null  int64  
 7   Product_Category_2          550068 non-null  float64
 8   Product_Category_3          550068 non-null  float64
 9   Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 42.0+ MB


### 1) Encode the categorical data:

In [76]:
df['Gender'].value_counts()

M    414259
F    135809
Name: Gender, dtype: int64

In [78]:
encoder = OneHotEncoder(sparse=False, drop='first')
transform = encoder.fit_transform(df[['Gender']])
transform_df = pd.DataFrame(transform, columns=encoder.get_feature_names_out())


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



In [79]:
df = pd.concat([df, transform_df], axis=1)
df.drop('Gender', axis=1, inplace=True)

In [80]:
df['Age'].unique().tolist()

['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25']

In [83]:
encoder = BinaryEncoder()
transform_df = encoder.fit_transform(df[['Age']])

In [84]:
df = pd.concat([df, transform_df], axis=1)
df.drop('Age', axis=1, inplace=True)

In [43]:
df['City_Category'].unique()

array(['A', 'C', 'B'], dtype=object)

In [86]:
encoder = OneHotEncoder(sparse=False, drop='first')
transform = encoder.fit_transform(df[['City_Category']])
transform_df = pd.DataFrame(transform, columns=encoder.get_feature_names_out())


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



In [87]:
df = pd.concat([df, transform_df], axis=1)
df.drop('City_Category', axis=1, inplace=True)

In [89]:
df['Stay_In_Current_City_Years'].unique()

array(['2', '4+', '3', '1', '0'], dtype=object)

In [90]:
encoder = BinaryEncoder()
transform_df = encoder.fit_transform(df[['Stay_In_Current_City_Years']])

In [91]:
df = pd.concat([df, transform_df], axis=1)
df.drop('Stay_In_Current_City_Years', axis=1, inplace=True)

In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 15 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Occupation                    550068 non-null  int64  
 1   Marital_Status                550068 non-null  int64  
 2   Product_Category_1            550068 non-null  int64  
 3   Product_Category_2            550068 non-null  float64
 4   Product_Category_3            550068 non-null  float64
 5   Purchase                      550068 non-null  int64  
 6   Gender_M                      550068 non-null  float64
 7   Age_0                         550068 non-null  int64  
 8   Age_1                         550068 non-null  int64  
 9   Age_2                         550068 non-null  int64  
 10  City_Category_B               550068 non-null  float64
 11  City_Category_C               550068 non-null  float64
 12  Stay_In_Current_City_Years_0  550068 non-nul

In [98]:
df

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Gender_M,Age_0,Age_1,Age_2,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2
0,10,0,3,0.0,0.0,8370,0.0,0,0,1,0.0,0.0,0,0,1
1,10,0,1,6.0,14.0,15200,0.0,0,0,1,0.0,0.0,0,0,1
2,10,0,12,0.0,0.0,1422,0.0,0,0,1,0.0,0.0,0,0,1
3,10,0,12,14.0,0.0,1057,0.0,0,0,1,0.0,0.0,0,0,1
4,16,0,8,0.0,0.0,7969,1.0,0,1,0,0.0,1.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,13,1,20,0.0,0.0,368,1.0,1,0,1,1.0,0.0,1,0,0
550064,1,0,20,0.0,0.0,371,0.0,0,1,1,0.0,1.0,0,1,1
550065,15,1,20,0.0,0.0,137,0.0,0,1,1,1.0,0.0,0,1,0
550066,1,0,20,0.0,0.0,365,0.0,0,1,0,0.0,1.0,0,0,1


### 2) Split Data into Train and Test sets: 

In [95]:
x = df.drop('Marital_Status', axis=1)
y = df['Marital_Status']

In [96]:
x_train, x_test, y_train, y_teat = train_test_split(x, y, test_size=0.2, stratify=y, random_state=7)

In [97]:
x_train

Unnamed: 0,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Gender_M,Age_0,Age_1,Age_2,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2
410571,7,1,2.0,15.0,3978,0.0,0,1,1,1.0,0.0,0,1,1
35544,7,8,0.0,0.0,7907,1.0,1,1,1,0.0,0.0,1,0,0
87447,1,13,16.0,0.0,761,0.0,1,0,0,0.0,1.0,0,0,1
338203,20,1,4.0,0.0,7651,1.0,0,1,1,1.0,0.0,0,0,1
133118,4,5,0.0,0.0,8769,1.0,0,1,1,0.0,0.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23018,17,5,8.0,0.0,8657,1.0,1,1,0,0.0,1.0,1,0,1
256114,14,1,6.0,15.0,11588,0.0,0,1,1,0.0,0.0,1,0,0
377472,1,1,15.0,0.0,15165,0.0,1,1,0,0.0,1.0,0,1,0
352243,7,5,0.0,0.0,7127,1.0,1,0,1,1.0,0.0,0,1,1


## There is no missing Data sonce we filled them with 0.0 value

In [99]:
df.isnull().sum()

Occupation                      0
Marital_Status                  0
Product_Category_1              0
Product_Category_2              0
Product_Category_3              0
Purchase                        0
Gender_M                        0
Age_0                           0
Age_1                           0
Age_2                           0
City_Category_B                 0
City_Category_C                 0
Stay_In_Current_City_Years_0    0
Stay_In_Current_City_Years_1    0
Stay_In_Current_City_Years_2    0
dtype: int64

### 3) Scaling features : 
* with `RobustScaler` 

In [108]:
scaler = RobustScaler()
scaler.fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

In [109]:
scaled_x_train = pd.DataFrame(scaled_x_train, columns=x_train.columns)

In [110]:
scaled_x_train

Unnamed: 0,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Gender_M,Age_0,Age_1,Age_2,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2
0,0.000000,-0.571429,-0.214286,1.875,-0.652871,-1.0,-1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,0.000000,0.428571,-0.357143,0.000,-0.022618,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.500000,1.142857,0.785714,0.000,-1.168912,-1.0,0.0,-1.0,-1.0,0.0,1.0,0.0,0.0,1.0
3,1.083333,-0.571429,-0.071429,0.000,-0.063683,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.250000,0.000000,-0.357143,0.000,0.115656,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440049,0.833333,0.000000,0.214286,0.000,0.097690,0.0,0.0,0.0,-1.0,0.0,1.0,1.0,0.0,1.0
440050,0.583333,-0.571429,0.071429,1.875,0.567854,-1.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
440051,-0.500000,-0.571429,0.714286,0.000,1.141643,-1.0,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0
440052,0.000000,0.000000,-0.357143,0.000,-0.147738,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,1.0


In [111]:
scaled_x_test = pd.DataFrame(scaled_x_test, columns=x_train.columns)

### 4) Detect Inbalanced classes : 

In [114]:
scaled_x_train['Gender_M'].value_counts(normalize=True)*100

 0.0    75.302122
-1.0    24.697878
Name: Gender_M, dtype: float64

In [115]:
scaled_x_train['Age_0'].value_counts(normalize=True)*100

 0.0    53.427307
-1.0    46.572693
Name: Age_0, dtype: float64

In [116]:
scaled_x_train['Age_1'].value_counts(normalize=True)*100

 0.0    81.999027
-1.0    18.000973
Name: Age_1, dtype: float64

In [117]:
scaled_x_train['Age_2'].value_counts(normalize=True)*100

 0.0    67.810087
-1.0    32.189913
Name: Age_2, dtype: float64