In [27]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 

# Problem Statement
A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month.
The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of customer against various products which will help them to create personalized offer for customers against different products.

In [28]:
df = pd.read_csv('blackfriday.csv')

In [29]:
df.head(5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [30]:
df.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3'],
      dtype='object')

In [31]:
df.shape


(233599, 11)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     233599 non-null  int64  
 1   Product_ID                  233599 non-null  object 
 2   Gender                      233599 non-null  object 
 3   Age                         233599 non-null  object 
 4   Occupation                  233599 non-null  int64  
 5   City_Category               233599 non-null  object 
 6   Stay_In_Current_City_Years  233599 non-null  object 
 7   Marital_Status              233599 non-null  int64  
 8   Product_Category_1          233599 non-null  int64  
 9   Product_Category_2          161255 non-null  float64
 10  Product_Category_3          71037 non-null   float64
dtypes: float64(2), int64(4), object(5)
memory usage: 19.6+ MB


In [33]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
count,233599.0,233599.0,233599.0,233599.0,161255.0,71037.0
mean,1003029.0,8.085407,0.41007,5.276542,9.849586,12.669454
std,1726.505,6.521146,0.491847,3.73638,5.094943,4.125944
min,1000001.0,0.0,0.0,1.0,2.0,3.0
25%,1001527.0,2.0,0.0,1.0,5.0,9.0
50%,1003070.0,7.0,0.0,5.0,9.0,14.0
75%,1004477.0,14.0,1.0,8.0,15.0,16.0
max,1006040.0,20.0,1.0,18.0,18.0,18.0


In [34]:
df.drop(['User_ID'], axis=1,inplace=True)

In [35]:
df.head(2)

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,P00128942,M,46-50,7,B,2,1,1,11.0,
1,P00113442,M,26-35,17,C,0,0,3,5.0,


In [36]:
df['Gender'] = pd.get_dummies(df['Gender'],drop_first=1).head(10)

In [37]:
df.head(10)

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,P00128942,1.0,46-50,7,B,2,1,1,11.0,
1,P00113442,1.0,26-35,17,C,0,0,3,5.0,
2,P00288442,0.0,36-45,1,B,4+,1,5,14.0,
3,P00145342,0.0,36-45,1,B,4+,1,4,9.0,
4,P00053842,0.0,26-35,1,C,1,0,4,5.0,12.0
5,P00350442,1.0,46-50,1,C,3,1,2,3.0,15.0
6,P00155442,1.0,46-50,1,C,3,1,1,11.0,15.0
7,P0094542,1.0,46-50,1,C,3,1,2,4.0,9.0
8,P00161842,1.0,26-35,7,A,1,0,10,13.0,16.0
9,P00067942,1.0,18-25,15,A,4+,0,5,14.0,


In [19]:
# df['Gender'] = df['Gender'].map({'F': 0, 'M':1})

In [52]:
df.loc[10:]

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
10,P00046742,,26-35,7,B,2,1,1,2.0,15.0
11,P00040042,,26-35,7,B,2,1,5,,
12,P00196542,,26-35,7,B,2,1,5,8.0,14.0
13,P00004542,,26-35,7,B,2,1,5,8.0,
14,P00159542,,26-35,1,C,2,1,10,15.0,16.0
...,...,...,...,...,...,...,...,...,...,...
233594,P00118942,,26-35,15,B,4+,1,8,,
233595,P00254642,,26-35,15,B,4+,1,5,8.0,
233596,P00031842,,26-35,15,B,4+,1,1,5.0,12.0
233597,P00124742,,46-50,1,C,4+,0,10,16.0,


In [50]:
df.groupby("Gender").size()

Gender
0.0    3
1.0    7
dtype: int64

In [51]:
df["Gender"].value_counts()

1.0    7
0.0    3
Name: Gender, dtype: int64

In [21]:
df.columns

Index(['Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3'],
      dtype='object')

In [39]:
df.iloc[-1]

Product_ID                    P00316642
Gender                              NaN
Age                               46-50
Occupation                            0
City_Category                         B
Stay_In_Current_City_Years           4+
Marital_Status                        1
Product_Category_1                    4
Product_Category_2                  5.0
Product_Category_3                  NaN
Name: 233598, dtype: object

In [23]:
df['Age'].unique()

array([1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [24]:
df['Age'] = df["Age"].map({'46-50':1, '26-35':2, '36-45':3, '18-25':4, '51-55':5, '55+':6, '0-17':7})

In [25]:
df.head(10)

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,P00128942,,,7,B,2,1,1,11.0,
1,P00113442,,,17,C,0,0,3,5.0,
2,P00288442,,,1,B,4+,1,5,14.0,
3,P00145342,,,1,B,4+,1,4,9.0,
4,P00053842,,,1,C,1,0,4,5.0,12.0
5,P00350442,,,1,C,3,1,2,3.0,15.0
6,P00155442,,,1,C,3,1,1,11.0,15.0
7,P0094542,,,1,C,3,1,2,4.0,9.0
8,P00161842,,,7,A,1,0,10,13.0,16.0
9,P00067942,,,15,A,4+,0,5,14.0,


In [26]:
df['City_Category'].unique()

array(['B', 'C', 'A'], dtype=object)