<h4>Black Friday Spend Prediction</h4>

<h6>Problem Statement</h6>

<p>A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month.
The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

Now, they want to build a model that predicts a customer's purchase amount for various products, which will help them create personalized offers for customers on different products.</p>

In [None]:
# Importing data manipulation and plotting libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Loading the test and train datasets
df_train=pd.read_csv('./Datasets/train.csv')

In [None]:
df_train.shape

In [None]:
df_train.head(5)

In [None]:
df_train.describe()

In [None]:
df_train.User_ID.nunique()

<h5>Analyzing the dataset</h5>

###### Getting data at customer ID level

In [None]:
df_train_sub=df_train.loc[:,['User_ID','Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status','Purchase']]

In [None]:
df_train_cust1=df_train_sub.groupby(['User_ID','Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status']).sum()

In [None]:
df_train_cust = df_train_cust1.reset_index(level=['User_ID','Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status'])

In [None]:
df_train_cust.head()

In [None]:
df_train_cust.columns

In [None]:
df_train_cust.User_ID.nunique()

In [None]:
df_train_cust.shape

In [None]:
df_train_cust.Occupation.unique()

In [None]:
df_train_cust.City_Category.unique()

In [None]:
#Impact of Gender on mean spend
df_train_group_gender=df_train_cust.groupby(['Gender'])

In [None]:
df_train_group_gender['Purchase'].mean()

In [None]:
df_train_group_gender['Purchase'].median()

In [None]:
#ig, ax = plt.subplots(figsize=(15,7))
df_train_cust.groupby(['Gender']).mean()['Purchase'].plot.bar()
#f.plot.bar()

In [None]:
# Male customers spend more on average than female customers

In [None]:
#Impact of Marital status on mean spend
df_train_group_mar_stat=df_train_cust.groupby(['Marital_Status'])

In [None]:
df_train_group_mar_stat['Purchase'].mean()

In [None]:
df_train_group_mar_stat['Purchase'].median()

In [None]:
df_train_cust.groupby(['Marital_Status']).mean()['Purchase'].plot.bar()

In [None]:
# Unmarried customers spend slightly more than married customers

In [None]:
#Impact of Gender + Marital status on mean spend
df_train_group_mar_stat=df_train_cust.groupby(['Marital_Status'])

In [None]:
df_train_cust.groupby(['Gender','Marital_Status']).mean()['Purchase'].plot.bar()

In [None]:
# Unmarried males and females spend slightly more than their married counterparts

In [None]:
df_train_cust.groupby(['Occupation']).mean()['Purchase'].plot.bar()

In [None]:
df_train_cust.groupby(['City_Category']).mean()['Purchase'].plot.bar()

In [None]:
# City C people spned lowest

In [None]:
df_train_cust.groupby(['Stay_In_Current_City_Years']).mean()['Purchase'].plot.bar()

In [None]:
df_train.head()

In [None]:
# Count of Null values in Data
pd.isnull(df_train).sum()

In [None]:
df_train["Product_Category_2"].fillna(50, inplace = True)

In [None]:
df_train["Product_Category_3"].fillna(50, inplace = True)

In [None]:
pd.isnull(df_train).sum()

In [None]:
pd.isnull(df_train_cust).sum()

In [None]:
# Creating Customer Product level dataset

In [None]:
df_train_sub2=df_train.loc[:,['User_ID','Product_ID','Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status','Purchase']]

In [None]:
df_train_cust1=df_train_sub2.groupby(['User_ID','Product_ID','Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status']).sum()

In [None]:
df_train_cust = df_train_cust1.reset_index(level=['User_ID','Product_ID','Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status'])

In [None]:
df_train_cust2=df_train_sub2.groupby(['Product_ID']).mean()['Purchase']

In [None]:
df_train_cust3= df_train_cust2.reset_index(level=['Product_ID'])

In [None]:
df_train_cust3.rename(columns={'Purchase': 'mean_purchase'}, inplace=True)

In [None]:
df_train_cust3.head()

In [None]:
df_train_cust_prod=pd.merge(df_train,df_train_cust3,how='left',on=['Product_ID'])

In [None]:
df_train_cust_prod.head()

In [None]:
df_train_cust_prod.shape

In [None]:
# Ordinal encoding of the categorical columns (each category is mapped to a single number)

In [None]:
# Encode each age bracket as a single number: the bracket's upper bound, with
# 65 standing in for the open-ended "55+" bracket.
age_codes = {
    "0-17": 17,
    "18-25": 25,
    "26-35": 35,
    "36-45": 45,
    "46-50": 50,
    "51-55": 55,
    "55+": 65,
}
# One lookup replaces seven masked writes of ints into a text column. Values
# missing from the table (including already-encoded ones) pass through
# unchanged, so re-running the cell is harmless.
df_train_cust_prod["Age"] = df_train_cust_prod["Age"].map(
    lambda value: age_codes.get(value, value)
)

In [None]:
# Encode the stay-in-city labels as numbers: "0".."3" are shifted up by one
# and the open-ended "4+" label becomes 10.
stay_codes = {"0": 1, "1": 2, "2": 3, "3": 4, "4+": 10}
# Keys are the original text labels, so already-encoded integers are not
# matched and pass through unchanged when the cell is re-run.
df_train_cust_prod["Stay_In_Current_City_Years"] = (
    df_train_cust_prod["Stay_In_Current_City_Years"]
    .map(lambda value: stay_codes.get(value, value))
)

In [None]:
# Binary encoding of gender: M -> 1, F -> 0. Any other value is left as is.
gender_codes = {"M": 1, "F": 0}
df_train_cust_prod["Gender"] = df_train_cust_prod["Gender"].map(
    lambda value: gender_codes.get(value, value)
)

In [None]:
# Encode city category as a number: A -> 2, B -> 1, C -> 0. Any other value
# is left as is.
city_codes = {"A": 2, "B": 1, "C": 0}
df_train_cust_prod["City_Category"] = df_train_cust_prod["City_Category"].map(
    lambda value: city_codes.get(value, value)
)

#### Defining the features

In [None]:
labels = np.array(df_train_cust_prod['Purchase'])

In [None]:
df_train_cust_prod1= df_train_cust_prod.drop(['Purchase','User_ID','Product_ID'], axis = 1)

In [None]:
feature_list = list(df_train_cust_prod1.columns)

In [None]:
df_train_cust_prod1=np.array(df_train_cust_prod1)

#### Test Train Split

In [None]:
# Use scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df, train_labels, test_labels = train_test_split(
    df_train_cust_prod1,
    labels,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Model training
# Random-forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# 1000 trees with a fixed seed
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=0,
)

# Fit on the training split; the trailing semicolon hides the estimator repr
rf.fit(train_df, train_labels);

In [None]:
# Predict purchase amounts for the held-out rows
predictions = rf.predict(test_df)

# Absolute error of each prediction
errors = np.abs(predictions - test_labels)

# Mean absolute error, in the same units as the Purchase column. The original
# message labelled the value as 'degrees', which does not apply to purchase
# amounts, so the unit label is dropped.
print('Mean Absolute Error:', round(np.mean(errors), 2))