In [1]:
#Necessary imports for viewing data
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [35]:
#Loading test and train data set
train_df = pd.read_csv(r'E:\Data hackatons\AV black friday sales\train.csv')
test_df = pd.read_csv(r'E:\Data hackatons\AV black friday sales\test.csv')

In [None]:
#Let's have a look at how the data looks using the head
train_df.head()
test_df.head()

There are 12 columns:
User_ID, Product_ID, Gender, Age, Occupation, City_Category, Stay_In_Current_City_Years, Marital_Status, Product_Category_1, Product_Category_2, Product_Category_3, Purchase

Target column is Purchase

In [54]:
#Check for columns with null values
print("Train Data***********")
print(train_df.isnull().mean() * 100)
print("Test Data***********")
print(test_df.isnull().mean() * 100)

Train Data***********
User_ID                       0.0
Product_ID                    0.0
Gender                        0.0
Age                           0.0
Occupation                    0.0
City_Category                 0.0
Stay_In_Current_City_Years    0.0
Marital_Status                0.0
Product_Category_1            0.0
Product_Category_2            0.0
Product_Category_3            0.0
Purchase                      0.0
dtype: float64
Test Data***********
User_ID               0.0
Product_ID            0.0
Gender                0.0
Age                   0.0
Occupation            0.0
                     ... 
PC3_Max_Purchase      0.0
PC3_Mean_Purchase     0.0
PC3_25Per_Purchase    0.0
PC3_50Per_Purchase    0.0
PC3_75Per_Purchase    0.0
Length: 88, dtype: float64


We can see that Product_Category_2 and Product_Category_3 have null values
train_df has 31% and 70% null values in Product_Category_2 and Product_Category_3 respectively
test_df has 31% and 70% null values in Product_Category_2 and Product_Category_3 respectively

In [None]:
#All the columns are categorical. Let's check unique number of entries in each column for Training set
print("User_ID: "+str(train_df["User_ID"].unique().shape[0]))
print("Product_ID: "+str(train_df["Product_ID"].unique().shape[0]))
print("Gender: "+str(train_df["Gender"].unique().shape[0]))
print("Age: "+str(train_df["Age"].unique().shape[0]))
print("Occupation: "+str(train_df["Occupation"].unique().shape[0]))
print("City_Category: "+str(train_df["City_Category"].unique().shape[0]))
print("Stay_In_Current_City_Years: "+str(train_df["Stay_In_Current_City_Years"].unique().shape[0]))
print("Marital_Status: "+str(train_df["Marital_Status"].unique().shape[0]))
print("Product_Category_1: "+str(train_df["Product_Category_1"].unique().shape[0]))
print("Product_Category_2: "+str(train_df["Product_Category_2"].unique().shape[0]))
print("Product_Category_3: "+str(train_df["Product_Category_3"].unique().shape[0]))

In [None]:
#Let's check unique number of entries in each column for Test set
print("User_ID: "+str(test_df["User_ID"].unique().shape[0]))
print("Product_ID: "+str(test_df["Product_ID"].unique().shape[0]))
print("Gender: "+str(test_df["Gender"].unique().shape[0]))
print("Age: "+str(test_df["Age"].unique().shape[0]))
print("Occupation: "+str(test_df["Occupation"].unique().shape[0]))
print("City_Category: "+str(test_df["City_Category"].unique().shape[0]))
print("Stay_In_Current_City_Years: "+str(test_df["Stay_In_Current_City_Years"].unique().shape[0]))
print("Marital_Status: "+str(test_df["Marital_Status"].unique().shape[0]))
print("Product_Category_1: "+str(test_df["Product_Category_1"].unique().shape[0]))
print("Product_Category_2: "+str(test_df["Product_Category_2"].unique().shape[0]))
print("Product_Category_3: "+str(test_df["Product_Category_3"].unique().shape[0]))

Test set has 140 less product ids and 2 less product categories.

In [None]:
print(train_df['Product_Category_1'].unique())
print(test_df['Product_Category_1'].unique())

We can see that 19 and 20 are not there in Product_Category_1 for test data. I want to check if the 140 extra product ids in the training data are from the 2 product categories (19 and 20)

In [None]:
missed_values = train_df.loc[~train_df['Product_ID'].isin(test_df["Product_ID"].unique())]

In [None]:
missed_values["Product_Category_1"].unique()

so, the 140 extra product ids are not from the product categories 19 and 20.
Lets look at the frequency of occurence of user id

In [None]:
train_df['User_ID'].value_counts().describe()

In [None]:
train_df['User_ID'].value_counts().head(10)

In [None]:
train_df['User_ID'].value_counts(ascending = True).head(10)

Lets see if the value counts have any effect on the purchase. 
Add the user_id_count as a column to the data

In [None]:
user_id_grouped = train_df.groupby("User_ID")
count_dict = {}
for name, group in user_id_grouped:
    count_dict[name] = group.shape[0]
count_list = []
for index, row in train_df.iterrows():
    name = row["User_ID"]
    count_list.append(count_dict.get(name, 0))

Let's look at the correlation of this new field with the purchase 

In [None]:
train_df["User_ID_Count"] = count_list

In [None]:
train_df['User_ID_Count'].corr(train_df['Purchase'])

The user id count is correlated to the Purchase.Let's look at the count values of other variables, and for that we need to convert the categorical variables into numbers. They are:
<br> Gender
<br> Age
<br> City_Category
<br> Stay_In_Current_City
<br> Define dictionaries to convert the categorical features into numeric

In [27]:
gender_dict = {'F':0, 'M':1}
age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6}
city_dict = {'A':0, 'B':1, 'C':2}
stay_dict = {'0':0, '1':1, '2':2, '3':3, '4+':4}

Use the dictionary above to convert the categorical variables into numeric

In [28]:
train_df["Gender"] = train_df["Gender"].apply(lambda x: gender_dict[x])
train_df["Age"] = train_df["Age"].apply(lambda x: age_dict[x])
train_df["City_Category"] = train_df["City_Category"].apply(lambda x: city_dict[x])
train_df["Stay_In_Current_City_Years"] = train_df["Stay_In_Current_City_Years"].apply(lambda x: stay_dict[x])

KeyError: 0

Let's have  alook at the variables now

In [None]:
train_df.head()

Now let's create a function to give the count of the features

In [31]:
def getCountofVar(dataset_df, var_name):
    var_name_grouped = dataset_df.groupby(var_name)
    count_dict = {}
    for name, group in var_name_grouped:
        count_dict[name] = group.shape[0]
    count_list = []
    for index, row in dataset_df.iterrows():
        name = row[var_name]
        count_list.append(count_dict.get(name, 0))
    return count_list

get counts for all other features

In [None]:
train_df["Product_ID_Count"] = getCountofVar(train_df,"Product_ID")
train_df["Gender_Count"] = getCountofVar(train_df,"Gender")
train_df["Age_Count"] = getCountofVar(train_df,"Age")
train_df["Occupation_Count"] = getCountofVar(train_df,"Occupation")
train_df["City_Count"] = getCountofVar(train_df,"City_Category")
train_df["Stay_Count"] = getCountofVar(train_df,"Stay_In_Current_City_Years")
train_df["Marital_Status_Count"] = getCountofVar(train_df,"Marital_Status")
train_df["PC1_Count"] = getCountofVar(train_df,"Product_Category_1")
train_df["PC2_Count"] = getCountofVar(train_df,"Product_Category_2")
train_df["PC3_Count"] = getCountofVar(train_df,"Product_Category_3")

In [None]:
#Look at the columns once
train_df.columns

In [None]:
#Lets look at the correlation of the features
corr = train_df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
sns.heatmap(corr)

In [None]:
#correlation of all other columns with Purchase
train_df[train_df.columns[1:]].corr()['Purchase'][:]
#to see the correlation in a csv file
#corr.to_csv(r'C:\Users\User\Desktop\corr_table.csv', index=False)

Now I want to check if the minimum values, max values, mean of the features have any effect on the purchase.
<br> But, before this I will impute the missing values with 0. I did not impute the missing values earlier as it will cause the counts to reflect values. Right now the count for missing values is 0.

In [None]:
train_df.fillna(0, inplace=True)

For every user, we find the min purchase, max purchase and mean purchase

In [None]:
train_df_user_id_grouped = train_df.groupby("User_ID")
min_dict = {}
max_dict = {}
mean_dict = {}
for name, group in train_df_user_id_grouped:
    min_dict[name] = min(np.array(group["Purchase"]))
    max_dict[name] = max(np.array(group["Purchase"]))
    mean_dict[name] = np.mean(np.array(group["Purchase"]))
min_list = []
max_list = []
mean_list = []
for index, row in train_df.iterrows():
    name = row["User_ID"]
    min_list.append(min_dict.get(name,0))
    max_list.append(max_dict.get(name,0))
    mean_list.append(mean_dict.get(name,0))
train_df["User_ID_Min_Purchase"] = min_list
train_df["User_ID_Max_Purchase"] = max_list
train_df["User_ID_Mean_Purchase"] = mean_list

Now we look at the correlation values again.

In [None]:
#correlation of all other columns with Purchase
train_df[train_df.columns[1:]].corr()['Purchase'][:]

High correlation between min,max and mean purchase.
<br> I will now do the same for 25, 50 and 75 %ile values.
<br> Write a method to calculate all the stats

In [34]:
def getPurchaseStats(target_df,compute_df, feature_name):
    feature_grouped = compute_df.groupby(feature_name)
    min_dict = {}
    max_dict = {}
    mean_dict = {}
    twentyfive_dict = {}
    fifty_dict = {}
    seventyfive_dict = {}
    for name, group in feature_grouped:
        min_dict[name] = min(np.array(group["Purchase"]))
        max_dict[name] = max(np.array(group["Purchase"]))
        mean_dict[name] = np.mean(np.array(group["Purchase"]))
        twentyfive_dict[name] = np.percentile(np.array(group["Purchase"]),25)
        fifty_dict[name] = np.percentile(np.array(group["Purchase"]),50)
        seventyfive_dict[name] = np.percentile(np.array(group["Purchase"]),75)
    min_list = []
    max_list = []
    mean_list = []
    twentyfive_list = []
    fifty_list = []
    seventyfive_list = []
    for index, row in target_df.iterrows():
        name = row[feature_name]
        min_list.append(min_dict.get(name,0))
        max_list.append(max_dict.get(name,0))
        mean_list.append(mean_dict.get(name,0))
        twentyfive_list.append( twentyfive_dict.get(name,0))
        fifty_list.append( fifty_dict.get(name,0))
        seventyfive_list.append( seventyfive_dict.get(name,0))
    return min_list, max_list, mean_list, twentyfive_list, fifty_list, seventyfive_list

User ID and Purchase stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "User_ID")
train_df["User_ID_Min_Purchase"] = min_price_list
train_df["User_ID_Max_Purchase"] = max_price_list
train_df["User_ID_Mean_Purchase"] = mean_price_list
train_df["User_ID_25Per_Purchase"] = twentyfive_price_list
train_df["User_ID_50Per_Purchase"] = fifty_price_list
train_df["User_ID_75Per_Purchase"] = seventyfive_price_list

Product_ID and Purchase Stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Product_ID")
train_df["Product_ID_Min_Purchase"] = min_price_list
train_df["Product_ID_Max_Purchase"] = max_price_list
train_df["Product_ID_Mean_Purchase"] = mean_price_list
train_df["Product_ID_25Per_Purchase"] = twentyfive_price_list
train_df["Product_ID_50Per_Purchase"] = fifty_price_list
train_df["Product_ID_75Per_Purchase"] = seventyfive_price_list

Gender and Purchase Stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df, train_df, "Gender")
train_df["Gender_Min_Purchase"] = min_price_list
train_df["Gender_Max_Purchase"] = max_price_list
train_df["Gender_Mean_Purchase"] = mean_price_list
train_df["Gender_25Per_Purchase"] = twentyfive_price_list
train_df["Gender_50Per_Purchase"] = fifty_price_list
train_df["Gender_75Per_Purchase"] = seventyfive_price_list

Age and Purchase stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Age")
train_df["Age_Min_Purchase"] = min_price_list
train_df["Age_Max_Purchase"] = max_price_list
train_df["Age_Mean_Purchase"] = mean_price_list
train_df["Age_25Per_Purchase"] = twentyfive_price_list
train_df["Age_50Per_Purchase"] = fifty_price_list
train_df["Age_75Per_Purchase"] = seventyfive_price_list

Occupation and Purchase stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Occupation")
train_df["Occupation_Min_Purchase"] = min_price_list
train_df["Occupation_Max_Purchase"] = max_price_list
train_df["Occupation_Mean_Purchase"] = mean_price_list
train_df["Occupation_25Per_Purchase"] = twentyfive_price_list
train_df["Occupation_50Per_Purchase"] = fifty_price_list
train_df["Occupation_75Per_Purchase"] = seventyfive_price_list

City and Purchase stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "City_Category")
train_df["City_Min_Purchase"] = min_price_list
train_df["City_Max_Purchase"] = max_price_list
train_df["City_Mean_Purchase"] = mean_price_list
train_df["City_25Per_Purchase"] = twentyfive_price_list
train_df["City_50Per_Purchase"] = fifty_price_list
train_df["City_75Per_Purchase"] = seventyfive_price_list

stay in current city and Purchase stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Stay_In_Current_City_Years")
train_df["Stay_Min_Purchase"] = min_price_list
train_df["Stay_Max_Purchase"] = max_price_list
train_df["Stay_Mean_Purchase"] = mean_price_list
train_df["Stay_25Per_Purchase"] = twentyfive_price_list
train_df["Stay_50Per_Purchase"] = fifty_price_list
train_df["Stay_75Per_Purchase"] = seventyfive_price_list

Marital status and Purchase Stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Marital_Status")
train_df["Marital_Min_Purchase"] = min_price_list
train_df["Marital_Max_Purchase"] = max_price_list
train_df["Marital_Mean_Purchase"] = mean_price_list
train_df["Marital_25Per_Purchase"] = twentyfive_price_list
train_df["Marital_50Per_Purchase"] = fifty_price_list
train_df["Marital_75Per_Purchase"] = seventyfive_price_list

PC1 and Purchase Stats

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Product_Category_1")
train_df["PC1_Min_Purchase"] = min_price_list
train_df["PC1_Max_Purchase"] = max_price_list
train_df["PC1_Mean_Purchase"] = mean_price_list
train_df["PC1_25Per_Purchase"] = twentyfive_price_list
train_df["PC1_50Per_Purchase"] = fifty_price_list
train_df["PC1_75Per_Purchase"] = seventyfive_price_list

PC2 and Purchase

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Product_Category_2")
train_df["PC2_Min_Purchase"] = min_price_list
train_df["PC2_Max_Purchase"] = max_price_list
train_df["PC2_Mean_Purchase"] = mean_price_list
train_df["PC2_25Per_Purchase"] = twentyfive_price_list
train_df["PC2_50Per_Purchase"] = fifty_price_list
train_df["PC2_75Per_Purchase"] = seventyfive_price_list

PC3 and Purchase

In [None]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(train_df,train_df, "Product_Category_3")
train_df["PC3_Min_Purchase"] = min_price_list
train_df["PC3_Max_Purchase"] = max_price_list
train_df["PC3_Mean_Purchase"] = mean_price_list
train_df["PC3_25Per_Purchase"] = twentyfive_price_list
train_df["PC3_50Per_Purchase"] = fifty_price_list
train_df["PC3_75Per_Purchase"] = seventyfive_price_list

In [None]:
train_df.columns

Now, I store the data in a file so that I can use it later

In [None]:
#train_df.to_csv(r'E:\Data hackatons\AV black friday sales\train_full_feature.csv',index=False)

In [2]:
train_df = pd.read_csv(r'E:\Data hackatons\AV black friday sales\train_full_feature.csv')

Before discarding any feature, lets run the model once and see how it performs.So, we split the training data into test and training set
from sklearn.model_selection import train_test_split

In [None]:
#define X
X = train_df.drop(columns=['User_ID','Product_ID','Purchase'],axis=1)

In [None]:
#define y
y = train_df["Purchase"]

In [None]:
#split training data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#use random forest regressor
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(max_depth=5, min_samples_leaf=100)

In [None]:
#fit the model
RF.fit(X_train,y_train)

In [None]:
#predict on X test
y_test_pred = RF.predict(X_test)

In [None]:
#get the mean squared error
from sklearn import metrics
import math
mse = metrics.mean_squared_error(y_test, y_test_pred)
rmse = math.sqrt(mse)
print(rmse)

Method 1 for finding useful features: Using Correlation between the features and the target value
<br> Can we improve it more by using features that are more relevalt? Let's find out by finding out the correlation of features with the target column

In [None]:
#finding the correlation of all features against the target
corr = train_df[train_df.columns[1:]].corr()['Purchase'][:]

In [None]:
corr.to_csv(r'E:\Data hackatons\AV black friday sales\corr_purchase.csv')

I will consider all the features that have a correlation of more than 0.75,0.1, 0.07,0.05,0.04,0.03 and see the results.
<br> I found the following observations:
<br> corr>0.75: 2551.514009612337
<br> corr>0.1: 2567.0259719491337
<br> corr>0.08: 2547.717685223172
<br> corr>0.07: 2547.3391061981
<br> corr>0.05: 2538.0098909567223
<br> corr>0.04: 2545.5502808804126
<br> corr>0.03: 2542.985008014369

In [None]:
from sklearn import metrics
import math

corr_values = [0.75, 0.1, 0.08, 0.07, 0.05, 0.04, 0.03]

for x in corr_values:
    feature_list = corr[corr>0.05]
    features = feature_list.index.tolist()
    features.remove("Purchase")
    X = train_df[features]
    y = train_df["Purchase"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    RF.fit(X_train,y_train)
    y_test_pred = RF.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_test_pred)
    rmse = math.sqrt(mse)
    print("corr>"+str(x)+": "+str(rmse))

Based on the results above, we can go with features with corr>0.05. Now we will do some hyperparameter tuning of our RandomForestRegressor.
<br> I will do Randomized Search CV to find the best parameters later in the notebook
<br> 
<br> Method 2 for finding useful features: Correlation between all features. We remove the ones which have high correlation among themselves

In [None]:
#find correlation between all the features
corr = train_df.corr()

In [None]:
corr.to_csv(r'E:\Data hackatons\AV black friday sales\corr_all_features.csv')

In [None]:
#have a look at the correlation map
corr.style.background_gradient(cmap='coolwarm')

In [None]:
#have a look at the heatmap for correlation
sns.heatmap(corr)

In [None]:
data = train_df.drop(columns=['Product_ID'])
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.1:
            if columns[j]:
                columns[j] = False

In [None]:
selected_columns = data.columns[columns]

In [None]:
data = data[selected_columns]

In [None]:
#define X
X = data.drop(columns=['User_ID'],axis=1)

In [None]:
#define y
y = train_df["Purchase"]

In [None]:
#split training data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#use random forest regressor
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(max_depth=5, min_samples_leaf=100,random_state=123)

In [None]:
#fit the model
RF.fit(X_train,y_train)

In [None]:
#predict on X test
y_test_pred = RF.predict(X_test)

In [None]:
#get the mean squared error
from sklearn import metrics
import math
mse = metrics.mean_squared_error(y_test, y_test_pred)
rmse = math.sqrt(mse)
print(rmse)

Method 3: Using the feature selection in Random Forest regressor.

In [None]:
#define X
X = train_df.drop(columns=['User_ID','Product_ID','Purchase'],axis=1)

#define y
y = train_df["Purchase"]

#split training data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#use random forest regressor
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(max_depth=5, min_samples_leaf=100)

In [None]:
#fit the model
RF.fit(X_train,y_train)

In [None]:
#Let's look at the features and their scores
print ("Features sorted by their score:")
print (sorted(zip(map(lambda x: round(x, 4), RF.feature_importances_), X_train), reverse=True))

In [None]:
#define X after selecting the top few features
X = train_df[["Product_ID_Mean_Purchase","User_ID_75Per_Purchase","User_ID_Mean_Purchase","User_ID_25Per_Purchase","User_ID_Max_Purchase","Product_ID_75Per_Purchase","User_ID_50Per_Purchase","Product_ID_50Per_Purchase","Product_ID_25Per_Purchase","User_ID_Min_Purchase","User_ID_Count","Stay_Min_Purchase","Stay_Mean_Purchase","Stay_Max_Purchase","Stay_In_Current_City_Years","Stay_Count","Stay_75Per_Purchase","Stay_50Per_Purchase","Stay_25Per_Purchase"]]

In [None]:
#use random forest regressor
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(max_depth=5, min_samples_leaf=100)

In [None]:
#fit the model
RF.fit(X_train,y_train)

In [None]:
#predict on X test
y_test_pred = RF.predict(X_test)

#get the mean squared error
from sklearn import metrics
import math
mse = metrics.mean_squared_error(y_test, y_test_pred)
rmse = math.sqrt(mse)
print(rmse)

I have done 3 iterations with a few more features and the rmse seems to be stuck at one value.So, we will not do any further iterations.
<br> Method 4: Now we will try the same with xgboost

In [3]:
#define X
X = train_df.drop(columns=['User_ID','Product_ID','Purchase'],axis=1)

In [4]:
#define y
y = train_df["Purchase"]

In [5]:
#split training data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
X_train.shape

(440054, 86)

In [7]:
X_test.shape

(110014, 86)

In [8]:
from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)

In [9]:
xgb.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [10]:
y_test_pred = xgb.predict(X_test)

In [11]:
#get the mean squared error
from sklearn import metrics
import math
mse = metrics.mean_squared_error(y_test, y_test_pred)
rmse = math.sqrt(mse)
print(rmse)

2444.9273196038525


In [12]:
print(xgb.feature_importances_)

[2.9876465e-03 1.3517028e-03 6.6246116e-04 8.3649537e-04 5.6555693e-04
 3.4135481e-04 1.4658089e-03 5.6706718e-04 6.0337031e-04 1.9646459e-03
 9.4769296e-04 0.0000000e+00 8.8327209e-04 8.1652292e-04 3.7492742e-04
 7.6457596e-04 0.0000000e+00 5.7051715e-04 6.8848504e-04 5.8336748e-04
 7.0602726e-04 2.4166501e-03 8.6763585e-03 7.5712698e-03 3.8120893e-03
 9.2351176e-03 1.3045226e-03 2.8741399e-03 9.0283388e-01 4.4284058e-03
 4.1518575e-03 1.2750125e-02 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 7.5191120e-04 0.0000000e+00 0.0000000e+00 1.3141887e-03 0.0000000e+00
 8.4565335e-04 8.0333988e-04 9.3002291e-04 7.8399033e-05 6.8166334e-04
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 5.7822664e-04 3.1911474e-04
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 4.4141281e-03 0.0000000e+00
 1.332

In [13]:
print ("Features sorted by their score:")
print (sorted(zip(map(lambda x: round(x, 4), xgb.feature_importances_), X_train), reverse=True))

Features sorted by their score:
[(0.9028, 'Product_ID_Mean_Purchase'), (0.0128, 'Product_ID_75Per_Purchase'), (0.0092, 'User_ID_75Per_Purchase'), (0.0087, 'User_ID_Mean_Purchase'), (0.0076, 'User_ID_25Per_Purchase'), (0.0044, 'Product_ID_25Per_Purchase'), (0.0044, 'PC1_Min_Purchase'), (0.0042, 'Product_ID_50Per_Purchase'), (0.0038, 'User_ID_50Per_Purchase'), (0.003, 'Gender'), (0.0029, 'Product_ID_Max_Purchase'), (0.0024, 'User_ID_Max_Purchase'), (0.002, 'User_ID_Count'), (0.002, 'PC1_50Per_Purchase'), (0.0015, 'Product_Category_1'), (0.0014, 'Age'), (0.0013, 'Product_ID_Min_Purchase'), (0.0013, 'PC1_Mean_Purchase'), (0.0013, 'Age_75Per_Purchase'), (0.0012, 'PC2_Max_Purchase'), (0.0009, 'Product_ID_Count'), (0.0009, 'PC2_75Per_Purchase'), (0.0009, 'Occupation_25Per_Purchase'), (0.0009, 'Age_Count'), (0.0008, 'Stay_Count'), (0.0008, 'PC2_Mean_Purchase'), (0.0008, 'Occupation_Mean_Purchase'), (0.0008, 'Occupation_Max_Purchase'), (0.0008, 'Occupation_Count'), (0.0008, 'City_Category'), (0

In [20]:
#we will take features that have importance more than 0.0005
from sklearn.feature_selection import SelectFromModel
selection = SelectFromModel(xgb, threshold=0.001, prefit=True)

In [21]:
#define X_train after selecting the top few features
X_train_selection = selection.transform(X_train)
#define X_test after selecting the top few features
X_test_selection = selection.transform(X_test)

In [22]:
xgb_selected_features = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# train model
xgb_selected_features.fit(X_train_selection, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [23]:
#predict with the new features
y_test_pred = xgb_selected_features.predict(X_test_selection)

In [24]:
#get the mean squared error
from sklearn import metrics
import math
mse = metrics.mean_squared_error(y_test, y_test_pred)
rmse = math.sqrt(mse)
print(rmse)

2446.865162601363


Now we know that we can use this model for predicting the purchase for the test data
<br> So, we will do all the steps that we did for preprocessing the training data

In [56]:
#Load the data
test_df = pd.read_csv(r'E:\Data hackatons\AV black friday sales\test.csv')

In [57]:
test_df["Gender"] = test_df["Gender"].apply(lambda x: gender_dict[x])
test_df["Age"] = test_df["Age"].apply(lambda x: age_dict[x])
test_df["City_Category"] = test_df["City_Category"].apply(lambda x: city_dict[x])
test_df["Stay_In_Current_City_Years"] = test_df["Stay_In_Current_City_Years"].apply(lambda x: stay_dict[x])

In [58]:
test_df["User_ID_Count"] = getCountofVar(test_df,"User_ID")
test_df["Product_ID_Count"] = getCountofVar(test_df,"Product_ID")
test_df["Gender_Count"] = getCountofVar(test_df,"Gender")
test_df["Age_Count"] = getCountofVar(test_df,"Age")
test_df["Occupation_Count"] = getCountofVar(test_df,"Occupation")
test_df["City_Count"] = getCountofVar(test_df,"City_Category")
test_df["Stay_Count"] = getCountofVar(test_df,"Stay_In_Current_City_Years")
test_df["Marital_Status_Count"] = getCountofVar(test_df,"Marital_Status")
test_df["PC1_Count"] = getCountofVar(test_df,"Product_Category_1")
test_df["PC2_Count"] = getCountofVar(test_df,"Product_Category_2")
test_df["PC3_Count"] = getCountofVar(test_df,"Product_Category_3")

In [59]:
test_df.fillna(0, inplace=True)

In [60]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "User_ID")
test_df["User_ID_Min_Purchase"] = min_price_list
test_df["User_ID_Max_Purchase"] = max_price_list
test_df["User_ID_Mean_Purchase"] = mean_price_list
test_df["User_ID_25Per_Purchase"] = twentyfive_price_list
test_df["User_ID_50Per_Purchase"] = fifty_price_list
test_df["User_ID_75Per_Purchase"] = seventyfive_price_list

In [61]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Product_ID")
test_df["Product_ID_Min_Purchase"] = min_price_list
test_df["Product_ID_Max_Purchase"] = max_price_list
test_df["Product_ID_Mean_Purchase"] = mean_price_list
test_df["Product_ID_25Per_Purchase"] = twentyfive_price_list
test_df["Product_ID_50Per_Purchase"] = fifty_price_list
test_df["Product_ID_75Per_Purchase"] = seventyfive_price_list

In [62]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df, train_df, "Gender")
test_df["Gender_Min_Purchase"] = min_price_list
test_df["Gender_Max_Purchase"] = max_price_list
test_df["Gender_Mean_Purchase"] = mean_price_list
test_df["Gender_25Per_Purchase"] = twentyfive_price_list
test_df["Gender_50Per_Purchase"] = fifty_price_list
test_df["Gender_75Per_Purchase"] = seventyfive_price_list

In [63]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Age")
test_df["Age_Min_Purchase"] = min_price_list
test_df["Age_Max_Purchase"] = max_price_list
test_df["Age_Mean_Purchase"] = mean_price_list
test_df["Age_25Per_Purchase"] = twentyfive_price_list
test_df["Age_50Per_Purchase"] = fifty_price_list
test_df["Age_75Per_Purchase"] = seventyfive_price_list

In [64]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Occupation")
test_df["Occupation_Min_Purchase"] = min_price_list
test_df["Occupation_Max_Purchase"] = max_price_list
test_df["Occupation_Mean_Purchase"] = mean_price_list
test_df["Occupation_25Per_Purchase"] = twentyfive_price_list
test_df["Occupation_50Per_Purchase"] = fifty_price_list
test_df["Occupation_75Per_Purchase"] = seventyfive_price_list

In [65]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "City_Category")
test_df["City_Min_Purchase"] = min_price_list
test_df["City_Max_Purchase"] = max_price_list
test_df["City_Mean_Purchase"] = mean_price_list
test_df["City_25Per_Purchase"] = twentyfive_price_list
test_df["City_50Per_Purchase"] = fifty_price_list
test_df["City_75Per_Purchase"] = seventyfive_price_list

In [66]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Stay_In_Current_City_Years")
test_df["Stay_Min_Purchase"] = min_price_list
test_df["Stay_Max_Purchase"] = max_price_list
test_df["Stay_Mean_Purchase"] = mean_price_list
test_df["Stay_25Per_Purchase"] = twentyfive_price_list
test_df["Stay_50Per_Purchase"] = fifty_price_list
test_df["Stay_75Per_Purchase"] = seventyfive_price_list

In [67]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Marital_Status")
test_df["Marital_Min_Purchase"] = min_price_list
test_df["Marital_Max_Purchase"] = max_price_list
test_df["Marital_Mean_Purchase"] = mean_price_list
test_df["Marital_25Per_Purchase"] = twentyfive_price_list
test_df["Marital_50Per_Purchase"] = fifty_price_list
test_df["Marital_75Per_Purchase"] = seventyfive_price_list

In [68]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Product_Category_1")
test_df["PC1_Min_Purchase"] = min_price_list
test_df["PC1_Max_Purchase"] = max_price_list
test_df["PC1_Mean_Purchase"] = mean_price_list
test_df["PC1_25Per_Purchase"] = twentyfive_price_list
test_df["PC1_50Per_Purchase"] = fifty_price_list
test_df["PC1_75Per_Purchase"] = seventyfive_price_list

In [69]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Product_Category_2")
test_df["PC2_Min_Purchase"] = min_price_list
test_df["PC2_Max_Purchase"] = max_price_list
test_df["PC2_Mean_Purchase"] = mean_price_list
test_df["PC2_25Per_Purchase"] = twentyfive_price_list
test_df["PC2_50Per_Purchase"] = fifty_price_list
test_df["PC2_75Per_Purchase"] = seventyfive_price_list

In [70]:
min_price_list, max_price_list, mean_price_list, twentyfive_price_list,fifty_price_list, seventyfive_price_list = getPurchaseStats(test_df,train_df, "Product_Category_3")
test_df["PC3_Min_Purchase"] = min_price_list
test_df["PC3_Max_Purchase"] = max_price_list
test_df["PC3_Mean_Purchase"] = mean_price_list
test_df["PC3_25Per_Purchase"] = twentyfive_price_list
test_df["PC3_50Per_Purchase"] = fifty_price_list
test_df["PC3_75Per_Purchase"] = seventyfive_price_list

In [71]:
test_df.to_csv(r'E:\Data hackatons\AV black friday sales\test_full_feature.csv',index=False)
#test_df = pd.read_csv(r'E:\Data hackatons\AV black friday sales\test_full_feature.csv')

In [72]:
#define test data
test_data = test_df.drop(columns=['User_ID','Product_ID'],axis=1)

In [73]:
test_data_selection = selection.transform(test_data)

In [74]:
#predict the test data
test_df["Purchase"] = xgb_selected_features.predict(test_data_selection)

In [76]:
IDcol = ['User_ID','Product_ID']
IDcol.append("Purchase")
submission = pd.DataFrame({ x: test_df[x] for x in IDcol})
submission.to_csv(r"E:\Data hackatons\AV black friday sales\submission_xgb1.csv", index=False)