# Predicting on Unseen Data
### Generating Submission File

I've done the EDA and trained the model in a separate notebook.<br>In this notebook I'm using that already-trained model, which was fitted on the provided training set.

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [54]:
# Load the test set; the path is relative to the notebook's working directory.
df = pd.read_csv('Data/test.csv')

In [55]:
# Sanity-check the size of the loaded frame as (rows, columns).
df.shape

(233599, 11)

In [56]:
# Preview the first rows to confirm the column names and raw value formats.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


Now, let's save the User IDs and Product IDs so that the submission file can be put into the proper format.

In [57]:
# Set the identifier columns aside so they can be written to the submission file later.
userIDs = df['User_ID']
prodIDs = df['Product_ID']

## Cleaning the data as per the inputs used for training the model.
#### (Putting the data into the proper format to feed into the trained model)

In [58]:
# Remove the Product_Category_3 column from the frame.
df = df.drop(columns=['Product_Category_3'])

In [59]:
# Replace missing Product_Category_2 values with 0.
df['Product_Category_2'] = df['Product_Category_2'].fillna(0)

In [60]:
# Normalise the open-ended '4+' bucket to 4 and store the column as integers.
# The result is assigned back to the frame instead of calling
# ``replace(..., inplace=True)`` on ``df.Stay_In_Current_City_Years``: that
# attribute access yields an intermediate Series, so under pandas
# Copy-on-Write the in-place edit never reaches ``df`` and the following
# ``astype(int)`` would fail on the untouched '4+' strings.
df['Stay_In_Current_City_Years'] = (
    df['Stay_In_Current_City_Years']
    .replace('4+', '4')
    .astype(int)
)

In [61]:
# Encode Gender as 0 (F) / 1 (M) in a single pass and assign the result back.
# Calling ``replace(..., inplace=True)`` on ``df.Gender`` edits an intermediate
# Series, which does not update ``df`` under pandas Copy-on-Write.
# The 0 and 1 pass-through keys keep this cell safe to re-run on an already
# encoded column. Any other label becomes NaN rather than passing through
# silently as a string.
gender_codes = {'F': 0, 'M': 1, 0: 0, 1: 1}
df['Gender'] = df['Gender'].map(gender_codes)

In [62]:
# Preview the frame after the cleaning steps to check the encoded values.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,1000004,P00128942,1,46-50,7,B,2,1,1,11.0
1,1000009,P00113442,1,26-35,17,C,0,0,3,5.0
2,1000010,P00288442,0,36-45,1,B,4,1,5,14.0
3,1000010,P00145342,0,36-45,1,B,4,1,4,9.0
4,1000011,P00053842,0,26-35,1,C,1,0,4,5.0


In [63]:
# Columns kept as model inputs, in this order. 'Age' and 'City_Category' are
# still categorical at this point.
features = [
    'Gender',
    'Occupation',
    'Stay_In_Current_City_Years',
    'Marital_Status',
    'Product_Category_1',
    'Product_Category_2',
    'Age',
    'City_Category',
]

In [64]:
# Keep only the feature columns, in the order given by `features`.
df = df[features]

In [65]:
# One-hot encode the two categorical columns; the remaining columns are left as they are.
# NOTE(review): the dummy columns produced depend on the categories present in this
# file — confirm they match the columns (and column order) the model was trained on.
df = pd.get_dummies(df, columns = ['Age', 'City_Category'])

In [66]:
# Confirm the column count after one-hot encoding.
df.shape

(233599, 16)

In [67]:
import sklearn
import pickle

In [68]:
from sklearn.metrics import r2_score

In [69]:
def rmse(ytrue, ypred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    ytrue : array-like
        Observed target values.
    ypred : array-like
        Predicted values; must be broadcastable against ``ytrue``.

    Returns
    -------
    float
        ``sqrt(mean((ytrue - ypred) ** 2))``.
    """
    # Coerce both inputs to float arrays so that plain Python lists work,
    # integer inputs cannot overflow when squared, and two pandas Series are
    # compared element by element instead of being re-aligned on index labels.
    residuals = np.asarray(ytrue, dtype=float) - np.asarray(ypred, dtype=float)
    return np.sqrt(np.mean(np.square(residuals)))

In [70]:
# Load the previously trained model from disk.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
# NOTE(review): a pickled scikit-learn model is presumably tied to the library version
# that wrote it — confirm this environment matches the one used for training.
with open('BlackF.pkl', 'rb') as f:
    model = pickle.load(f)

In [71]:
# Display the loaded estimator to inspect its steps and hyper-parameters.
model

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [72]:
# Re-check the feature matrix shape immediately before prediction.
df.shape

(233599, 16)

In [73]:
# Run the loaded model on the prepared feature frame.
# NOTE(review): presumably the model expects the same columns, in the same order,
# as at training time — verify against the training notebook.
predictions = model.predict(df)

In [74]:
# Check the number of predictions; it should equal the number of input rows.
len(predictions)

233599

In [75]:
# Map each submission column name to its values; the keyword order sets the column order.
subm_dict = dict(
    User_ID=userIDs,
    Product_ID=prodIDs,
    Purchase=predictions,
)

In [76]:
# Assemble the submission table and preview its first rows.
submissionDF = pd.DataFrame(data=subm_dict)
submissionDF.head()

Unnamed: 0,User_ID,Product_ID,Purchase
0,1000004,P00128942,14401.825592
1,1000009,P00113442,10668.373202
2,1000010,P00288442,6470.281772
3,1000010,P00145342,2543.264686
4,1000011,P00053842,2360.736391


In [77]:
# Write the submission to disk; index=False keeps the row index out of the CSV.
submissionDF.to_csv('Data/BlackFriday Submission1.csv', index=False)