# Test Dataset Prediction
* Predictions on the test data using models built on the per-category mean values of demand units and revenue.

## Importing required libraries

In [0]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import dateutil.parser as dparser
import random
from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Mount Google drive

In [2]:
# Mount Google Drive inside the Colab runtime and derive the folder that
# holds the input CSVs, saved models and generated prediction files.
from google.colab import drive

MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)
path = MOUNT_POINT + '/My Drive/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Import test data

In [3]:
# Read the raw test set from Drive; the first row of the file is the header.
test_csv = path + 'test-data.csv'
test = pd.read_csv(test_csv, header=0)
test.head()

Unnamed: 0,id,experiment_week,channel_type,supplier_identifier,category_of_route,store_identifier,product_identifier
0,1,Week 6 of 2019,Warehouse Retailers,supplier_identifier_055f7,route_67047,store_fc6aa,product_cbfad
1,2,Week 6 of 2019,Warehouse Retailers,supplier_identifier_055f7,route_67047,store_fc6aa,product_2b363
2,3,Week 6 of 2019,Warehouse Retailers,supplier_identifier_055f7,route_67047,store_fc6aa,product_f79b8
3,4,Week 6 of 2019,Warehouse Retailers,supplier_identifier_055f7,route_67047,store_fc6aa,product_8de14
4,5,Week 6 of 2019,Warehouse Retailers,supplier_identifier_055f7,route_67047,store_fc6aa,product_1c00e


## Data pre-processing

In [4]:
# Work on an independent copy so the raw `test` frame is left untouched
# by the pre-processing steps that follow.
data = test.copy(deep=True)
data.shape

(20815581, 7)

### Importing and merging the columns
* Importing the mean-value columns generated for the categorical columns and merging them into the dataset.

In [5]:
# For every categorical column, attach the value stored in '<column>_mean.csv'
# as a new '<column>_value' feature.
cols = ['channel_type','product_identifier','category_of_route','supplier_identifier','store_identifier', 'experiment_week']
for c in cols:
  print(c+'...Processing')
  value_col = c+'_value'
  # Use the shared `path` prefix instead of repeating the Drive location.
  # After indexing by the category column the lookup must have exactly one
  # remaining column, which is renamed to the feature name.
  temp = pd.read_csv(path+c+'_mean.csv', header=0).set_index(c)
  temp.columns = [value_col]
  # Left merge keeps every test row. `validate` raises if a category level
  # appears more than once in the lookup, which would otherwise silently
  # duplicate test rows.
  data = data.merge(temp, right_index=True, left_on=c, how='left', validate='many_to_one')
  # Category levels missing from the lookup receive the mean of the lookup values.
  data[value_col] = data[value_col].fillna(temp[value_col].mean())
  print('Done')
  del temp

channel_type...Processing
Done
product_identifier...Processing
Done
category_of_route...Processing
Done
supplier_identifier...Processing
Done
store_identifier...Processing
Done
experiment_week...Processing
Done


In [6]:
# For every categorical column, attach the value stored in
# '<column>_mean_revenue.csv' as a new '<column>_value_revenue' feature.
cols = ['channel_type','product_identifier','category_of_route','supplier_identifier','store_identifier', 'experiment_week']
for c in cols:
  print(c+'...Processing')
  revenue_col = c+'_value_revenue'
  # Use the shared `path` prefix instead of repeating the Drive location.
  # After indexing by the category column the lookup must have exactly one
  # remaining column, which is renamed to the feature name.
  temp = pd.read_csv(path+c+'_mean_revenue.csv', header=0).set_index(c)
  temp.columns = [revenue_col]
  # Left merge keeps every test row. `validate` raises if a category level
  # appears more than once in the lookup, which would otherwise silently
  # duplicate test rows.
  data = data.merge(temp, right_index=True, left_on=c, how='left', validate='many_to_one')
  # Category levels missing from the lookup receive the mean of the lookup values.
  data[revenue_col] = data[revenue_col].fillna(temp[revenue_col].mean())
  print('Done')
  del temp

channel_type...Processing
Done
product_identifier...Processing
Done
category_of_route...Processing
Done
supplier_identifier...Processing
Done
store_identifier...Processing
Done
experiment_week...Processing
Done


### Converting numAttributes to int16 datatype
* To save RAM, the numAttributes columns are cast to the int16 datatype; the cast drops the fractional part of each value (truncation, not rounding).

In [0]:
# Names of the merged mean-value feature columns.
numAttributes = [
    'experiment_week_value',
    'channel_type_value',
    'product_identifier_value',
    'category_of_route_value',
    'supplier_identifier_value',
    'store_identifier_value',
]
# The revenue features share the same names with a '_revenue' suffix.
numAttributesRevenue = [name + '_revenue' for name in numAttributes]


In [9]:
# Cast every merged value column to int16 to shrink the frame's memory footprint.
# NOTE(review): astype drops the fractional part instead of rounding, and
# values outside the int16 range would not be preserved. Presumably the
# training pipeline applied the same cast -- confirm before changing it.
numCol = numAttributes + numAttributesRevenue
# The loop variable is deliberately not called `cols`: that name holds a list
# of column names in the other cells and must not be overwritten by a string.
for col in numCol:
  # Column-by-column conversion avoids materialising a second copy of all
  # twelve columns at once.
  data[col] = data[col].astype('int16')
data.dtypes

id                                    int64
experiment_week                      object
channel_type                         object
supplier_identifier                  object
category_of_route                    object
store_identifier                     object
product_identifier                   object
channel_type_value                    int16
product_identifier_value              int16
category_of_route_value               int16
supplier_identifier_value             int16
store_identifier_value                int16
experiment_week_value                 int16
channel_type_value_revenue            int16
product_identifier_value_revenue      int16
category_of_route_value_revenue       int16
supplier_identifier_value_revenue     int16
store_identifier_value_revenue        int16
experiment_week_value_revenue         int16
dtype: object

### Dropping column ID

In [0]:
# Keep the row identifiers in their own one-column frame; they are joined back
# onto the predictions when the submission file is written.
# NOTE: this name shadows the built-in `id`, but createSubmissionFile reads it.
id = data[['id']]

In [0]:
# The identifier is not a model feature, so remove it from the working frame.
data.drop(columns=['id'], inplace=True)

### Generate new dataset with value columns imported

In [12]:
# Keep only the numeric value columns (unit features first, then revenue
# features); the original categorical columns are discarded here.
cols = numAttributes + numAttributesRevenue
data = data[cols]
data.head()

Unnamed: 0,experiment_week_value,channel_type_value,product_identifier_value,category_of_route_value,supplier_identifier_value,store_identifier_value,experiment_week_value_revenue,channel_type_value_revenue,product_identifier_value_revenue,category_of_route_value_revenue,supplier_identifier_value_revenue,store_identifier_value_revenue
0,7,15,2,18,15,5,1026,2666,370,4172,2522,572
1,7,15,3,18,15,5,1026,2666,397,4172,2522,572
2,7,15,3,18,15,5,1026,2666,366,4172,2522,572
3,7,15,3,18,15,5,1026,2666,488,4172,2522,572
4,7,15,5,18,15,5,1026,2666,738,4172,2522,572


### Drop Column 'experiment_week_value' and 'experiment_week_value_revenue'
* Dropping columns 'experiment_week_value' and 'experiment_week_value_revenue' as they are nearly constant.

In [0]:
# Remove both week-level features in a single call.
week_columns = ['experiment_week_value', 'experiment_week_value_revenue']
data.drop(columns=week_columns, inplace=True)

In [14]:
# Feature matrix with every remaining column; createSubmissionFile reads
# this module-level name when it predicts.
X_test = data.copy()
X_test.shape

(20815581, 10)

## Generating Prediction File

In [0]:
def createSubmissionFile(m):
  """Score the current ``X_test`` with a saved model and write a submission CSV.

  Loads the pickled model at ``path + m + '.sav'``, predicts on the
  module-level ``X_test``, pairs the predictions with the module-level ``id``
  frame and writes the columns ``id`` and ``demand_projection`` to
  ``path + m + '.csv'``.

  Parameters
  ----------
  m : str
      Base name (without extension) of the saved model; the output CSV
      uses the same base name.

  Raises
  ------
  ValueError
      If the number of predictions differs from the number of ids.
  """
  # NOTE(review): unpickling can execute arbitrary code -- only load model
  # files that were produced by a trusted training run.
  # The context manager closes the file handle even if unpickling fails.
  with open(path+m+'.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
  y_predicted = loaded_model.predict(X_test)
  print('Predicted')

  demand_projection = pd.DataFrame(y_predicted)
  if len(demand_projection) != len(id):
    raise ValueError('Number of predictions (%d) does not match number of ids (%d)'
                     % (len(demand_projection), len(id)))
  # concat aligns on index labels; resetting the id index pairs ids and
  # predictions strictly by row position instead of producing NaN rows when
  # the id frame carries a non-default index.
  prediction = pd.concat([id.reset_index(drop=True), demand_projection],axis=1)
  prediction.columns = ['id', 'demand_projection']
  prediction.to_csv(path+m+'.csv',index=False)
  print('Prediction saved')

In [16]:
# Score the saved decision-tree model on the current X_test.
# 'linearRegressionUnitRevenue' was previously listed here as well and is
# currently disabled.
savedModels = ['decisionTreeUnitRevenue']
for model_name in savedModels:
  createSubmissionFile(model_name)

Predicted
Prediction saved


In [17]:
# Feature subset used with the random-forest model: the five unit-value
# columns plus the supplier revenue value.
rfCols = [
    'channel_type_value',
    'product_identifier_value',
    'category_of_route_value',
    'supplier_identifier_value',
    'store_identifier_value',
    'supplier_identifier_value_revenue',
]

# Rebind the module-level X_test so createSubmissionFile predicts on this subset.
X_test = data[rfCols]
X_test.shape

(20815581, 6)

In [18]:
# Score the saved random-forest model on the reduced X_test.
savedModels = ['randomForestUnitRevenue']
for model_name in savedModels:
  createSubmissionFile(model_name)

Predicted
Prediction saved
