In [1]:
# To support python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# To plot pretty figures
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To make output stable across all runs
np.random.seed(42)
    
# Ignore useless warning
import warnings
warnings.filterwarnings(action='ignore', message="^internal gelsd")

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Returns
0,DRA12,BABATUNJI010,DRA12_BABATUNJI010,11.6,Low Sugar,0.068535,Soft Drinks,357.54,2005,,Cluster 3,Grocery Store,709.08
1,DRA12,BABATUNJI013,DRA12_BABATUNJI013,11.6,Low Sugar,0.040912,Soft Drinks,355.79,1994,High,Cluster 3,Supermarket Type1,6381.69
2,DRA12,BABATUNJI017,DRA12_BABATUNJI017,11.6,Low Sugar,0.041178,Soft Drinks,350.79,2014,,Cluster 2,Supermarket Type1,6381.69
3,DRA12,BABATUNJI018,DRA12_BABATUNJI018,11.6,Low Sugar,0.041113,Soft Drinks,355.04,2016,Medium,Cluster 3,Supermarket Type2,2127.23
4,DRA12,BABATUNJI035,DRA12_BABATUNJI035,11.6,Ultra Low Sugar,0.0,Soft Drinks,354.79,2011,Small,Cluster 2,Supermarket Type1,2481.77


In [4]:
train.tail()

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Returns
4985,NCZ54,BABATUNJI019,NCZ54_BABATUNJI019,,Low Sugar,0.145952,Household,402.39,1992,Small,Cluster 1,Grocery Store,406.14
4986,NCZ54,BABATUNJI027,NCZ54_BABATUNJI027,,Low Sugar,0.082956,Household,410.14,1992,Medium,Cluster 3,Supermarket Type3,13808.69
4987,NCZ54,BABATUNJI045,NCZ54_BABATUNJI045,14.65,Low Sugar,0.083528,Household,406.14,2009,,Cluster 2,Supermarket Type1,5685.93
4988,NCZ54,BABATUNJI046,NCZ54_BABATUNJI046,14.65,Low Sugar,0.083359,Household,404.89,2004,Small,Cluster 1,Supermarket Type1,11778.0
4989,NCZ54,BABATUNJI049,NCZ54_BABATUNJI049,14.65,Low Sugar,0.083489,Household,403.39,2006,Medium,Cluster 1,Supermarket Type1,17870.07


# Prepare Data For Machine Learning

In [5]:
from sklearn import preprocessing

# Concatinating train and test for easy featuring
ntrain = train.shape[0]
ntest = test.shape[0]

# Get data target variable
lab_enc = preprocessing.LabelEncoder()
y = lab_enc.fit_transform(train['Item_Store_Returns'])

all_data = pd.concat((train, test)).reset_index(drop=True)

# Drop Target variable
all_data.drop(['Item_Store_Returns', 'Item_ID'], axis=1, inplace=True)

print("Total data size is: {}".format(all_data.shape))

Total data size is: (8522, 11)


In [6]:
all_data.isna().sum()

Store_ID                  0
Item_Store_ID             0
Item_Weight            1463
Item_Sugar_Content        0
Item_Visibility           0
Item_Type                 0
Item_Price                0
Store_Start_Year          0
Store_Size             2409
Store_Location_Type       0
Store_Type                0
dtype: int64

In [7]:
# Handling missing values
for column in all_data.columns:
    if all_data[column].dtypes == 'O':
        all_data[column].fillna(all_data[column].mode()[0], inplace=True)
    elif all_data[column].dtypes != 'O':
        all_data[column].fillna(all_data[column].median(), inplace=True)

In [8]:
all_data.isna().sum()

Store_ID               0
Item_Store_ID          0
Item_Weight            0
Item_Sugar_Content     0
Item_Visibility        0
Item_Type              0
Item_Price             0
Store_Start_Year       0
Store_Size             0
Store_Location_Type    0
Store_Type             0
dtype: int64

In [9]:
all_data['Item_price_sqrt'] = np.sqrt(all_data['Item_Price'])

all_data['cross_price_weight'] = all_data['Item_Price'] * all_data['Item_Weight']

In [10]:
# Encoding Categorical features
one_hot_cols = [column for column in all_data.columns[2:] if all_data[column].dtypes == 'O']
label_cols = [column for column in all_data.columns[0: 2]]
label_cols

['Store_ID', 'Item_Store_ID']

In [11]:
one_hot_cols

['Item_Sugar_Content',
 'Item_Type',
 'Store_Size',
 'Store_Location_Type',
 'Store_Type']

In [12]:
# Applying one hot encoding to one_hot_cols
all_data = pd.get_dummies(all_data, prefix_sep='_', columns=one_hot_cols)

# Apply label encoding to label_cols
for col in label_cols:
    all_data[col] = all_data[col].factorize()[0]

In [13]:
from sklearn import preprocessing

# Get data target variable
lab_enc = preprocessing.LabelEncoder()
all_data_enc = lab_enc.fit_transform(all_data.columns)

In [14]:
train = all_data[:ntrain]
test = all_data[ntrain:]

print('Train size: {}------- Test size: {}'.format(train.shape, test.shape))

Train size: (4990, 37)------- Test size: (3532, 37)


In [15]:
train.head()

Unnamed: 0,Store_ID,Item_Store_ID,Item_Weight,Item_Visibility,Item_Price,Store_Start_Year,Item_price_sqrt,cross_price_weight,Item_Sugar_Content_Low Sugar,Item_Sugar_Content_Normal Sugar,...,Store_Size_High,Store_Size_Medium,Store_Size_Small,Store_Location_Type_Cluster 1,Store_Location_Type_Cluster 2,Store_Location_Type_Cluster 3,Store_Type_Grocery Store,Store_Type_Supermarket Type1,Store_Type_Supermarket Type2,Store_Type_Supermarket Type3
0,0,0,11.6,0.068535,357.54,2005,18.908728,4147.464,1,0,...,0,1,0,0,0,1,1,0,0,0
1,1,1,11.6,0.040912,355.79,1994,18.862396,4127.164,1,0,...,1,0,0,0,0,1,0,1,0,0
2,2,2,11.6,0.041178,350.79,2014,18.729389,4069.164,1,0,...,0,1,0,0,1,0,0,1,0,0
3,3,3,11.6,0.041113,355.04,2016,18.842505,4118.464,1,0,...,0,1,0,0,0,1,0,0,1,0
4,4,4,11.6,0.0,354.79,2011,18.83587,4115.564,0,0,...,0,0,1,0,1,0,0,1,0,0


# Select and Train a Model

In [16]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

#scaler = StandardScaler()
#Fit on train set
#scaler.fit(train)
#Apply transform to trainng and test set
#train = scaler.transform(train)
#test = scaler.transform(test)

#poly_features = PolynomialFeatures(degree=2, include_bias=False)
#X_poly_train = poly_features.fit_transform(X_train)

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [19]:
lin_reg.predict(train[:20])

array([ 403.81151193, 1194.82207792, 1358.27818077, 1163.74023906,
       1387.15175585, 1307.91880506,  562.88481949, 1350.23920825,
       1536.94567081,  282.19451788, 2072.37045436, 1538.26440316,
       1486.19549402, 1632.58249349, 1422.18140849,  377.75648058,
       2168.75689445, 1518.03560631, 1557.93006243, 2163.09994546])

In [20]:
lin_reg.score(X_test, y_test)

0.5731801342670406

In [21]:
forest_reg = RandomForestRegressor(n_jobs=-1, n_estimators=1000, random_state=42)
forest_reg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [22]:
forest_reg.score(X_test, y_test)

0.5189230204250141

In [55]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

gbrt = GradientBoostingRegressor(max_depth=2, learning_rate=0.01, warm_start=True)

min_val_err = float('inf')
error_going_up = 0
for n_estimators in range(1, 2000):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_test)
    val_err = mean_squared_error(y_test, y_pred)
    if val_err < min_val_err:
        min_val_err = val_err
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping

In [56]:
gbrt.score(X_test, y_test)

0.5784418910062415

In [65]:
gbrt.predict(train[:20])

array([ 289.68082204, 1171.49908026, 1181.00447925, 1130.34428845,
       1233.80981548, 1233.80981548,  342.05811222, 1423.33753046,
       1485.64826568,  335.7115829 , 2027.27604099, 1485.64826568,
       1475.50357628, 1621.17564461, 1517.71011758,  340.94182942,
       2243.70041632, 1575.67002708, 1619.11082523, 2277.57641786])

In [66]:
predictions = gbrt.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

458.6416849633456

In [59]:
y_pred = gbrt.predict(test)

In [60]:
# Loading submission dataset
submission = pd.read_csv('SampleSubmission.csv')
submission.head()

Unnamed: 0,Item_Store_ID,Item_Store_Returns
0,DRA59_BABATUNJI010,100
1,DRA59_BABATUNJI013,100
2,DRB01_BABATUNJI013,100
3,DRB13_BABATUNJI010,100
4,DRB13_BABATUNJI013,100


In [61]:
submission['Item_Store_Returns'] = y_pred

In [62]:
submission.to_csv('second_submission.csv', index=False)

In [63]:
sub = pd.read_csv('first_submission.csv')
sub.tail()

Unnamed: 0,Item_Store_ID,Item_Store_Returns
3527,NCZ42_BABATUNJI010,557.07009
3528,NCZ42_BABATUNJI013,1881.944218
3529,NCZ42_BABATUNJI049,1947.190489
3530,NCZ53_BABATUNJI010,643.196103
3531,NCZ53_BABATUNJI013,1859.421581
