In [2]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

import xgboost as xgb
import lightgbm as lgb 
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [3]:
# Prep data
%run 'data-prep.ipynb'

In [4]:
print(train_features.shape)
print(train_outcome.shape)
print(test_features.shape)

(8523, 36)
(8523,)
(5681, 36)


# Model Training

In [7]:
# Below are just identifiers which we dont want to fit.
IDcols = ['Item_Identifier', 'Outlet_Identifier']

train_predictors = train_features.set_index(IDcols)
test_predictors = test_features.set_index(IDcols)

print(train_predictors.shape)
print(test_predictors.shape)

(8523, 34)
(5681, 34)


In [9]:
def train_model(clf, param_grid, train_predictors, train_outcome):  
    pipe = make_pipeline(MinMaxScaler(), clf)
    
    grid_search = GridSearchCV(
        pipe,
        param_grid,
        n_jobs=-1
    )
    grid_search.fit(train_predictors, train_outcome.round())
    
    print('BEST SCORE', grid_search.best_score_)
    print('BEST PARAM', grid_search.best_params_)
    
    return grid_search.best_estimator_

## XGBoost

Gradient boosting algorithms surprisingly did very poor job when there are not many features

In [72]:
xgb_param_grid = {}
# xgb_param_grid = {'xgbclassifier__n_estimators': np.arange(150, 200)}

xgb_model = train_model(
    xgb.XGBClassifier(),
    xgb_param_grid,
    train_predictors, 
    train_outcome
)

## Random Forest

In [94]:
rf_param_grid = {}

rf_model = train_model(
    RandomForestClassifier(),
    rf_param_grid,
    train_predictors, 
    train_outcome
)



BEST SCORE 0.005162501466619735
BEST PARAM {}


## Decision Tree

So far, this model gives us the best score: **1426**

In [96]:
# dt_param_grid = {'decisiontreeclassifier__max_depth': np.arange(6, 10)}
# dt_param_grid = {'decisiontreeclassifier__min_samples_split': np.arange(2, 4)}
# dt_param_grid = {'decisiontreeclassifier__min_samples_leaf': np.arange(1, 4)}
# dt_param_grid = {'decisiontreeclassifier__max_leaf_nodes': np.arange(3, 4)}
dt_param_grid = {}

dt_model = train_model(
    DecisionTreeClassifier(max_depth=9, min_samples_split=2, min_samples_leaf=3, criterion="entropy"),
    dt_param_grid,
    train_predictors, 
    train_outcome
)



BEST SCORE 0.016543470608940514
BEST PARAM {}


## K-Nearest Neighbor

In [100]:
knn_param_grid = {}

knn_model = train_model(
    KNeighborsClassifier(n_jobs=-1),
    knn_param_grid,
    train_predictors, 
    train_outcome
)



BEST SCORE 0.0021119324181626186
BEST PARAM {}


## Light GBM

In [None]:
lgbm_param_grid = {}

lgbm_model = train_model(
    lgb.LGBMClassifier(),
    lgbm_param_grid,
    train_predictors, 
    train_outcome
)

## SVM

In [5]:
svm_param_grid = {}

lgbm_model = train_model(
    svm.SVC(),
    svm_param_grid,
    train_predictors, 
    train_outcome
)

## Linear Regression

In [11]:
lr_param_grid = {}

lr_model = train_model(
    LinearRegression(),
    lr_param_grid,
    train_predictors, 
    train_outcome
)



BEST SCORE 0.5603016618806848
BEST PARAM {}




# Submission

In [13]:
def submit(test_predictors, model, filename):
    preds = model.predict(test_predictors)
    
    test_predictors_copy = test_predictors.copy()
    test_predictors_copy['Item_Outlet_Sales'] = preds
    
    submission = test_predictors_copy['Item_Outlet_Sales']

    filename = 'submission-' + filename + '.csv'
    submission.to_csv(filename, header=True)

In [None]:
# XGBoost Submission
submit(test_features, test_predictors, xgb_model, 'xgboost')

In [46]:
# Random Forest Submission
submit(test_predictors, rf_model, 'random-forest')

In [79]:
# Decision Tree
submit(test_predictors, dt_model, 'decision-tree')

In [15]:
# KNN
submit(test_predictors, knn_model, 'knn')

In [16]:
# Linear Regression
submit(test_predictors, lr_model, 'linear-regression')