### Import packages

In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
from  sklearn.model_selection import train_test_split
from ml_metrics import rmsle as metric
import datetime
import time

def rmsle(true, labels):
    """Custom XGBoost evaluation metric: root mean squared logarithmic error.

    The parameter names are historical. XGBoost calls a ``feval`` as
    ``feval(predictions, dmatrix)``, so ``true`` receives the model
    predictions and ``labels`` the object whose ``get_label()`` returns the
    ground-truth targets.

    Parameters
    ----------
    true : array-like
        Predicted values, one per row.
    labels : object with ``get_label()``
        Evaluation data; ``get_label()`` must return the target values.

    Returns
    -------
    tuple
        ``('rmsle', value)`` where ``value`` is a Python float.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels
        (previously this case silently returned ``None``).
    """
    predicted = np.asarray(true, dtype=float)
    actual = np.asarray(labels.get_label(), dtype=float)
    if len(predicted) != len(actual):
        raise ValueError(
            "rmsle: got %d predictions for %d labels" % (len(predicted), len(actual))
        )
    # Negative values are clamped to zero so the logarithm stays defined.
    # np.maximum works on copies, so neither input array is modified.
    predicted = np.maximum(predicted, 0)
    actual = np.maximum(actual, 0)
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return 'rmsle', float(np.sqrt(np.mean(log_diff ** 2)))

### Load train data

In [None]:
# Read only the columns used for modelling and store them in the narrow
# unsigned-integer dtypes listed in `types`.
# Author's measurement: full table 6.1Gb, this version 1.1Gb (-82%).
start_time = time.time()
types = {'Semana':np.uint8,'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,
         'Ruta_SAK':np.uint16, 'Cliente_ID':np.uint32, 'Producto_ID':np.uint16,
         'Demanda_uni_equil':np.uint32}

train = pd.read_csv('../../data/kaggle/train.csv', usecols=list(types), dtype=types)
# To experiment on a sample, additionally pass e.g. nrows=7000000 to read_csv.
print("Elapsed time overall: %s seconds" % (time.time() - start_time))
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
train.info(memory_usage=True)

### Look at the shape of the loaded data set
`train.shape` gives the number of rows and columns of the loaded data set.

In [None]:
# (rows, columns) of the loaded training frame.
train_shape = train.shape
print('Training_Shape:', train_shape)

### Print 2 rows of the train data set

In [None]:
# Preview the first two rows of the training frame.
train.head(2)

### Split train data set 
We split the training data into a training set (59344371 rows) and a hold-out set (14836093 rows) that is used for validation.

In [None]:
# The Kaggle test file is read to learn which feature columns are available
# at prediction time; its 'id' column is kept aside for the submission file.
test = pd.read_csv('../../data/kaggle/test.csv')
ids = test['id']
test = test.drop('id', axis=1)
feature_columns = test.columns.values

# Target: raw demand (y1) and its log(x + 1) transform (y) used for training.
y1 = train['Demanda_uni_equil']
y = np.log(y1 + 1)

# Features: exactly the columns that also exist in the test file.
X = train[feature_columns]

# 80/20 split with a fixed seed so the hold-out set is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2019
)

print('Division_Set_Shapes:', X.shape, y.shape)
print('Validation_Set_Shapes:', X_train.shape, X_test.shape)


In [None]:
# Preview the first two rows of the feature frame.
X.head(2)

In [None]:
# XGBoost booster settings.
# `booster` used to be assigned to a stand-alone variable only, so it never
# reached the parameter dict; it is now passed explicitly.
booster = "gbtree"
params = {
    # NOTE(review): "reg:linear" is the older name of the squared-error
    # objective; recent XGBoost releases call it "reg:squarederror".
    'objective': "reg:linear",
    'booster': booster,
    'eta': 0.03,               # other values tried: 0.015, 0.025, 0.04
    'max_depth': 10,
    'subsample': 0.9,          # previously tried: 0.8
    'colsample_bytree': 0.7,   # previously tried: 0.6
    'silent': True,
}


In [None]:
print ('Constructing matrix')

# XGBoost DMatrix construction is left disabled in this cell:
#xg_train = xgb.DMatrix(X_train, label=y_train)
#xg_test = xgb.DMatrix(X_test)

# LightGBM datasets for the train / hold-out split.
train_data = lgb.Dataset(X_train, label=y_train)
# A validation Dataset should reference the training Dataset so that both
# are binned with the same feature discretisation.
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


In [None]:
# LightGBM settings (first variant).
# The target `y` is a continuous log-transformed demand, so the objective has
# to be a regression one; 'binary' with 'binary_logloss' expects 0/1 labels.
# NOTE(review): `params` is reassigned by the following cells before training,
# so on a top-to-bottom run only the last definition takes effect.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100

In [None]:
# Hyper-parameters (second LightGBM variant).
params = dict(
    metric='rmse',
    boosting_type='gbdt',
    colsample_bytree=0.9234,
    num_leaves=13,
    max_depth=-1,
    n_estimators=200,
    min_child_samples=399,
    min_child_weight=0.1,
    reg_alpha=2,
    reg_lambda=5,
    subsample=0.855,
    verbose=-1,
    num_threads=4,
)




In [None]:
# LightGBM settings (third variant): RMSE regression with a small learning
# rate and row/feature subsampling.
params = dict(
    objective="regression",
    metric="rmse",
    n_estimators=10000,
    early_stopping_rounds=100,
    num_leaves=30,
    learning_rate=0.01,
    bagging_fraction=0.9,
    feature_fraction=0.3,
    bagging_seed=0,
)

## Train with LightGBM

In [None]:
# Fit the LightGBM booster with the current `params` and report wall time.
# NOTE(review): `params` also carries n_estimators / early_stopping_rounds;
# LightGBM documents these as aliases of the arguments passed below - confirm
# which values are meant to apply.
start_time = time.time()
lgbm = lgb.train(
    params,
    train_data,
    num_boost_round=2500,
    valid_sets=valid_data,
    early_stopping_rounds=30,
    verbose_eval=10,
)
elapsed_seconds = time.time() - start_time
print("Elapsed time overall: %s seconds" % elapsed_seconds)

In [None]:
# Predict the hold-out features. The model was fit on y = log(demand + 1),
# so these predictions are on that log scale as well.
y_hat = lgbm.predict(X_test)


In [None]:
import pickle

# Save the trained LightGBM model to disk.
filename = 'lgbmlassifier15nov-2.sav'
# The context manager flushes and closes the file even if pickling fails;
# a bare open() left the handle for the garbage collector to close.
with open(filename, 'wb') as model_file:
    pickle.dump(lgbm, model_file)

In [None]:
import pickle

# Load the saved LightGBM model back from disk.
filename = 'lgbmlassifier15nov-2.sav'
# SECURITY: unpickling can execute arbitrary code - only load files that were
# written by this notebook.
# The context manager closes the file handle once the model is read.
with open(filename, 'rb') as model_file:
    lgbm = pickle.load(model_file)
result = lgbm
print(lgbm)

In [None]:
import shap

# SHAP values of the LightGBM model for every row of the hold-out features.
tree_explainer = shap.TreeExplainer(lgbm)
shap_values = tree_explainer.shap_values(X_test)

In [None]:
# Summary plot of the SHAP values against the hold-out features they were
# computed for.
shap.summary_plot(shap_values, X_test)

In [None]:
# Bar summary of the SHAP values. `shap_values` was computed for X_test, so
# the feature frame passed here must be X_test as well; passing the full
# frame X gave a row-count mismatch between the two arguments.
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# SHAP dependence plot for the Producto_ID feature on the hold-out rows.
shap.dependence_plot("Producto_ID", shap_values, X_test)

In [None]:
# SHAP dependence plot for the Canal_ID feature on the hold-out rows.
shap.dependence_plot("Canal_ID", shap_values, X_test)

In [None]:
# SHAP dependence plot for the Ruta_SAK feature on the hold-out rows.
shap.dependence_plot("Ruta_SAK", shap_values, X_test)

In [None]:
# Recompute SHAP values on the full feature frame X; this replaces the
# X_test-based `shap_values` used by the plots above.
# NOTE(review): X holds every training row, so this is likely expensive - confirm it is needed.
shap_values = shap.TreeExplainer(lgbm).shap_values(X)

In [None]:
# Load SHAP's JavaScript visualisation code into the notebook.
shap.initjs()

In [None]:
# Explain the model's predictions with SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models).
import shap

# Load the JS visualization code into the notebook.
shap.initjs()

explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X)

# Force plot explaining the first prediction.
first_row_shap = shap_values[0, :]
first_row_features = X.iloc[0, :]
shap.force_plot(explainer.expected_value, first_row_shap, first_row_features)

## Train with XGBoost

In [None]:
# Gradient-boosted regression with XGBoost on the train / hold-out split.

start_time = time.time()

# Everything xgb.train needs is built in this cell so it runs top to bottom:
# at this point `params` holds LightGBM settings, the DMatrix construction
# earlier in the notebook is commented out, and watchlist / num_rounds are
# only assigned in a later cell.
xgb_params = {
    'objective': "reg:linear",
    'booster': "gbtree",
    'eta': 0.03,
    'max_depth': 10,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'silent': True,
}
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
# Early stopping follows the last watchlist entry, so the hold-out set goes last.
watchlist = [(xg_train, 'train'), (xg_test, 'valid')]
# NOTE(review): no round count was defined before this cell; 200 mirrors
# `rounds` in the final cell of the notebook - tune as needed.
num_rounds = 200

# The target is log(demand + 1), so XGBoost's default RMSE on it already is
# the RMSLE of the demand; the custom `rmsle` feval would apply the logarithm
# a second time and is therefore not used here.
xgclassifier = xgb.train(xgb_params, xg_train, num_rounds, watchlist,
                         early_stopping_rounds=10, verbose_eval=True)
# best_ntree_limit equals best_iteration + 1. Passing best_iteration dropped
# the best tree, and a value of 0 means "use all trees".
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_ntree_limit)

# Score in demand units: undo the log transform on both sides first.
print('rmsle:', metric(np.expm1(y_test), np.clip(np.expm1(preds), 0, None)))
print("Elapsed time overall: %s seconds" % (time.time() - start_time))

## Save model to disk

In [None]:
import pickle

# Save the trained XGBoost model to disk.
filename = 'xgclassifier14nov.sav'
# The context manager flushes and closes the file even if pickling fails;
# a bare open() left the handle for the garbage collector to close.
with open(filename, 'wb') as model_file:
    pickle.dump(xgclassifier, model_file)


In [None]:
import pickle

# Load the saved XGBoost model back from disk.
filename = 'xgclassifier14nov.sav'
# SECURITY: unpickling can execute arbitrary code - only load files that were
# written by this notebook.
# The context manager closes the file handle once the model is read.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model
print(result)

In [None]:
# Predict the hold-out DMatrix with the model reloaded from disk.
# NOTE(review): the tree limit 29 is hard-coded here while the later test-set
# prediction cell uses 39 - confirm which value matches the saved model.
predsny = loaded_model.predict(xg_test, ntree_limit=29)
print(predsny)

## Train on all training data

In [None]:
# XGBoost regression trained with all training data.

start_time = time.time()
test_preds = np.zeros(test.shape[0])

# XGBoost settings are defined here because `params` holds LightGBM settings
# at this point of the notebook.
xgb_params = {
    'objective': "reg:linear",
    'booster': "gbtree",
    'eta': 0.03,
    'max_depth': 10,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'silent': True,
}
trainData = xgb.DMatrix(X, label=y)
watchlist = [(trainData, 'train')]

num_rounds = 2

# `evalerror` is not defined anywhere in this notebook. The target is
# log(demand + 1), so the default RMSE on it already is the RMSLE of the
# demand and no custom feval is needed.
xgclassifier = xgb.train(xgb_params, trainData, num_rounds, watchlist,
                         early_stopping_rounds=10, verbose_eval=True)

xg_test = xgb.DMatrix(X_test)
# best_ntree_limit equals best_iteration + 1; passing best_iteration dropped
# the best tree, and a value of 0 means "use all trees".
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_ntree_limit)

# `rmsle` is an XGBoost feval that calls get_label() on its second argument,
# so it cannot score two plain arrays; the ml_metrics function imported as
# `metric` is used instead, on values converted back to demand units.
# The X_test rows are part of X, so this score is optimistic.
print ('RMSLE Score:', metric(np.expm1(y_test), np.clip(np.expm1(preds), 0, None)))

print("Elapsed time overall: %s seconds" % (time.time() - start_time))

## Predict

In [None]:
# Predict the Kaggle test set with the model reloaded from disk.
start_time2 = time.time()
test = pd.read_csv('../../data/kaggle/test.csv')
test_id = test['id']
test = test.drop(['id'],axis = 1)
test_preds = np.zeros(test.shape[0])
unlabeled_test = xgb.DMatrix(test)
# NOTE(review): the tree limit 39 is hard-coded, while the hold-out check of
# the same loaded model uses 29 - confirm the intended value.
fold_preds = loaded_model.predict(unlabeled_test, ntree_limit=39)
test_preds += fold_preds
# The model predicts log(demand + 1). Invert that first, then clamp at zero
# and round to whole units: rounding the log-scale values before inversion
# collapsed the predictions onto a handful of values.
res = np.around(np.clip(np.expm1(test_preds), 0, None), decimals = 0)

print("Elapsed time overall: %s seconds" % (time.time() - start_time2))

## Save for submission

In [None]:
# Write the submission file for the loaded model's predictions.
# A bare `id` is Python's built-in function; the ids of the test rows were
# stored in `test_id` when the test file was read.
submission = pd.DataFrame({'id':test_id, 'Demanda_uni_equil': res})
submission.to_csv('submission-loadedmodel14nov.csv', index=False)

## Plot

In [None]:
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
import pandas as pd
import numpy as np
offline.init_notebook_mode()

# Compare predicted and actual demand on the SAME rows.
# `res` holds predictions for the Kaggle test file (which has no known
# answers) while `y_test` is the hold-out target taken from the training data,
# so plotting one against the other compared unrelated rows. The hold-out
# predictions `preds` line up row by row with `y_test` and are used instead.
# NOTE(review): assumes `preds` still holds the log-scale XGBoost predictions
# for the hold-out set from the training cells above - confirm.
window = slice(6998251, 6999251)
row_position = np.arange(window.start, window.stop)

# Both series are converted from log(demand + 1) back to demand units.
res2 = np.expm1(y_test)
predicted_demand = np.clip(np.expm1(preds[window]), 0, None)
actual_demand = res2.iloc[window]

predicted = go.Scatter(
    x=row_position,
    y=predicted_demand,
    name='predicted'
)
actual = go.Scatter(
    x=row_position,
    y=actual_demand,
    name='actual'
)

data = [predicted, actual]
offline.iplot(data, filename='bar-line')



In [None]:


# Predict the Kaggle test set with the in-memory XGBoost model and write a
# submission file.
fxg_test = xgb.DMatrix(test)
# best_ntree_limit equals best_iteration + 1; passing best_iteration dropped
# the best tree.
fold_preds = xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_ntree_limit)
# Assigned rather than added: `test_preds` still holds the reloaded model's
# predictions from the earlier prediction cell, so `+=` summed the outputs of
# two models. The model predicts log(demand + 1), so the values are converted
# back to demand units (and clamped at zero) before rounding.
test_preds = np.around(np.clip(np.expm1(fold_preds), 0, None), decimals = 1)

submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds})
submission.to_csv('submission-2.csv', index=False)

In [None]:
# Re-read the Kaggle test file, predict it with the in-memory XGBoost model
# and write a submission file.
test = pd.read_csv('../../data/kaggle/test.csv')
test_id = test['id']
test = test.drop(['id'],axis = 1)
test_preds = np.zeros(test.shape[0])
unlabeled_test = xgb.DMatrix(test)
# best_ntree_limit equals best_iteration + 1; passing best_iteration dropped
# the best tree.
fold_preds = xgclassifier.predict(unlabeled_test, ntree_limit=xgclassifier.best_ntree_limit)
# The model predicts log(demand + 1): convert back to demand units and clamp
# at zero before rounding, instead of rounding and submitting log-scale values.
test_preds += np.around(np.clip(np.expm1(fold_preds), 0, None), decimals = 1)

submission = pd.DataFrame({'id':test_id, 'Demanda_uni_equil': test_preds})
submission.to_csv('submissionTest2.csv', index=False)

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from ml_metrics import rmsle as metric
from sklearn import preprocessing as ppr
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# lives in sklearn.model_selection (as imported at the top of this notebook).
from sklearn.model_selection import train_test_split

def rmsle(true, labels):
    """Custom XGBoost evaluation metric: root mean squared logarithmic error.

    The parameter names are historical. XGBoost calls a ``feval`` as
    ``feval(predictions, dmatrix)``, so ``true`` receives the model
    predictions and ``labels`` the object whose ``get_label()`` returns the
    ground-truth targets.

    Parameters
    ----------
    true : array-like
        Predicted values, one per row.
    labels : object with ``get_label()``
        Evaluation data; ``get_label()`` must return the target values.

    Returns
    -------
    tuple
        ``('rmsle', value)`` where ``value`` is a Python float.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels
        (previously this case silently returned ``None``).
    """
    predicted = np.asarray(true, dtype=float)
    actual = np.asarray(labels.get_label(), dtype=float)
    if len(predicted) != len(actual):
        raise ValueError(
            "rmsle: got %d predictions for %d labels" % (len(predicted), len(actual))
        )
    # Negative values are clamped to zero so the logarithm stays defined.
    # np.maximum works on copies, so neither input array is modified.
    predicted = np.maximum(predicted, 0)
    actual = np.maximum(actual, 0)
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return 'rmsle', float(np.sqrt(np.mean(log_diff ** 2)))
        
# Stand-alone XGBoost baseline on the first 500000 training rows, with the
# raw (untransformed) demand as target.
train = pd.read_csv('../../data/kaggle/train.csv', nrows = 500000)
test = pd.read_csv('../../data/kaggle/test.csv')

test_id = test['id']
test = test.drop(['id'],axis = 1)
y = train['Demanda_uni_equil']
# Use only the feature columns that also exist in the test file.
X = train[test.columns.values]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4518)

params = {'objective': "reg:linear",
          'eta'      : 0.03,
          'max_depth': 8}
rounds = 200

xgb_train = xgb.DMatrix(X_train, label=y_train)
# The hold-out matrix carries its labels so it can be evaluated during training.
xgb_test = xgb.DMatrix(X_test, label=y_test)

# Early stopping follows the last watchlist entry. With only the training set
# listed it monitored the training error, which says nothing about
# generalisation, so the hold-out set is listed last.
watchlist = [(xgb_train, 'train'), (xgb_test, 'valid')]

xgb_reg = xgb.train(params, xgb_train, rounds, watchlist, feval = rmsle, early_stopping_rounds= 20, verbose_eval = 10)
# best_ntree_limit equals best_iteration + 1; passing best_iteration dropped
# the best tree, and a value of 0 means "use all trees".
best_ntree = xgb_reg.best_ntree_limit
# A squared-error objective can predict negative demand; clamp at zero so the
# logarithm in the metric stays defined and the submission stays valid.
preds = np.clip(xgb_reg.predict(xgb_test, ntree_limit=best_ntree), 0, None)

print('rmsle:', metric(y_test, preds))

test_preds = np.zeros(test.shape[0])
unlabeled_test = xgb.DMatrix(test)
fold_preds = np.around(np.clip(xgb_reg.predict(unlabeled_test, ntree_limit=best_ntree), 0, None), decimals = 1)
test_preds += fold_preds

submission = pd.DataFrame({'id':test_id, 'Demanda_uni_equil': test_preds})
submission.to_csv('submission.csv', index=False)