In [None]:
%matplotlib inline
import math
import os
import time

import h2o
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import sklearn.grid_search as gs
import xgboost as xgb
from scipy import stats
from sklearn import cross_validation
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor

In [None]:
# Load the train, store and test tables; paths are relative to the working directory.
train = pd.read_csv('Data/train.csv')
# `store` is indexed by its Store column so rows can be looked up with store.loc[<store id>].
store = pd.read_csv('Data/store.csv', index_col='Store')
test = pd.read_csv('Data/test.csv')

In [None]:
# Preview the first rows of the test table.
test.head()

# Useful functions

In [None]:
def ToWeight(y):
    """Return the RMSPE weights ``1 / y**2`` for every element of ``y``.

    Elements equal to 0 get weight 0 so that they do not contribute to the
    percentage error (and no division by zero happens).

    Parameters
    ----------
    y : array-like of numbers
        True target values.

    Returns
    -------
    numpy.ndarray of float, same shape as ``y``.
    """
    # Work in float: squaring a fixed-width integer array (e.g. int32) wraps
    # around silently for values above ~46340 and would yield wrong weights.
    values = np.asarray(y, dtype=float)
    w = np.zeros(values.shape, dtype=float)
    ind = values != 0
    w[ind] = 1. / (values[ind] ** 2)
    return w


def rmspe(yhat, y):
    """Root mean squared percentage error of predictions ``yhat`` against ``y``.

    Each squared error is scaled by ``ToWeight(y)`` (1 / y**2, or 0 where
    y == 0) before averaging, so rows with a zero target are ignored in the
    numerator but still counted in the mean.
    """
    weighted_sq_err = ToWeight(y) * (y - yhat) ** 2
    return np.sqrt(np.mean(weighted_sq_err))


def rmspe_xg(yhat, y):
    """RMSPE evaluation callback in the ``feval`` form used with ``xgb.train``.

    ``y`` must provide ``get_label()``; both the labels and ``yhat`` are taken
    to be ``log(sales + 1)`` and are mapped back with ``exp(.) - 1`` before the
    error is computed.

    Returns the pair ``("rmspe", value)``.
    """
    actual = np.exp(y.get_label()) - 1
    predicted = np.exp(yhat) - 1
    score = np.sqrt(np.mean(ToWeight(actual) * (actual - predicted) ** 2))
    return "rmspe", score

# Data modifying

## Some useful functions

In [None]:
def type_to_numeric(x):
    """Map a store-type letter to an integer code.

    "a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4. Any other value returns 0, the
    same fallback that ``assort_to_numeric`` uses, instead of silently
    falling through to ``None``.
    """
    for code, letter in enumerate(("a", "b", "c", "d"), 1):
        if x == letter:
            return code
    return 0
    
def assort_to_numeric(x):
    """Map an assortment letter to an integer code.

    "a" -> 1, "b" -> 2, "c" -> 3; every other value maps to 0.
    """
    for code, letter in enumerate(("a", "b", "c"), 1):
        if x == letter:
            return code
    return 0

## Process itself

In [None]:
# Keep only rows where the store was open (Open == 1).
# The commented-out variant additionally dropped rows with zero sales.
#train = train.loc[(train.Open == 1) & (train.Sales != 0)]
train = train.loc[train.Open == 1]

In [None]:
def _base_features(frame):
    """Build the basic feature table for a raw train/test frame.

    Copies Open, Promo, Store and DayOfWeek, looks up the store's
    CompetitionDistance in the global `store` table (missing distances
    become 0), and splits the 'YYYY-MM-DD' Date string into integer
    Day / Month / Year columns. The result keeps `frame`'s index.
    """
    out = pd.DataFrame()
    for column in ("Open", "Promo", "Store", "DayOfWeek"):
        out[column] = frame[column]
    # Real competition distance per row. This used to be the constant 1,
    # which made the 'CompDist' feature carry no information.
    out["CompDist"] = store.loc[frame.Store, "CompetitionDistance"].fillna(0).values
    # Date is 'YYYY-MM-DD': position 2 is the day, 1 the month, 0 the year.
    for column, position in (("Day", 2), ("Month", 1), ("Year", 0)):
        out[column] = frame.Date.apply(lambda d, i=position: int(d.split('-')[i]))
    return out


# Placeholder columns; they are overwritten with the real store attributes
# in the next cell.
train["StoreType"] = 1
train["Assortment"] = 1
train_final = _base_features(train)

test["StoreType"] = 1
test["Assortment"] = 1
test_final = _base_features(test)

In [None]:
# Attach store-level attributes to every row by looking up the row's Store id
# in the `store` table. Each column is assigned by name: selecting the columns
# with a set literal gives no ordering guarantee, so StoreType and Assortment
# could end up written into each other's column.
train["StoreType"] = store.loc[train.Store, "StoreType"].values
train["Assortment"] = store.loc[train.Store, "Assortment"].values
# Stores without a known competitor distance get 0.
train["CompDist"] = store.loc[train.Store, "CompetitionDistance"].fillna(0).values

test["StoreType"] = store.loc[test.Store, "StoreType"].values
test["Assortment"] = store.loc[test.Store, "Assortment"].values
test["CompDist"] = store.loc[test.Store, "CompetitionDistance"].fillna(0).values

In [None]:
# Histogram of the raw competition distance.
plt.hist(x = train.CompDist)
plt.show()

# Same histogram on a log2 scale; zeros are kept at 0 because log(0) is undefined.
plt.hist(x = train.CompDist.apply(lambda x: math.log(x, 2) if x != 0 else 0))
plt.show()

In [None]:
# Natural-log transform of the competition distance (the plot above used base 2).
# A distance of 0 is mapped to 0 instead of calling log(0).
train_final["LogCompDist"] = train.CompDist.apply(lambda x: math.log(x) if x != 0 else 0)
test_final["LogCompDist"] = test.CompDist.apply(lambda x: math.log(x) if x != 0 else 0)

Let's take a look at the distribution of the values we have to predict (`Sales`).

In [None]:
# Histogram of raw Sales in 100 bins.
plt.hist(train.Sales, bins = 100)
plt.show()

They aren't normally distributed, so let's try taking the logarithm of the values.

In [None]:
# Histogram of log2(Sales + 1); the +1 keeps zero sales finite.
plt.hist(train.Sales.apply(lambda x: math.log(x + 1, 2)), bins = 100)
plt.show()

In [None]:
# Carry the raw target into the feature table.
#train_final.loc[:, "SalesLog"] = train.Sales.apply(lambda x: math.log(x, 2))
train_final.loc[:, "Sales"] = train.Sales

In [None]:
def _add_indicators(target, source, prefix, letters):
    """Add one 0/1 column per letter to `target`.

    The column `prefix + LETTER` is 1 where `source` equals the lower-case
    letter and 0 elsewhere.
    """
    for letter in letters:
        target[prefix + letter.upper()] = (source == letter).astype(int)


# StoreA..StoreD come from StoreType; AssortA..AssortC come from Assortment.
# The Assort* columns were previously derived from StoreType by mistake,
# which made them exact duplicates of StoreA..StoreC.
_add_indicators(train_final, train.StoreType, "Store", "abcd")
_add_indicators(train_final, train.Assortment, "Assort", "abc")

_add_indicators(test_final, test.StoreType, "Store", "abcd")
_add_indicators(test_final, test.Assortment, "Assort", "abc")

In [None]:
# Preview the engineered training features.
train_final.head()

In [None]:
# Preview the engineered test features.
test_final.head()

As far as I know, there are some **NaN** values in a few columns of the test table; let's check.

In [None]:
# Number of training rows where the store was open but sold nothing.
len(train[(train.Sales == 0) & (train.Open == 1)])

In [None]:
# Number of training rows whose CompDist feature equals 0.
len(train.loc[train_final.CompDist == 0])

In [None]:
# Keep copies of the raw frames before `train`/`test` are rebound to the
# feature tables; test_tmp.Id is read again when the test CSV is written.
train_tmp = train.copy()
test_tmp = test.copy()

In [None]:
# Copy the school-holiday flag from each raw frame into its own feature table.
# The test features must come from `test`; taking them from `train` aligns on
# the training index and yields unrelated values or NaN.
train_final["SchoolHoliday"] = train.SchoolHoliday
test_final["SchoolHoliday"] = test.SchoolHoliday

In [None]:
def _week_of_year(date_string):
    """Return the week number (strftime '%W') of a 'YYYY-MM-DD' date string as an int."""
    parsed = time.strptime(date_string, "%Y-%m-%d")
    return int(time.strftime("%W", parsed))


train_final["YearWeek"] = train.Date.apply(_week_of_year)
test_final["YearWeek"] = test.Date.apply(_week_of_year)

In [None]:
# List the columns of the raw training frame.
train.columns

In [None]:
# From here on `train` / `test` refer to the engineered feature tables
# (copies, so train_final / test_final stay untouched).
train = train_final.copy()
test = test_final.copy()

In [None]:
# Check the column dtypes of the training features.
train.dtypes

In [None]:
# Replace every NaN in the test features with 0.
test = test.fillna(value = 0)

In [None]:
# Training target: natural log of (Sales + 1); the +1 keeps zero sales finite.
train["LogSales"] = np.log(train["Sales"] + 1)
# train.to_csv('Data/ready_data_cleared_zero_sales_1.csv', index=False)
# Persist the prepared training table without the index column.
train.to_csv('Data/ready_data_1.csv', index=False)

In [None]:
# Restore the Id column from the copy of the raw test frame, then persist
# the prepared test table without the index column.
test["Id"] = test_tmp.Id
test.to_csv('Data/ready_test_1.csv', index=False)

## Let's detect and remove outliers

In [None]:
# Reload the prepared training table written above.
train = pd.read_csv("Data/ready_data_1.csv")

In [None]:
# from sklearn.covariance import EllipticEnvelope

# Init of the resulting Dataframe
#result = pd.DataFrame([1])

# Example settings
def remove_outliers(train, outliers_fraction = 0.01):
    """Drop the most anomalous rows of every calendar month with a one-class SVM.

    For each month 1..12 that has rows, a ``svm.OneClassSVM`` is fitted on the
    month's ``LogSales`` values (the same column is used for both plot axes).
    Rows whose decision-function value is not above the
    ``100 * outliers_fraction`` percentile of that month's scores are dropped.
    One diagnostic figure is drawn per month and all are shown at the end.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``Month`` and ``LogSales``.
    outliers_fraction : float
        Fraction of each month's rows to treat as outliers.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all months, concatenated in month order. Empty (with
        the columns of ``train``) when no month has any rows.
    """
    clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1)

    # Square plotting window used for the decision-function contours.
    low_bound = -1
    up_bound = np.max(train.LogSales) + 7
    ticks = np.linspace(low_bound, up_bound, 500)
    xx, yy = np.meshgrid(ticks, ticks)
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Collect the per-month survivors and concatenate once at the end instead
    # of growing a frame with repeated append calls.
    kept = []
    for month in range(1, 13):
        tmp = train[train.Month == month]
        if len(tmp) == 0:
            # Nothing to fit for this month; fitting on an empty array fails.
            continue
        X = tmp.loc[:, ["LogSales", "LogSales"]].values

        pl.figure(figsize=(10, 5))
        pl.set_cmap(pl.cm.Blues_r)
        clf.fit(X)
        scores = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(scores, 100 * outliers_fraction)
        kept.append(tmp[scores > threshold])

        Z = clf.decision_function(grid_points).reshape(xx.shape)
        subplot = pl.subplot(1, 1, 1)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z,
                         levels=np.linspace(Z.min(), threshold, 7))
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:, 0], X[:, 1], c='black')
        subplot.axis('tight')
        subplot.legend(
            [a.collections[0], b],
            ['learned decision function', 'samples'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlim((low_bound, up_bound))
        subplot.set_ylim((low_bound, up_bound))

        pl.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)

    pl.show()
    if not kept:
        return train.iloc[0:0].copy()
    return pd.concat(kept)

In [None]:
# remove_outliers() returns the filtered frame; the `result` variable inside
# it is local, so the function has to be called here before writing the file.
result = remove_outliers(train)
result.to_csv("Data/ready_data_without_outliers_1.csv", index=False)

## XGBoost

In [None]:
# Load the outlier-free training table and the prepared test table.
# A trailing `test = First_level_train` was dropped: that name is not defined
# anywhere visible in this notebook and it discarded the test set just loaded.
train = pd.read_csv("Data/ready_data_without_outliers_1.csv")
test = pd.read_csv("Data/ready_test_1.csv")

In [None]:
def xgb_model_gen(train, features, n_est = 1100, lambda_ = 0.5, eta = 0.5):
    """Train an XGBoost regressor on ``LogSales`` and return the booster.

    1% of ``train`` is held out as the early-stopping validation set
    (100 rounds of patience, scored with ``rmspe_xg``).

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns and ``LogSales``.
    features : list of str
        Column names used as model inputs.
    n_est : int
        Maximum number of boosting rounds.
    lambda_ : float
        Value passed as the "lambda" booster parameter.
    eta : float
        Value passed as the "eta" booster parameter.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    seed = 213
    params = {"objective": "reg:linear",
              "booster" : "gbtree",
              "eta": eta,
              "max_depth": 8,
              "subsample": 0.8,
              "colsample_bytree": 0.7,
              "silent": 1,
              "seed" : seed,
              "lambda" : lambda_,
              "alpha" : 0.0
              }
    # Seed the hold-out split as well; otherwise early stopping sees a
    # different validation set on every call and the fixed model seed does
    # not make the result reproducible.
    X_train, X_test = cross_validation.train_test_split(train, test_size=0.01,
                                                        random_state=seed)
    dtrain = xgb.DMatrix(X_train[features], X_train["LogSales"])
    dvalid = xgb.DMatrix(X_test[features], X_test["LogSales"])
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    xgb_mod = xgb.train(params, dtrain, n_est, evals=watchlist,
                        early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)
    return xgb_mod

In [None]:
# Search over the number of boosting rounds. Every candidate is trained
# `n_repeats` times on a fresh random 99%/1% split and the hold-out RMSPE is
# averaged; the best configuration is reported at the end.
# NOTE(review): `features` is first assigned in a later cell of this notebook;
# make sure it is defined before running this one.
n_repeats = 4
best_res = 1
# Defined up front so the final print works even if no run scores below best_res.
best_commit = None
for n_est in [300, 700, 900, 1100]:
    error = 0
    for i in range(n_repeats):
        params = {"objective": "reg:linear",
                  "booster" : "gbtree",
                  "eta": 0.5, #0.025, 0.3
                  "max_depth": 8, #8
                  "subsample": 0.8, #0.7
                  "colsample_bytree": 0.7,
                  "silent": 1,
                  "seed" : 213,
                  "lambda" : 0.5,
                  "alpha" : 0.0
                  }

        print("Train a XGBoost model")
        X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)
        dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
        dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
        watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
        gbm = xgb.train(params, dtrain, n_est, evals=watchlist,
                        early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

        # Predictions are log(sales + 1); negative values are clamped to 0
        # (i.e. zero sales) before mapping back with exp(.) - 1.
        train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
        train_probs[train_probs < 0] = 0
        error = error + rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)

    error = error / n_repeats
    commit = "For used n_est = " + str(n_est) + "\nError = " + str(error)
    if (error < best_res):
        best_res = error
        best_commit = commit
    print(commit)
print("===============================")
print(best_commit)

For used eta = 0.3, n_est = 500
Error = 0.113927837496
For eta = 0.5 n_est = 700
Error ~ 0.106...
For used eta = 0.5, n_est = 700
Error = 0.0928495165322
For used subsample = 0.8, maxdepth = 8
Error = 0.0907461244311
For used alpha = 0.0, lambda = 0.5
Error = 0.0907181528501
For used n_est = 1100
Error = 0.0902104248579

In [None]:
# Input columns for the XGBoost model (this variant includes the raw CompDist
# and has no YearWeek / SchoolHoliday).
features = ['Open', 'Promo', 'Store', 'DayOfWeek', 'CompDist', 'Day',
       'Month', 'Year', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']

# print("augment features")
# build_features(features, train)
# build_features([], test)
# print(features)
def xgboost(train, test, features, seed = 213, eta = 0.3, n_est = 300):
    """Train XGBoost on log(Sales + 1) and build a submission frame for ``test``.

    1% of ``train`` is held out for early stopping (100 rounds of patience,
    scored with ``rmspe_xg``); the hold-out RMSPE is printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns and ``Sales``.
    test : pandas.DataFrame
        Must contain the ``features`` columns plus ``Id`` and ``Open``.
    features : list of str
        Column names used as model inputs.
    seed : int
        Seed for the booster and for the hold-out split.
    eta : float
        Value passed as the "eta" booster parameter.
    n_est : int
        Maximum number of boosting rounds.

    Returns
    -------
    pandas.DataFrame with the columns ``Id`` and ``Sales``.
    """
    params = {"objective": "reg:linear",
              "eta": eta,
              "max_depth": 8,
              "subsample": 0.7,
              "colsample_bytree": 0.7,
              "silent": 1,
              "seed" : seed
              }

    print("Train a XGBoost model")
    # The split is seeded too, so that a given `seed` reproduces the same
    # validation set and therefore the same early-stopping point.
    X_train, X_test = cross_validation.train_test_split(train, test_size=0.01,
                                                        random_state=seed)
    dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
    dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm = xgb.train(params, dtrain, n_est, evals=watchlist,
                    early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

    print("Validating")
    # Predictions are in log space; a value clamped to 0 maps to 0 sales.
    train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
    train_probs[train_probs < 0] = 0
    error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
    print('error', error)

    print("Make predictions on the test set")
    test_probs = gbm.predict(xgb.DMatrix(test[features]))
    test_probs[test_probs < 0] = 0
    # Closed stores sell nothing. `.values` makes the mask purely positional,
    # independent of the index of `test`.
    test_probs[(test.Open == 0).values] = 0
    submission = pd.DataFrame({"Id": test["Id"], "Sales": np.exp(test_probs) - 1})
    return submission
# submission.to_csv("Goose_XGBoost.csv", index=False)
# submission.Sales = submission.Sales * 0.985
# submission.to_csv("Goose_XGBoost_hint.csv", index=False)

## Random Forest (h2o)
Even after a native grid search it only scores **0.23619** with **400** estimators.   
I don't think it is worth continuing to experiment with it, because the results are not good at all.

In [None]:
# List the columns available in the current training table.
train.columns

In [None]:
# Random 99% / 1% split; both parts are written to tmp/ so that they can be
# uploaded to h2o. The split is not seeded, so it differs between runs.
X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)
X_train.to_csv("tmp/data_1.csv")
X_test.to_csv("tmp/test_1.csv")

In [None]:
# Manual grid search for the h2o random forest. Every configuration is trained
# on tmp/data_1.csv and scored on the hold-out rows in tmp/test_1.csv, which
# must be the files written from the current X_train / X_test.
test = pd.read_csv('Data/ready_test_1.csv')
features = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'CompDist', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
h2o.init(start_h2o = True)
h2o_train = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/tmp/data_1.csv")
# The hold-out frame does not depend on the hyper-parameters: upload it once.
h2o_test = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/tmp/test_1.csv")
bton = True
bdt = True
best_res = 1
# Defined up front so the final print works even if no run scores below best_res.
best_commit = None
# The closed-store mask must come from the hold-out rows being scored
# (X_test), not from the unrelated competition test table; `.values` makes it
# positional so it lines up with h2o_tmp's 0..n-1 index.
closed = (X_test.Open == 0).values
actual_sales = X_test.Sales.values
for mabs in [0.5, 1, 3, 5, 7]:
    for bc in [False, True]:
        for md in [10, 15, 20]:
            # Named rf_model (not h2o_rf) so the h2o_rf() function defined in
            # this notebook is never shadowed when this cell is re-run.
            rf_model = h2o.h2o.random_forest(x = h2o_train[features],
                                             y = h2o_train["LogSales"],
                                             training_frame=h2o_train,
                                             build_tree_one_node = bton,
                                             binomial_double_trees = bdt,
                                             balance_classes = bc,
                                             max_depth = md,
                                             seed = 213,
                                             max_after_balance_size = mabs,
                                             ntrees = 50)
            predictions = rf_model.predict(h2o_test[features])
            h2o_tmp = pd.DataFrame(np.arange(1, len(X_test) + 1), columns=["Id"])
            # Model output is log(sales + 1); map it back to sales.
            h2o_tmp["Sales1"] = np.exp(predictions.as_data_frame().values) - 1
            h2o_tmp.loc[closed, "Sales1"] = 0
            h2o_tmp.loc[h2o_tmp.Sales1 < 0, "Sales1"] = 0
            h2o_tmp["Sales2"] = actual_sales
            # Mean absolute error divided by the mean of the actual sales
            # (the printed label below says "Rmse").
            res = np.abs(h2o_tmp.Sales1 - h2o_tmp.Sales2).mean() / np.mean(X_test.Sales)
            commit = "For used mabs = " + str(mabs) + ", bc = " + str(bc) + ", md = " + str(md)
            if (res < best_res):
                best_res = res
                best_commit = commit
            print(commit)
            print("Rmse = " + str(res))
print("===============================")
print(best_commit)
print(best_res)

In [None]:
# test = pd.read_csv('Data/ready_test_1.csv')
# features = ['Open', 'Promo', 'Store', 'DayOfWeek', 'CompDist', 'Day',
#        'Month', 'Year', 'LogCompDist', 'StoreA', 'StoreB',
#        'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
def h2o_rf(train_path, test_path, features, default_seed = 213, n_est = 500):
    """Train an h2o random forest on ``LogSales`` and return a submission frame.

    Parameters
    ----------
    train_path : str
        CSV uploaded to h2o as the training frame; must contain ``features``
        and ``LogSales``.
    test_path : str
        CSV uploaded to h2o for scoring and also read with pandas for its
        ``Open`` column.
    features : list of str
        Column names used as model inputs.
    default_seed : int
        Seed passed to the forest.
    n_est : int
        Number of trees.

    Returns
    -------
    pandas.DataFrame with ``Id`` (1..n in file order) and ``Sales``.

    The fixed hyper-parameters are the ones noted by the author as giving a
    cross-validation score of about 0.265486806381.
    """
    h2o.init(start_h2o = True)
    training_frame = h2o.upload_file(path = train_path)

    forest = h2o.h2o.random_forest(x = training_frame[features],
                                   y = training_frame["LogSales"],
                                   training_frame=training_frame,
                                   build_tree_one_node = True,
                                   binomial_double_trees = True,
                                   balance_classes = False,
                                   max_depth = 20,
                                   seed = default_seed,
                                   max_after_balance_size = 3,
                                   ntrees = n_est)

    scoring_frame = h2o.upload_file(path = test_path)
    test = pd.read_csv(test_path)
    log_predictions = forest.predict(scoring_frame[features])
    h2o_submission = pd.DataFrame(np.arange(1, len(test) + 1), columns=["Id"])
    # Model output is log(sales + 1); map it back to sales.
    h2o_submission["Sales"] = np.exp(log_predictions.as_data_frame().values) - 1
    # Closed stores and negative predictions are both set to zero sales.
    h2o_submission.loc[test.Open == 0, "Sales"] = 0
    h2o_submission.loc[h2o_submission.Sales < 0, "Sales"] = 0
    return h2o_submission

In [None]:
def h2o_rf_model_gen(train, features, n_est = 500):
    """Train an h2o random forest on ``LogSales`` and return the model.

    ``train`` is written to ``tmp/train_tmp.csv`` (relative to the working
    directory) and that same file is uploaded to h2o.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns and ``LogSales``.
    features : list of str
        Column names used as model inputs.
    n_est : int
        Number of trees.

    Returns
    -------
    The model object returned by ``h2o.h2o.random_forest``.
    """
    h2o.init(start_h2o = True)
    # Upload exactly the file that was just written. The upload path used to
    # be a hard-coded absolute directory, which only matched the written file
    # when the notebook ran from that one location.
    csv_path = os.path.abspath("tmp/train_tmp.csv")
    train.to_csv(csv_path)
    h2o_train = h2o.upload_file(csv_path)
    model = h2o.h2o.random_forest(x = h2o_train[features],
                                  y = h2o_train["LogSales"],
                                  training_frame=h2o_train,
                                  build_tree_one_node = True,
                                  binomial_double_trees = True,
                                  balance_classes = False,
                                  max_depth = 20,
                                  seed = 213,
                                  max_after_balance_size = 3,
                                  ntrees = n_est)
    return model

## Deep learning (h2o)

In [None]:
# Reload the prepared tables and make a fresh, unseeded 99% / 1% split.
#train = pd.read_csv('Data/ready_data_cleared_zero_sales_1.csv')
train = pd.read_csv('Data/ready_data_1.csv')
test = pd.read_csv('Data/ready_test_1.csv')
X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)
# Variant of the training part without zero-sales rows.
train2 = X_train[X_train.Sales != 0]
train2.to_csv("tmp/data_zr_1.csv")
# Files uploaded to h2o by the cells below.
X_train.to_csv("tmp/data_1.csv")
X_test.to_csv("tmp/test_1.csv")

In [None]:
# Preview the reloaded training table.
train.head()

In [None]:
# Alternative input: the outlier-free table, split and written to the same
# tmp/ files as above (overwriting them).
# NOTE(review): no cell visible in this notebook writes
# ready_data_without_outliers_2.csv (only _1 and _3) - confirm the file exists.
train = pd.read_csv("Data/ready_data_without_outliers_2.csv")
X_train, X_test = cross_validation.train_test_split(train, test_size=0.01)
X_train.to_csv("tmp/data_1.csv")
X_test.to_csv("tmp/test_1.csv")

In [None]:
def h2o_dl_model_gen(train, features, hidden_ = [300, 300, 300, 300, 300], epochs_ = 120):
    """Train an h2o deep-learning regressor on ``LogSales`` and return the model.

    ``train`` is written to ``tmp/train_tmp.csv`` (relative to the working
    directory) and that same file is uploaded to h2o.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns and ``LogSales``.
    features : list of str
        Column names used as model inputs.
    hidden_ : list of int
        Hidden-layer sizes passed as ``hidden``. The default list is only
        read here, never modified.
    epochs_ : int
        Value passed as ``epochs``.

    Returns
    -------
    The model object returned by ``h2o.deeplearning``.
    """
    h2o.init(start_h2o = True)
    # Upload exactly the file that was just written. The upload path used to
    # be a hard-coded absolute directory, which only matched the written file
    # when the notebook ran from that one location.
    csv_path = os.path.abspath("tmp/train_tmp.csv")
    train.to_csv(csv_path)
    h2o_train = h2o.upload_file(csv_path)
    h2o_dl = h2o.deeplearning(x = h2o_train[features],
                              y = h2o_train["LogSales"],
                              training_frame=h2o_train,
                              activation = "Tanh",
                              epochs = epochs_,
                              hidden = hidden_,
                              seed = 213,
                              loss = "MeanSquare")
    return h2o_dl

In [None]:
# Manual search over hidden-layer layouts for the h2o deep-learning model.
# Each layout is trained on tmp/data_1.csv and scored on the hold-out rows in
# tmp/test_1.csv, which are expected to match the current X_train / X_test.
test = pd.read_csv('Data/ready_test_1.csv')
# features = ['Open', 'Promo', 'Store', 'DayOfWeek', 'CompDist', 'Day',
#        'Month', 'Year', 'LogCompDist', 'StoreA', 'StoreB',
#        'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
features = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
h2o.init(start_h2o = True)
# h2o_train_new = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/tmp/data_1.csv")
h2o_train = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/tmp/data_1.csv")
# h2o_train_zr = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/tmp/data_zr_1.csv")
h2o_test = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/tmp/test_1.csv")
# Scratch frame for the hold-out rows: Sales2 holds the actual sales,
# Sales1 (filled inside the loop) the predicted sales.
h2o_tmp = pd.DataFrame(np.arange(1, len(X_test) + 1), columns=["Id"])
h2o_tmp["Sales2"] = X_test.Sales.values
best_res = 100
best_commit = "Very bad \(>.<)/"

#searching for best activation function
#for active in ["Tanh", "TanhWithDropout", "Rectifier", "RectifierWithDropout", "Maxout", "MaxoutWithDropout"]:
active = "Tanh"
seed_ = 213
loss_ = "MeanSquare"

epochs_ = 120
#for epochs_ in [100, 1000, 5000, 10000]:
# iter_ is only reported in the log line; it stays 0 while the loop over
# training frames below is commented out.
iter_ = 0
# for h2o_train in [h2o_train_new, h2o_train_zr]:
#     iter_ += 1
#for hidden_ in [[300, 300, 300, 300, 300], [200, 200, 200, 200, 200, 200]]:
for hidden_ in [[50, 50, 50], [50, 50, 50, 50]]:
    h2o_dl = h2o.deeplearning(x = h2o_train[features], 
                              y = h2o_train["LogSales"],
                              training_frame=h2o_train, 
                              activation = active,
                              epochs = epochs_, 
                              hidden = hidden_,
                              seed = seed_,
                              loss = loss_)
    predictions = h2o_dl.predict(h2o_test[features])
    # Model output is log(sales + 1); map it back to sales.
    h2o_tmp["Sales1"] = np.exp(predictions.as_data_frame().values) - 1
    # Closed-store mask from the hold-out rows, re-indexed 0..n-1 so that it
    # lines up with h2o_tmp's index.
    indices = X_test.Open == 0
    indices.index = np.arange(len(X_test))
    h2o_tmp.loc[indices, "Sales1"] = 0
    # Negative predictions are set to zero sales.
    indices = h2o_tmp.Sales1 < 0
    h2o_tmp.loc[indices, "Sales1"] = 0
    # ((a - b) ** 2) ** 0.5 is |a - b|, so `res` is the mean absolute error
    # divided by the mean of the actual sales (printed below as "Rmse").
    res = sum(h2o_tmp.apply(lambda x: ((x.Sales1 - x.Sales2) ** 2) ** 0.5, axis = 1)) / len(h2o_tmp) / np.mean(X_test.Sales)
    commit = "For used hidden_ = " + str(hidden_) + ", iter = " + str(iter_)
    if (res < best_res):
        best_res = res
        best_commit = commit
    print commit
    print "Rmse = " + str(res)
print "==============================="
print best_commit
print best_res

For used epochs_ = 100, hidden_ = [50, 50, 50, 50]   
Rmse = 0.12393553862   
For used epochs_ = 100, hidden_ = [70, 70, 70, 70]   
Rmse = 0.108514525843   
For used epochs_ = 100, hidden_ = [30, 30, 30, 30, 30]   
Rmse = 0.150923171566

For used hidden_ = [100, 100, 100, 100, 100], iter = 0   
Rmse = 0.103013410371 (result with hint .17870)   
For used hidden_ = [300, 300, 300, 300, 300], iter = 0   
Rmse = 0.0912935961785

In [None]:
# Train an h2o deep-learning model with default settings on the full prepared
# training table and write a submission file.
test = pd.read_csv('Data/ready_test_1.csv')
features = ['Open', 'Promo', 'Store', 'DayOfWeek', 'CompDist', 'Day',
       'Month', 'Year', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
h2o.init(start_h2o = True)
h2o_train = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/Data/ready_data_1.csv")

# The model is kept in `dl_model`. Assigning it to `h2o_rf` would replace the
# h2o_rf() function defined earlier in this notebook, which a later cell
# still calls.
dl_model = h2o.deeplearning(x = h2o_train[features],
                            y = h2o_train["LogSales"],
                            training_frame=h2o_train,
                           )


h2o_test = h2o.upload_file(path = "/Users/Alimantu/Documents/Python/RossmanStoreSales/Data/ready_test_1.csv")
predictions = dl_model.predict(h2o_test[features])
h2o_submission = pd.DataFrame(np.arange(1, len(test) + 1), columns=["Id"])
# Model output is log(sales + 1); map it back to sales.
h2o_submission["Sales"] = np.exp(predictions.as_data_frame().values) - 1
# Closed stores and negative predictions are both set to zero sales.
indices = test.Open == 0
h2o_submission.loc[indices, "Sales"] = 0
indices = h2o_submission.Sales < 0
h2o_submission.loc[indices, "Sales"] = 0
h2o_submission.to_csv("Result/h2o_rf.csv", index=False)

# Sklearn RF

In [None]:
# Input columns for the sklearn models. The list was previously bound to the
# misspelled name `featuresures`, so the cells below silently used whatever
# `features` an earlier cell had left behind.
features = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
train = pd.read_csv("Data/ready_data_1.csv")

In [None]:
def skl_rf_model_gen(train, features, n_est = 500):
    """Fit a scikit-learn random forest on ``LogSales`` and return it.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns and ``LogSales``.
    features : list of str
        Column names used as model inputs.
    n_est : int
        Number of trees. This argument used to be ignored: the forest was
        always built with 20 trees.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    skl_rf = RandomForestRegressor(random_state=213
                                   , n_estimators = n_est
                                   , criterion = 'mse'
                                   , min_samples_split = 5
                                   , min_samples_leaf = 1
                                   , max_depth = 30
                                   , min_weight_fraction_leaf = 0.
                                   , max_leaf_nodes = None
                                   , bootstrap = True
                                   , oob_score = True
                                   , n_jobs = -1)
    skl_rf.fit(train.loc[:, features], train.loc[:, "LogSales"])
    return skl_rf

In [None]:
# 4-fold grid search for the sklearn random forest on LogSales. Only
# `oob_score` is varied at the moment; the commented-out entries are earlier
# candidate grids.
grid_search_cv = gs.GridSearchCV(RandomForestRegressor(random_state=213
                                                       , n_estimators = 20
                                                       , criterion = 'mse'
                                                       , min_samples_split = 5
                                                       , min_samples_leaf = 1
                                                       , max_depth = 30
                                                       , min_weight_fraction_leaf = 0.
                                                       , max_leaf_nodes = None
                                                       , bootstrap = True
                                                       , oob_score = True
                                                      ), {
#         'max_depth': (10, 30)
        'oob_score' : (True, False)
#        'loss' : ('ls', 'lad', 'huber', 'quantile')#,
#        'learning_rate' : (0.0001, 0.01, 0.1, 1, 10),
#        'n_estimators' : (10, 50, 100),
#        'n_estimators' : (10, 20)
#        'min_samples_leaf' : (1, 2),
#        'min_samples_split': (10, 20)
    },
                                 scoring='mean_squared_error', n_jobs = -1, cv=4, verbose=10)
grid_search_cv.fit(train.loc[:, features], train.loc[:, "LogSales"])
# The best score is negated before the square root is taken, then divided by
# the mean LogSales. The whole expression is wrapped in print(...) so it means
# the same whether print is a statement or a function.
print(((-grid_search_cv.best_score_) ** 0.5) / np.mean(train.loc[:, "LogSales"]))
grid_search_cv.best_estimator_

In [None]:
# Fit a 1000-tree sklearn random forest on the outlier-free data and write a
# submission file.
features = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
train = pd.read_csv("Data/ready_data_without_outliers_1.csv")
skl_rf = RandomForestRegressor(n_estimators = 1000, min_samples_leaf=1, n_jobs = -1, random_state=213, criterion='mse',
                              min_samples_split=20, verbose = True)
skl_rf.fit(train.loc[:, features], train.loc[:, "LogSales"])
test = pd.read_csv("Data/ready_test_1.csv")
# Ids are generated as 1..n in file order.
submission = pd.DataFrame(np.arange(1, len(test) + 1), columns=["Id"])
# Map log(sales + 1) back to sales, then scale every prediction by 0.985.
submission["Sales"] = (np.exp(skl_rf.predict(test.loc[:, features])) - 1) * 0.985
# Closed stores and negative predictions are both set to zero sales.
indices = test.Open == 0
submission.loc[indices, "Sales"] = 0
indices = submission.Sales < 0
submission.loc[indices, "Sales"] = 0
submission.to_csv("Result/Skl_rf_1000_wo_hint.csv", index = False)

# Sklearn GB

In [None]:
# Inputs for the sklearn gradient-boosting experiments.
# NOTE(review): ready_data_without_outliers_3.csv is written by the
# "Outliers detection" cell further down - run that cell first on a fresh kernel.
features = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
train = pd.read_csv("Data/ready_data_without_outliers_3.csv")

In [None]:
# max_depth = 10
# 4-fold grid search for sklearn gradient boosting on LogSales. Only
# `max_depth` is varied at the moment; the commented-out entries are earlier
# candidate grids.
grid_search_cv = gs.GridSearchCV(GradientBoostingRegressor(random_state=213
                                                           , loss = 'ls'
                                                           , n_estimators = 10
                                                           , min_samples_leaf = 1
                                                           , min_samples_split = 1
                                                           , learning_rate = 1
                                                           , max_leaf_nodes = -1
                                                          ), {
        'max_depth': (10, 30)
#        'loss' : ('ls', 'lad', 'huber', 'quantile')#,
#        'learning_rate' : (0.0001, 0.01, 0.1, 1, 10),
#        'n_estimators' : (10, 50, 100),
#        'n_estimators' : (10, 20)
#        'min_samples_leaf' : (1, 2),
#        'min_samples_split': (10, 20)
    },
                                 scoring='mean_squared_error', n_jobs = -1, cv=4, verbose=10)
grid_search_cv.fit(train.loc[:, features], train.loc[:, "LogSales"])
# The best score is negated before the square root is taken, then divided by
# the mean LogSales. The whole expression is wrapped in print(...) so it means
# the same whether print is a statement or a function.
print(((-grid_search_cv.best_score_) ** 0.5) / np.mean(train.loc[:, "LogSales"]))
grid_search_cv.best_estimator_

In [None]:
def skl_gb(train, test, features, n_est = 300):
    """Fit sklearn gradient boosting on ``LogSales`` and predict ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns and ``LogSales``.
    test : pandas.DataFrame
        Must contain the ``features`` columns.
    features : list of str
        Column names used as model inputs.
    n_est : int
        Number of boosting stages.

    Returns
    -------
    pandas.DataFrame with ``Id`` (1..n in row order) and the predicted
    ``LogSales`` (still in log space).
    """
    model = GradientBoostingRegressor(random_state=213,
                                      loss='ls',
                                      min_samples_leaf=1,
                                      min_samples_split=1,
                                      learning_rate=1,
                                      max_leaf_nodes=-1,
                                      n_estimators=n_est)
    model.fit(train.loc[:, features], train.loc[:, "LogSales"])
    row_ids = np.arange(1, len(test) + 1)
    result = pd.DataFrame(row_ids, columns=["Id"])
    result["LogSales"] = model.predict(test.loc[:, features])
    return result

In [None]:
def skl_gb_model_gen(train, features, n_est = 300):
    """Fit and return a scikit-learn gradient boosting regressor on "LogSales".

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain every column in ``features`` and "LogSales".
    features : list of str
        Column names used as model inputs.
    n_est : int, optional
        Number of boosting stages (default 300).

    Returns
    -------
    GradientBoostingRegressor
        The fitted estimator; its predictions are on the log-sales scale.
    """
    model = GradientBoostingRegressor(
        random_state=213,
        loss='ls',
        min_samples_leaf=1,
        # 2 is the smallest value a split can use; newer scikit-learn rejects 1.
        min_samples_split=2,
        learning_rate=1,
        # None is the documented "no limit" value (previously the -1 sentinel).
        max_leaf_nodes=None,
        n_estimators=n_est,
    )
    model.fit(train.loc[:, features], train.loc[:, "LogSales"])
    return model

# Outliers detection

In [None]:
# Build the outlier-free training file: load the prepared data, pass it through
# remove_outliers and save the result for the modelling cells that read it.
# NOTE(review): an earlier cell already reads ready_data_without_outliers_3.csv,
# so on a fresh kernel this cell has to run (or the file has to exist) first.
train = pd.read_csv("Data/ready_data_1.csv")
res = remove_outliers(train)
res.to_csv("Data/ready_data_without_outliers_3.csv", index=False)

# Here is where the pain begins

In [None]:
# Feature columns fed to the models below. A previous variant of this list also
# contained the raw 'CompDist' column; here only 'LogCompDist' is kept.
features = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']

In [None]:
import os  # local import: only used to resolve the project-relative data paths below

# H2O random forest submission. The data files are addressed relative to the
# notebook's working directory (like every pd.read_csv call in this notebook)
# and expanded to absolute paths, instead of hardcoding one user's home directory.
# NOTE(review): this cell trains on ..._outliers_2.csv while the other cells use
# ..._outliers_3.csv - confirm that this is intentional.
rf_submit = h2o_rf(os.path.abspath("Data/ready_data_without_outliers_2.csv"),
                   os.path.abspath("Data/ready_test_1.csv"), features)
# Scale the predictions by the same 0.985 factor used for every "*_hint" file.
rf_submit.Sales = 0.985 * rf_submit.Sales
rf_submit.to_csv("Result/h2o_rf_500_wo_hint.csv", index=False)

In [None]:
# XGBoost submission trained on the outlier-free data (500 rounds).
features = [
    'Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
    'Month', 'Year', 'SchoolHoliday', 'LogCompDist',
    'StoreA', 'StoreB', 'StoreC', 'StoreD',
    'AssortA', 'AssortB', 'AssortC',
]
train = pd.read_csv("Data/ready_data_without_outliers_3.csv")
test = pd.read_csv("Data/ready_test_1.csv")

xgboost_submit = xgboost(train, test, features, n_est=500)

# Plain predictions first, then the variant scaled by the 0.985 "hint" factor.
xgboost_submit.to_csv("Result/xgb_500_wo.csv", index=False)
xgboost_submit["Sales"] = xgboost_submit["Sales"] * 0.985
xgboost_submit.to_csv("Result/xgb_500_wo_hint.csv", index=False)

In [None]:
# scikit-learn gradient boosting submission trained on the outlier-free data.
features = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
train = pd.read_csv("Data/ready_data_without_outliers_3.csv")
test = pd.read_csv("Data/ready_test_1.csv")
skl_gb_submit = skl_gb(train = train, test = test, features = features)
# skl_gb returns "Id" and "LogSales" columns only, so nothing is written here:
# the conversion back to "Sales" and the CSV export happen in the next cell.

In [None]:
# Turn the log-scale predictions into a submission frame: undo the log
# transform with exp(x) - 1 and keep the ids produced by skl_gb.
skl_gb_submit_2 = pd.DataFrame({ "Id" :skl_gb_submit.Id.values})
skl_gb_submit_2["Sales"] = np.exp(skl_gb_submit["LogSales"]) - 1

# Rows flagged as closed in the test set get zero sales.
indices = test.Open == 0
skl_gb_submit_2.loc[indices, "Sales"] = 0
# Negative sales make no sense; floor them at zero.
indices = skl_gb_submit_2.Sales < 0
skl_gb_submit_2.loc[indices, "Sales"] = 0
skl_gb_submit_2.to_csv("Result/skl_gb_300_wo.csv", index=False)

# "Hint" variant: the same predictions scaled by 0.985.
skl_gb_submit_2.Sales = skl_gb_submit_2.Sales * 0.985
skl_gb_submit_2.to_csv("Result/skl_gb_300_wo_hint.csv", index=False)

# Only the last expression of a cell is displayed, so the preview lives here
# (the former mid-cell .head() calls produced no output).
skl_gb_submit_2.head()

# First ensemble
Here I'll use the first version of the data, without removing any outliers.

In [None]:
features_1 = ['Open', 'Store', 'DayOfWeek', 'YearWeek', 'Promo', 'Day',
       'Month', 'Year', 'SchoolHoliday', 'LogCompDist', 'StoreA', 'StoreB',
       'StoreC', 'StoreD', 'AssortA', 'AssortB', 'AssortC']
train = pd.read_csv("Data/ready_data_1.csv")
test = pd.read_csv("Data/ready_test_1.csv")

# Three disjoint folds for the stacked ensemble: 20% is held out as train_3 and
# the remaining 80% is halved into train_1 and train_2.
# The splits are seeded so that re-running the notebook reproduces the same
# folds (and therefore the same stacked models); 213 matches the seed used for
# the gradient boosting models above.
train_tmp, train_3 = cross_validation.train_test_split(train, test_size=0.2, random_state=213)
train_1, train_2 = cross_validation.train_test_split(train_tmp, test_size=0.5, random_state=213)

In [None]:
# First-level base learners, each fitted on train_1 with the plain feature set.
h2o_rf_model_1 = h2o_rf_model_gen(
    train=train_1, features=features_1, n_est=1000)
h2o_dl_model_1 = h2o_dl_model_gen(
    train=train_1, features=features_1,
    hidden_=[70] * 5, epochs_=100)
skl_gb_model_1 = skl_gb_model_gen(
    train=train_1, features=features_1, n_est=500)
skl_rf_model_1 = skl_rf_model_gen(
    train=train_1, features=features_1, n_est=1000)

In [None]:
# First-level XGBoost learner, fitted on the same fold as the other base models.
xgb_model_1 = xgb_model_gen(
    train=train_1,
    features=features_1,
    lambda_=1,
    n_est=150,
)

In [None]:
import os  # local import: only used to resolve the tmp file path below

# Add the five first-level predictions to train_2 as columns Pred1..Pred5.
# The H2O models are scored on an uploaded frame, so train_2 makes a round trip
# through a CSV file. The upload path is derived from the path that was just
# written, instead of a hardcoded /Users/... location, so both always refer to
# the same file regardless of where the project is checked out.
train_2_csv = 'tmp/data_ensamble_1_2.csv'
train_2.to_csv(train_2_csv)
train_2_h2o = h2o.upload_file(os.path.abspath(train_2_csv))

predictions = h2o_dl_model_1.predict(train_2_h2o[features_1])
train_2.loc[:, "Pred1"] = predictions.as_data_frame().values
predictions = h2o_rf_model_1.predict(train_2_h2o[features_1])
train_2.loc[:, "Pred2"] = predictions.as_data_frame().values
train_2.loc[:, "Pred3"] = skl_gb_model_1.predict(train_2.loc[:, features_1])
train_2.loc[:, "Pred4"] = skl_rf_model_1.predict(train_2.loc[:, features_1])
train_2.loc[:, "Pred5"] = xgb_model_1.predict(xgb.DMatrix(train_2[features_1]))
# Cast the XGBoost output to plain float like the other prediction columns.
train_2.Pred5 = train_2.Pred5.astype(float)

In [None]:
# Preview train_2, which now carries the first-level prediction columns.
train_2.head()

In [None]:
# Persist the augmented fold, then define the second-level inputs: the base
# features followed by the five first-level prediction columns.
train_2.to_csv('tmp/data_ensamble_2_ready.csv')
features_2 = features_1 + ['Pred2', 'Pred1', 'Pred3', 'Pred4', 'Pred5']

In [None]:
# Second-level learners, fitted on train_2 with the first-level predictions as
# additional inputs.
h2o_rf_model_2 = h2o_rf_model_gen(
    train=train_2, features=features_2, n_est=1000)
h2o_dl_model_2 = h2o_dl_model_gen(
    train=train_2, features=features_2,
    hidden_=[70] * 5, epochs_=100)
skl_gb_model_2 = skl_gb_model_gen(
    train=train_2, features=features_2, n_est=500)
skl_rf_model_2 = skl_rf_model_gen(
    train=train_2, features=features_2, n_est=1000)

In [None]:
# Second-level XGBoost learner (smaller step size, more rounds than level 1).
xgb_model_2 = xgb_model_gen(
    train=train_2,
    features=features_2,
    lambda_=1,
    eta=0.04,
    n_est=700,
)

In [None]:
import os  # local import: only used to resolve the tmp file paths below

# Score the held-out fold train_3 with both model levels. Each H2O upload path
# is derived from the relative path that was just written, instead of a
# hardcoded /Users/... location, so the uploaded file is always the one saved
# a line earlier, wherever the project lives.

# Predictions of the first level
train_3_csv = 'tmp/data_3_tmp.csv'
train_3.to_csv(train_3_csv)
train_3_h2o = h2o.upload_file(os.path.abspath(train_3_csv))
predictions = h2o_dl_model_1.predict(train_3_h2o[features_1])
train_3["Pred1"] = predictions.as_data_frame().values
predictions = h2o_rf_model_1.predict(train_3_h2o[features_1])
train_3["Pred2"] = predictions.as_data_frame().values
train_3["Pred3"] = skl_gb_model_1.predict(train_3.loc[:, features_1])
train_3["Pred4"] = skl_rf_model_1.predict(train_3.loc[:, features_1])
train_3["Pred5"] = xgb_model_1.predict(xgb.DMatrix(train_3[features_1]))
train_3.Pred5 = train_3.Pred5.astype(float)

# Predictions of the second level
# The frame is uploaded again because features_2 needs the Pred1..Pred5
# columns that were just added.
train_3_csv_2 = 'tmp/data_3_tmp_2.csv'
train_3.to_csv(train_3_csv_2)
train_3_h2o = h2o.upload_file(os.path.abspath(train_3_csv_2))
predictions = h2o_dl_model_2.predict(train_3_h2o[features_2])
train_3["Pred6"] = predictions.as_data_frame().values
predictions = h2o_rf_model_2.predict(train_3_h2o[features_2])
train_3["Pred7"] = predictions.as_data_frame().values
train_3["Pred8"] = skl_gb_model_2.predict(train_3.loc[:, features_2])
train_3["Pred9"] = skl_rf_model_2.predict(train_3.loc[:, features_2])
train_3["Pred10"] = xgb_model_2.predict(xgb.DMatrix(train_3[features_2]))
train_3.Pred10 = train_3.Pred10.astype(float)

In [None]:
# Preview train_3, which now carries the first- and second-level prediction columns.
train_3.head()

In [None]:
# Persist the fully augmented hold-out fold, then define the third-level
# inputs: everything the second level saw plus the second-level predictions.
train_3.to_csv('tmp/train_3_ready.csv')
features_3 = features_2 + ['Pred6', 'Pred7', 'Pred8', 'Pred9', 'Pred10']

# Only the XGBoost meta-learner is trained at this level; the H2O and
# scikit-learn third-level variants were left disabled by the author.
xgb_model_3 = xgb_model_gen(
    train=train_3,
    features=features_3,
    lambda_=1,
    eta=0.005,
    n_est=1500,
)

In [None]:
# Alternative third-level inputs: the base features plus the second-level
# predictions only (the first-level columns Pred1..Pred5 are left out).
features_3_new = features_1 + ['Pred6', 'Pred7', 'Pred8', 'Pred9', 'Pred10']

# Only the H2O random forest is fitted on this feature set here; the other
# third-level variants were left disabled by the author.
h2o_rf_model_3_new = h2o_rf_model_gen(
    train=train_3,
    features=features_3_new,
    n_est=300,
)

In [None]:
# XGBoost meta-learner on the reduced third-level feature set, with a very
# small step size and many rounds.
xgb_model_3_new_2 = xgb_model_gen(
    train=train_3,
    features=features_3_new,
    lambda_=1,
    eta=0.001,
    n_est=7000,
)

In [None]:
import os  # local import: only used to resolve the tmp file paths below

# Run the whole stack on the test set and write three submission variants.
# Each H2O upload path is derived from the relative path that was just written,
# instead of a hardcoded /Users/... location, so the uploaded file is always
# the one saved a line earlier, wherever the project lives.
test = pd.read_csv('Data/ready_test_1.csv')

# Predictions of the first level
test_csv = 'tmp/test_tmp.csv'
test.to_csv(test_csv)
test_h2o = h2o.upload_file(os.path.abspath(test_csv))
predictions = h2o_dl_model_1.predict(test_h2o[features_1])
test["Pred1"] = predictions.as_data_frame().values
predictions = h2o_rf_model_1.predict(test_h2o[features_1])
test["Pred2"] = predictions.as_data_frame().values
test["Pred3"] = skl_gb_model_1.predict(test.loc[:, features_1])
test["Pred4"] = skl_rf_model_1.predict(test.loc[:, features_1])
test["Pred5"] = xgb_model_1.predict(xgb.DMatrix(test[features_1]))
test.Pred5 = test.Pred5.astype(float)

# Predictions of the second level
# Uploaded again because features_2 needs the Pred1..Pred5 columns just added.
test_csv_2 = 'tmp/test_tmp_2.csv'
test.to_csv(test_csv_2)
test_h2o = h2o.upload_file(os.path.abspath(test_csv_2))
predictions = h2o_dl_model_2.predict(test_h2o[features_2])
test["Pred6"] = predictions.as_data_frame().values
predictions = h2o_rf_model_2.predict(test_h2o[features_2])
test["Pred7"] = predictions.as_data_frame().values
test["Pred8"] = skl_gb_model_2.predict(test.loc[:, features_2])
test["Pred9"] = skl_rf_model_2.predict(test.loc[:, features_2])
test["Pred10"] = xgb_model_2.predict(xgb.DMatrix(test[features_2]))
test.Pred10 = test.Pred10.astype(float)

# Final predict
# Ids are positional (1-based); exp(x) - 1 undoes the log transform of the target.
result = pd.DataFrame(np.arange(1, len(test) + 1), columns=["Id"])
result["Sales"] = np.exp(xgb_model_3.predict(xgb.DMatrix(test[features_3]))) - 1
# Variant 1: raw third-level predictions.
result.to_csv("Result/ensable_2.csv", index=False)
# Variant 2: scaled by the 0.985 "hint" factor.
result.Sales = result.Sales * 0.985
result.to_csv("Result/ensable_2_hint.csv", index=False)
# Variant 3: additionally force closed stores and negative predictions to zero.
indices = test.Open == 0
result.loc[indices, "Sales"] = 0
indices = result.Sales < 0
result.loc[indices, "Sales"] = 0
result.to_csv("Result/ensable_2_corr_hint.csv", index=False)

In [None]:
import os  # local import: only used to resolve the tmp file path below

# Submission from the third-level H2O random forest. Reuses the `test` frame
# (with Pred1..Pred10) and the `result` frame built in the previous cell.
# The upload path is derived from the path just written, instead of a
# hardcoded /Users/... location.
test_csv_3 = 'tmp/test_tmp_3.csv'
test.to_csv(test_csv_3)
test_h2o = h2o.upload_file(os.path.abspath(test_csv_3))
predictions = h2o_rf_model_3_new.predict(test_h2o[features_3_new])
# exp(x) - 1 undoes the log transform of the target.
result["Sales"] = np.exp(predictions.as_data_frame().values) - 1
# Closed stores and negative predictions are forced to zero.
indices = test.Open == 0
result.loc[indices, "Sales"] = 0
indices = result.Sales < 0
result.loc[indices, "Sales"] = 0
# Scale by the 0.985 "hint" factor before writing.
result.Sales = result.Sales * 0.985
result.to_csv("Result/ensable_sum_h2o_rf_hint.csv", index=False)

In [None]:
# Submission straight from the second-level XGBoost model. Reuses the `test`
# and `result` frames from the cells above.
level_2_matrix = xgb.DMatrix(test[features_2])
result["Sales"] = np.exp(xgb_model_2.predict(level_2_matrix)) - 1
result["Sales"] = result["Sales"] * 0.985

# Closed stores first, then any negative prediction, are set to zero.
indices = test["Open"] == 0
result.loc[indices, "Sales"] = 0
indices = result["Sales"] < 0
result.loc[indices, "Sales"] = 0

result.to_csv("Result/ensable_1_summ_xgb_hint.csv", index=False)

In [None]:
# Load an existing submission file; it is corrected for closed stores and
# negative values two cells below.
# NOTE(review): presumably produced outside this notebook - nothing visible here writes it.
res = pd.read_csv('tmp/Goose_hint.csv')

In [None]:
# Preview the test frame whose Open column drives the correction in the next cell.
test.head()

In [None]:
# Apply the usual post-processing to the loaded submission: zero sales for
# closed stores, then zero out any negative value, and save the result.
# The mask built from `test` is applied to `res` by index label.
indices = test["Open"] == 0
res.loc[indices, "Sales"] = 0

indices = res["Sales"] < 0
res.loc[indices, "Sales"] = 0

res.to_csv('Result/Goose_fixed_hint.csv', index=False)