In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model, metrics, ensemble, svm
from sklearn import model_selection
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
def read_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the parsed table.
    """
    return pd.read_csv(filename).copy()


In [3]:
# Load the training features, training targets and test features as
# DataFrames, then expose each of them as a plain NumPy array as well.
X_train_path, y_train_path, X_test_path = "X_train.csv", "y_train.csv", "X_test.csv"

X_train_data, y_train_data, X_test_data = [
    read_file(csv_path) for csv_path in (X_train_path, y_train_path, X_test_path)
]

X_train, X_test = X_train_data.values, X_test_data.values
# Flatten the target table into a 1-D vector.
y_train = y_train_data.values.ravel()

for label, array in (
    ("X_train shape = ", X_train),
    ("y_train shape = ", y_train),
    ("X_test = ", X_test),
):
    print(label, array.shape)

X_train shape =  (56490, 26)
y_train shape =  (56490,)
X_test =  (37661, 26)


In [4]:
# preprocessing
# drop same_val feature
def select_same_val_features(X_train_data):
    """Return the names of columns that hold a single distinct value.

    Parameters
    ----------
    X_train_data : pandas.DataFrame
        Table whose columns are inspected one by one.

    Returns
    -------
    list
        Column labels, in column order, for which ``np.unique`` finds
        exactly one distinct value.
    """
    return [
        column
        for column in X_train_data.columns
        if np.unique(X_train_data[column]).size == 1
    ]

In [5]:
# Find the features that are constant over the training set and remove those
# same columns from both the training and the test tables.
same_val_features = select_same_val_features(X_train_data)

X_train_data_drop = X_train_data.drop(columns=same_val_features)
X_train_drop = X_train_data_drop.values
print("X_train_drop shape = ", X_train_drop.shape)

X_test_data_drop = X_test_data.drop(columns=same_val_features)
X_test_drop = X_test_data_drop.values
print("X_test_drop shape = ", X_test_drop.shape)

print("Number of same value features = ", len(same_val_features))
print("Same Value Features = ", same_val_features)


X_train_drop shape =  (56490, 22)
X_test_drop shape =  (37661, 22)
Number of same value features =  4
Same Value Features =  ['EnterQueueTimestamp_year', 'EnterQueueTimestamp_month', 'EnterQueueTimestamp_day', 'party_size']


In [27]:
def _clip_negative(predictions):
    """Return ``predictions`` as an array with every negative value replaced by 0."""
    return np.maximum(np.asarray(predictions), 0)


def cross_validation(X_train, y_train, num_folds, clf,
                     shuffle=False, random_state=None, verbose=True):
    """Measure train/validation MSE of a regressor with K-fold cross-validation.

    For every fold the estimator is fit on the training split, predictions on
    both splits are clipped so that no value is below 0, and the mean squared
    error against the true targets is recorded.

    Parameters
    ----------
    X_train : array supporting integer-array row indexing (e.g. ``np.ndarray``)
        Feature matrix.
    y_train : array supporting integer-array indexing
        Target vector aligned with the rows of ``X_train``.
    num_folds : int
        Number of folds passed to ``KFold``.
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The same instance is refit in every fold, so on return it is left
        fitted on the last fold's training split.
    shuffle : bool, default False
        Whether ``KFold`` shuffles the rows before splitting. The default
        keeps the original contiguous, order-dependent folds.
    random_state : int or None, default None
        Seed for the shuffling; ignored unless ``shuffle`` is True.
    verbose : bool, default True
        When True, print the START/END markers and the fold indices.

    Returns
    -------
    (list, list)
        Per-fold training MSEs and per-fold validation MSEs, in fold order.
    """
    kf = model_selection.KFold(
        n_splits=num_folds,
        shuffle=shuffle,
        # scikit-learn raises if random_state is set while shuffle is False,
        # so the seed is only forwarded when it is actually used.
        random_state=random_state if shuffle else None,
    )
    train_mse_ls = []
    valid_mse_ls = []
    for train_index, valid_index in kf.split(X_train):
        if verbose:
            print("...START...")
            print("TRAIN: ", train_index, "VALIDATION: ", valid_index)
        X_train_cv, X_valid_cv = X_train[train_index], X_train[valid_index]
        y_train_cv, y_valid_cv = y_train[train_index], y_train[valid_index]

        clf.fit(X_train_cv, y_train_cv)
        # Negative predictions are floored at 0 before scoring.
        y_train_cv_pred = _clip_negative(clf.predict(X_train_cv))
        y_valid_cv_pred = _clip_negative(clf.predict(X_valid_cv))

        train_mse_ls.append(metrics.mean_squared_error(y_train_cv, y_train_cv_pred))
        valid_mse_ls.append(metrics.mean_squared_error(y_valid_cv, y_valid_cv_pred))
        if verbose:
            print("...END...")
    return train_mse_ls, valid_mse_ls

In [28]:
# CV experiment: 150 trees, depth 9, learning rate 0.05, gamma 0.1.
# (An SVR baseline can be scored the same way by passing svm.SVR() and X_train.)
clf_xgb = XGBRegressor(
    n_estimators=150,
    scale_pos_weight=1,
    learning_rate=0.05,
    max_depth=9,
    gamma=0.1,
)
num_folds = 3
train_mse_ls, valid_mse_ls = cross_validation(
    X_train=X_train_drop, y_train=y_train, num_folds=num_folds, clf=clf_xgb
)
print("train_mse_ls: ", train_mse_ls)
print("valid_mse_ls: ", valid_mse_ls)

...START...
TRAIN:  [18830 18831 18832 ... 56487 56488 56489] VALIDATION:  [    0     1     2 ... 18827 18828 18829]
...END...
...START...
TRAIN:  [    0     1     2 ... 56487 56488 56489] VALIDATION:  [18830 18831 18832 ... 37657 37658 37659]
...END...
...START...
TRAIN:  [    0     1     2 ... 37657 37658 37659] VALIDATION:  [37660 37661 37662 ... 56487 56488 56489]
...END...
train_mse_ls:  [559.9250693455259, 693.660314403961, 840.1398341667862]
valid_mse_ls:  [2552.805180999344, 1698.9049293187445, 1593.022756202818]


In [29]:
# CV experiment: 100 trees, depth 9, learning rate 0.05, gamma 0.1.
# (An SVR baseline can be scored the same way by passing svm.SVR() and X_train.)
clf_xgb = XGBRegressor(
    n_estimators=100,
    scale_pos_weight=1,
    learning_rate=0.05,
    max_depth=9,
    gamma=0.1,
)
num_folds = 3
train_mse_ls, valid_mse_ls = cross_validation(
    X_train=X_train_drop, y_train=y_train, num_folds=num_folds, clf=clf_xgb
)
print("train_mse_ls: ", train_mse_ls)
print("valid_mse_ls: ", valid_mse_ls)

...START...
TRAIN:  [18830 18831 18832 ... 56487 56488 56489] VALIDATION:  [    0     1     2 ... 18827 18828 18829]
...END...
...START...
TRAIN:  [    0     1     2 ... 56487 56488 56489] VALIDATION:  [18830 18831 18832 ... 37657 37658 37659]
...END...
...START...
TRAIN:  [    0     1     2 ... 37657 37658 37659] VALIDATION:  [37660 37661 37662 ... 56487 56488 56489]
...END...
train_mse_ls:  [644.9328506173135, 767.956621200982, 936.2239656322879]
valid_mse_ls:  [2594.872704474678, 1679.8380947648252, 1532.859433265189]


In [30]:
# CV experiment: 50 trees, depth 9, learning rate 0.05, gamma 0.2.
# (An SVR baseline can be scored the same way by passing svm.SVR() and X_train.)
clf_xgb = XGBRegressor(
    n_estimators=50,
    scale_pos_weight=1,
    learning_rate=0.05,
    max_depth=9,
    gamma=0.2,
)
num_folds = 3
train_mse_ls, valid_mse_ls = cross_validation(
    X_train=X_train_drop, y_train=y_train, num_folds=num_folds, clf=clf_xgb
)
print("train_mse_ls: ", train_mse_ls)
print("valid_mse_ls: ", valid_mse_ls)

...START...
TRAIN:  [18830 18831 18832 ... 56487 56488 56489] VALIDATION:  [    0     1     2 ... 18827 18828 18829]
...END...
...START...
TRAIN:  [    0     1     2 ... 56487 56488 56489] VALIDATION:  [18830 18831 18832 ... 37657 37658 37659]
...END...
...START...
TRAIN:  [    0     1     2 ... 37657 37658 37659] VALIDATION:  [37660 37661 37662 ... 56487 56488 56489]
...END...
train_mse_ls:  [861.6924683089522, 1019.8557037325935, 1230.180519415691]
valid_mse_ls:  [2885.6060241144887, 1724.390159267831, 1299.2558128419853]


In [31]:
# CV experiment: 100 trees, depth 8, learning rate 0.05, gamma 0.3, with
# missing=-1 passed to XGBoost (presumably -1 marks absent values in this
# dataset -- confirm against the feature encoding).
# (An SVR baseline can be scored the same way by passing svm.SVR() and X_train.)
clf_xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=8,
    gamma=0.3,
    missing=-1,
)
num_folds = 3
train_mse_ls, valid_mse_ls = cross_validation(
    X_train=X_train_drop, y_train=y_train, num_folds=num_folds, clf=clf_xgb
)
print("train_mse_ls: ", train_mse_ls)
print("valid_mse_ls: ", valid_mse_ls)

...START...
TRAIN:  [18830 18831 18832 ... 56487 56488 56489] VALIDATION:  [    0     1     2 ... 18827 18828 18829]
...END...
...START...
TRAIN:  [    0     1     2 ... 56487 56488 56489] VALIDATION:  [18830 18831 18832 ... 37657 37658 37659]
...END...
...START...
TRAIN:  [    0     1     2 ... 37657 37658 37659] VALIDATION:  [37660 37661 37662 ... 56487 56488 56489]
...END...
train_mse_ls:  [695.5376312092498, 859.2628731063197, 1010.8574991063492]
valid_mse_ls:  [2621.8701666072025, 1656.2020963861294, 1480.4010649847783]


In [32]:
# CV experiment: 100 trees, depth 9, learning rate 0.05, gamma 0.1, with
# missing=-1 passed to XGBoost (presumably -1 marks absent values in this
# dataset -- confirm against the feature encoding).
# (An SVR baseline can be scored the same way by passing svm.SVR() and X_train.)
clf_xgb = XGBRegressor(
    n_estimators=100,
    scale_pos_weight=1,
    learning_rate=0.05,
    max_depth=9,
    gamma=0.1,
    missing=-1,
)
num_folds = 3
train_mse_ls, valid_mse_ls = cross_validation(
    X_train=X_train_drop, y_train=y_train, num_folds=num_folds, clf=clf_xgb
)
print("train_mse_ls: ", train_mse_ls)
print("valid_mse_ls: ", valid_mse_ls)

...START...
TRAIN:  [18830 18831 18832 ... 56487 56488 56489] VALIDATION:  [    0     1     2 ... 18827 18828 18829]
...END...
...START...
TRAIN:  [    0     1     2 ... 56487 56488 56489] VALIDATION:  [18830 18831 18832 ... 37657 37658 37659]
...END...
...START...
TRAIN:  [    0     1     2 ... 37657 37658 37659] VALIDATION:  [37660 37661 37662 ... 56487 56488 56489]
...END...
train_mse_ls:  [613.4631889964663, 759.469481410852, 920.642671391934]
valid_mse_ls:  [2608.044219470907, 1712.7809339430794, 1500.8437273240029]


In [33]:
# Fit the final model on the whole training set (constant features removed)
# and predict the test set; `number` identifies this run.
number = 1
clf_xgb = XGBRegressor(
    n_estimators=100,
    scale_pos_weight=1,
    learning_rate=0.05,
    max_depth=9,
    gamma=0.1,
)
clf_xgb.fit(X_train_drop, y_train)
y_test_pred_raw = clf_xgb.predict(X_test_drop)
# Replace every negative prediction with 0; other values pass through as-is.
y_test_pred = [0 if val < 0 else val for val in y_test_pred_raw]


In [34]:
# Save the clipped test predictions to "<number>.csv" using pandas' default
# CSV layout (row index and header included).
pd.DataFrame(np.asarray(y_test_pred)).to_csv(f"{number}.csv")