In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model, metrics, ensemble, svm
from sklearn import model_selection
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
def read_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Args:
        filename: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        A copy of the DataFrame parsed from ``filename``.
    """
    return pd.read_csv(filename).copy()


In [3]:
# Load the train features, train targets and test features from CSV.
# The DataFrames are kept as *_data; X_train / y_train / X_test are their
# NumPy counterparts (y_train flattened to one dimension).
X_train_path, y_train_path, X_test_path = "X_train.csv", "y_train.csv", "X_test.csv"

X_train_data, y_train_data, X_test_data = (
    read_file(csv_path)
    for csv_path in (X_train_path, y_train_path, X_test_path)
)

X_train = X_train_data.values
y_train = y_train_data.values.ravel()
X_test = X_test_data.values

# Quick sanity check of the array shapes.
print("X_train shape = ", X_train.shape)
print("y_train shape = ", y_train.shape)
print("X_test = ", X_test.shape)

X_train shape =  (56490, 26)
y_train shape =  (56490,)
X_test =  (37661, 26)


In [4]:
# preprocessing
# drop same_val feature
def select_same_val_features(X_train_data):
    """Find the columns that hold a single distinct value.

    Args:
        X_train_data: DataFrame whose columns are inspected.

    Returns:
        List of column names, in column order, for which ``np.unique``
        reports exactly one distinct value.
    """
    return [
        column
        for column in X_train_data.columns
        if np.unique(X_train_data[column]).size == 1
    ]

In [5]:
# handle missing value
def missing_to_NaN(X_train_data):
    """Replace the ``-1`` missing-value sentinel with NaN, in place.

    Only the frame passed as the argument is modified. Earlier revisions also
    rewrote the notebook-global ``X_test_data`` on every call, which failed
    when that global was absent and processed the test frame twice when the
    function was called once per frame. Call this once for each frame that
    needs the conversion.

    Args:
        X_train_data: DataFrame to convert in place. The parameter name is
            historical; any frame (train or test) may be passed.

    Returns:
        None. Columns containing ``-1`` are rewritten with NaN in those
        positions (integer columns become float as a consequence).
    """
    for feature in X_train_data.columns:
        column = X_train_data[feature]
        # mask() swaps the entries where the condition holds for NaN.
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
        X_train_data[feature] = column.mask(column == -1, np.nan)

In [6]:
# features = []

In [7]:
# missing values
# missing_to_NaN(X_train_data)
# missing_to_NaN(X_test_data)
# drop same_val feature
# Constant columns are detected on the training frame only and then removed
# from both the training and the test frame.
same_val_features = select_same_val_features(X_train_data)

# --- training set ---
X_train_data_drop = X_train_data.drop(columns=same_val_features)
X_train_drop = X_train_data_drop.values
print("X_train_drop shape = ", X_train_drop.shape)

# "_ps" variants additionally leave out entering_host and time_to_confirm.
X_train_data_drop_ps = X_train_data_drop.drop(
    columns=["entering_host", "time_to_confirm"]
)
X_train_drop_ps = X_train_data_drop_ps.values
print("X_train_drop_ps shape = ", X_train_drop_ps.shape)

# --- test set (same columns removed as for training) ---
X_test_data_drop = X_test_data.drop(columns=same_val_features)
X_test_drop = X_test_data_drop.values
print("X_test_drop shape = ", X_test_drop.shape)

X_test_data_drop_ps = X_test_data_drop.drop(
    columns=["entering_host", "time_to_confirm"]
)
X_test_drop_ps = X_test_data_drop_ps.values
print("X_test_drop_ps shape = ", X_test_drop_ps.shape)

# Report which columns were found to be constant.
print("Number of same value features = ", len(same_val_features))
print("Same Value Features = ", same_val_features)


X_train_drop shape =  (56490, 22)
X_train_drop_ps shape =  (56490, 20)
X_test_drop shape =  (37661, 22)
X_test_drop_ps shape =  (37661, 20)
Number of same value features =  4
Same Value Features =  ['EnterQueueTimestamp_year', 'EnterQueueTimestamp_month', 'EnterQueueTimestamp_day', 'party_size']


In [8]:
def cross_validation(X_train, y_train, num_folds, clf):
    """Run K-fold cross-validation and collect per-fold train/validation MSE.

    Folds are produced by ``model_selection.KFold(n_splits=num_folds)``; no
    other KFold option is passed. For every fold ``clf`` is fitted on the
    training part and used to predict both parts. Negative predictions are
    clamped to 0 before the mean squared error is computed.

    Args:
        X_train: Feature matrix with one row per sample. Any array-like that
            ``np.asarray`` can convert (ndarray, DataFrame, ...) is accepted.
        y_train: Targets, one entry per row of ``X_train`` (array-like).
        num_folds: Number of folds handed to ``KFold``.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted on every fold, so after the call it holds the fit of the
            last fold.

    Returns:
        Tuple ``(train_mse_ls, valid_mse_ls)``: two lists holding one MSE per
        fold, in fold order.
    """
    # Positional row indexing below needs plain arrays; a DataFrame would
    # interpret the integer index array as column labels.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    kf = model_selection.KFold(n_splits=num_folds)
    train_mse_ls = []
    valid_mse_ls = []
    for train_index, valid_index in kf.split(X_train):
        X_train_cv, X_valid_cv = X_train[train_index], X_train[valid_index]
        y_train_cv, y_valid_cv = y_train[train_index], y_train[valid_index]

        clf.fit(X_train_cv, y_train_cv)

        # Clamp negative predictions to zero (vectorised; NaN is passed through
        # unchanged, exactly as an element-wise "val < 0" check would do).
        y_train_cv_pred = np.maximum(np.asarray(clf.predict(X_train_cv)), 0)
        y_valid_cv_pred = np.maximum(np.asarray(clf.predict(X_valid_cv)), 0)

        train_mse_ls.append(metrics.mean_squared_error(y_train_cv, y_train_cv_pred))
        valid_mse_ls.append(metrics.mean_squared_error(y_valid_cv, y_valid_cv_pred))
    return train_mse_ls, valid_mse_ls

In [12]:
# Sweep n_estimators with the other XGBoost settings held fixed and print the
# per-fold train / validation MSE for each value.
num_estimators = [10, 20, 30, 50, 100, 120, 150]
num_folds = 5
for n in num_estimators:
    clf_xgb = XGBRegressor(
        n_estimators=n,
        learning_rate=0.1,
        max_depth=9,
        gamma=0.1,
        scale_pos_weight=1,
    )
    train_mse_ls, valid_mse_ls = cross_validation(
        X_train_drop, y_train, num_folds, clf_xgb
    )
    print("----------------------------")
    print("CURRENT n_estimators = ", n)
    print("train_mse_ls: ", train_mse_ls)
    print("valid_mse_ls: ", valid_mse_ls)
    print("----------------------------")

----------------------------
CURRENT n_estimators =  10
train_mse_ls:  [2275.2910087020114, 2157.4203788918003, 2328.062652689648, 2482.756023905037, 2668.5192381256643]
valid_mse_ls:  [3424.105623468512, 4587.6670251919395, 3094.612611513153, 1502.4322924449034, 1449.101738581878]
----------------------------
----------------------------
CURRENT n_estimators =  20
train_mse_ls:  [1190.0096874320736, 1195.1541080644702, 1264.1269059822755, 1305.6386194410288, 1397.9359231164055]
valid_mse_ls:  [2369.119999121446, 2762.7867336554636, 1834.5991671981076, 1155.9275613413315, 1169.0285580305015]
----------------------------
----------------------------
CURRENT n_estimators =  30
train_mse_ls:  [934.4213820980466, 982.6975991511996, 1030.5582936347776, 1032.5352743640055, 1127.3170986155396]
valid_mse_ls:  [2186.008997495155, 2326.805686849829, 1558.1842626148796, 1243.6708430057167, 1270.9896588132583]
----------------------------
----------------------------
CURRENT n_estimators =  50
tra

In [10]:
# Sweep max_depth with n_estimators=50 and the remaining settings fixed;
# per-fold train / validation MSE is printed for each depth.
max_depths = [3, 5, 7, 9]
num_folds = 5
for m in max_depths:
    clf_xgb = XGBRegressor(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=m,
        gamma=0.1,
        scale_pos_weight=1,
    )
    train_mse_ls, valid_mse_ls = cross_validation(
        X_train_drop, y_train, num_folds, clf_xgb
    )
    print("----------------------------")
    print("CURRENT max_depth = ", m)
    print("train_mse_ls: ", train_mse_ls)
    print("valid_mse_ls: ", valid_mse_ls)
    print("----------------------------")

----------------------------
CURRENT max_depth =  3
train_mse_ls:  [1481.2814239394097, 1530.5443540260144, 1619.7876757495205, 1730.9168986170403, 1783.1940661031956]
valid_mse_ls:  [2597.4419198036167, 2633.7097903355257, 1978.5778501262707, 1360.9834408848499, 1357.7919989341274]
----------------------------
----------------------------
CURRENT max_depth =  5
train_mse_ls:  [1151.3912448231383, 1194.7335682412581, 1254.1986865847778, 1337.7904111063851, 1372.9786173530933]
valid_mse_ls:  [2283.682678926802, 2377.7289268200398, 1696.051575985479, 1295.2740457500872, 1310.8100566016024]
----------------------------
----------------------------
CURRENT max_depth =  7
train_mse_ls:  [937.6410911332791, 981.9601695570608, 1036.7970713719224, 1031.368320409267, 1096.1725572156142]
valid_mse_ls:  [2170.8070080134166, 2240.477229800002, 1663.287069064007, 1311.164217483118, 1443.2380652898669]
----------------------------
----------------------------
CURRENT max_depth =  9
train_mse_ls:  [7

In [13]:
# Sweep gamma with n_estimators=50, learning_rate=0.1 and max_depth=3.
# NOTE(review): the output saved under this cell for gamma=0.1 is identical
# to the max_depth=7 result of the depth sweep, not to a max_depth=3 run, so
# the saved output looks stale -- re-run the cell before relying on it.
gammas = [0, 0.1, 0.2, 0.3, 0.5]
num_folds = 5
for ga in gammas:
    clf_xgb = XGBRegressor(
        n_estimators=50,
        max_depth=3,
        learning_rate=0.1,
        gamma=ga,
    )
    train_mse_ls, valid_mse_ls = cross_validation(
        X_train_drop, y_train, num_folds, clf_xgb
    )
    print("----------------------------")
    print("CURRENT gamma = ", ga)
    print("train_mse_ls: ", train_mse_ls)
    print("valid_mse_ls: ", valid_mse_ls)
    print("----------------------------")

----------------------------
CURRENT gamma =  0
train_mse_ls:  [937.6410618219718, 981.9602998318221, 1030.672073652913, 1031.368320409267, 1096.1725572156142]
valid_mse_ls:  [2170.806946458576, 2240.475868757172, 1673.319448580219, 1311.164217483118, 1443.2380652898669]
----------------------------
----------------------------
CURRENT gamma =  0.1
train_mse_ls:  [937.6410911332791, 981.9601695570608, 1036.7970713719224, 1031.368320409267, 1096.1725572156142]
valid_mse_ls:  [2170.8070080134166, 2240.477229800002, 1663.287069064007, 1311.164217483118, 1443.2380652898669]
----------------------------
----------------------------
CURRENT gamma =  0.2
train_mse_ls:  [937.6410911332791, 981.9601695570608, 1036.79739794318, 1031.3683486635423, 1096.1725572156142]
valid_mse_ls:  [2170.8070080134166, 2240.477229800002, 1663.2871358061682, 1311.1641242120943, 1443.2380652898669]
----------------------------
----------------------------
CURRENT gamma =  0.3
train_mse_ls:  [937.6410378769874, 981

In [15]:
# Sweep the learning rate with n_estimators=50 and max_depth=3.
# Unlike the other sweeps, this one is run on the full X_train matrix
# (constant columns not removed).
lrs = [0.05, 0.1, 0.15, 0.2]
num_folds = 5
for lr in lrs:
    clf_xgb = XGBRegressor(
        n_estimators=50,
        max_depth=3,
        learning_rate=lr,
    )
    train_mse_ls, valid_mse_ls = cross_validation(
        X_train, y_train, num_folds, clf_xgb
    )
    print("----------------------------")
    print("CURRENT learning rate = ", lr)
    print("train_mse_ls: ", train_mse_ls)
    print("valid_mse_ls: ", valid_mse_ls)
    print("----------------------------")

----------------------------
CURRENT learning rate =  0.05
train_mse_ls:  [1876.3740095217308, 1938.8744356968368, 2043.3710671570286, 2179.0686724024195, 2203.41892056255]
valid_mse_ls:  [2952.4607716945343, 3467.944085139743, 2341.193788903668, 1567.2036885838784, 1348.5535040396308]
----------------------------
----------------------------
CURRENT learning rate =  0.1
train_mse_ls:  [1544.3575949751637, 1575.895952293231, 1697.9571904337743, 1792.0455162437597, 1836.1766923283544]
valid_mse_ls:  [2721.6426134217986, 2707.3529423152245, 2014.7405037666565, 1412.583603882135, 1362.8043829396115]
----------------------------
----------------------------
CURRENT learning rate =  0.15
train_mse_ls:  [1369.2463471997983, 1435.7836998225584, 1544.7231377107942, 1624.4913100333943, 1641.7924439376839]
valid_mse_ls:  [2531.240395416254, 2602.765342345424, 1926.7980456989321, 1364.0539995139409, 1297.9666736859115]
----------------------------
----------------------------
CURRENT learning rat

In [9]:
# 5-fold CV of a single hand-picked configuration on the full X_train matrix.
# NOTE(review): the following cell repeats this code verbatim but shows
# different saved numbers, so at least one of the two outputs is stale.
num_folds = 5
clf_xgb = XGBRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    gamma=2,
    scale_pos_weight=1,
)
train_mse_ls, valid_mse_ls = cross_validation(
    X_train, y_train, num_folds, clf_xgb
)
print("----------------------------")
print("train_mse_ls: ", train_mse_ls)
print("valid_mse_ls: ", valid_mse_ls)
print("----------------------------")

----------------------------
train_mse_ls:  [856.7469300385118, 910.6886525038619, 952.6489120810884, 956.9425617375364, 1038.3132877547419]
valid_mse_ls:  [2138.6621515994284, 2228.430397462126, 1626.5486092599942, 1324.49519841778, 1329.0374061916757]
----------------------------


In [10]:
# 5-fold CV of a single hand-picked configuration on the full X_train matrix.
# NOTE(review): this is the same code as the preceding cell, yet the saved
# output differs -- the settings were presumably edited between runs; re-run
# to find out which numbers belong to this configuration.
num_folds = 5
clf_xgb = XGBRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    gamma=2,
    scale_pos_weight=1,
)
train_mse_ls, valid_mse_ls = cross_validation(
    X_train, y_train, num_folds, clf_xgb
)
print("----------------------------")
print("train_mse_ls: ", train_mse_ls)
print("valid_mse_ls: ", valid_mse_ls)
print("----------------------------")

----------------------------
train_mse_ls:  [732.1884143470472, 772.1921441222973, 818.469324085864, 820.3839879431207, 901.5244603882923]
valid_mse_ls:  [2098.9061754286386, 2254.649464986203, 1596.2973036055737, 1444.554176116236, 1392.622885679231]
----------------------------


In [9]:
# Fit the model for the current submission and predict on the test set.
#
# `number` is the running index of the experiment that is active in this cell.
# The comments below are the experiment log: "#k (setup) score", followed,
# where it was kept, by the commented-out model definition used for that run.
# NOTE(review): the trailing numbers are presumably the scores received for
# each submission (lower being better) -- confirm.
number = 16
#1(handle <0 values, drop same_val fea) 1765.17651
#clf_xgb = XGBRegressor(n_estimators=100, scale_pos_weight=1, learning_rate=0.05, max_depth=9, gamma=0.1)
#clf_xgb.fit(X_train_drop, y_train)

#2(handle <0 values, drop same_val fea, handle -1) 1570.92007
#3(no handle <0 values, drop same_val fea, handle -1) 1572.75197
#clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)
#clf_xgb.fit(X_train_drop, y_train)

#4(handle <0 values, no drop same_val fea, handle -1) 1543.60717
# clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)
# clf_xgb.fit(X_train, y_train)

#5(handle <0 values, no drop same_val fea, handle -1, learning_rate changes) 1562.17963
# clf_xgb = XGBRegressor(n_estimators=50, learning_rate=0.2, max_depth=3)
# clf_xgb.fit(X_train, y_train)

#6(handle <0 values as mean value*, no drop same_val fea, handle -1) 1983.79563
# clf_xgb = XGBRegressor(n_estimators=50, learning_rate=0.2, max_depth=3)
# clf_xgb.fit(X_train, y_train)

#7(handle <0 values, no drop same_val fea, handle -1, tune params) 1863.01851

#8(handle <0 values, drop same_val fea, handle -1, tune params as #7) 1863.01851

#9(add subsample, others as #8) 1907

#10 no drop same, tune params 1682.27579
# clf_xgb = XGBRegressor(n_estimators=80, learning_rate=0.2, max_depth=4, scale_pos_weight=1, gamma=1, subsample=0.8)

#11 no drop same, drop party_size, nan, < 10 values in result 1570.92007
# clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)

#12 (no drop same, drop party_size, < 10 values in result) 1543.60717
#clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)

#13 (no drop same, drop party_size + entering host, < 10 values in result) 1524.65555
#clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)

#14 drop ["party_size","entering_host", "time_to_confirm"], no drop same, < 10 values in result 1524.65555
#clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)

# 15 drop same, above features, < 10, missing nan
#clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)

# 16 = 15 - nan
# Active run: trained on the matrix with the constant columns, entering_host
# and time_to_confirm removed; the raw (unclipped) test predictions are kept
# in y_test_pred_raw.
clf_xgb = XGBRegressor(n_estimators=50, scale_pos_weight=1, learning_rate=0.1, max_depth=3)
clf_xgb.fit(X_train_drop_ps, y_train)
y_test_pred_raw = clf_xgb.predict(X_test_drop_ps)

In [10]:
# Clamp negative test predictions to 0; all other values are kept as they are.
y_test_pred = [0 if val < 0 else val for val in y_test_pred_raw]

In [11]:
# Write the clipped predictions to "<number>.csv" as a single-column table.
pd.DataFrame(
    np.asarray(y_test_pred),
    columns=['actual_wait div 60000'],
).to_csv(f"{number}.csv")