In [9]:
import pickle
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline
import graphviz
import itertools

import pandas as pd
import xgboost as xgb

with open('./MTH9899_2017Spring/final/ml_finalproj_train_vF.pkl', 'rb') as f:
    rawdata = pickle.load(f)
    
with open('./MTH9899_2017Spring/final/ml_finalproj_holdout.pkl', 'rb') as f:
    rawdata_holdout = pickle.load(f)

In [6]:
def winsorize(x):
    y = x.copy()
    thresh = 4*np.std(x)
    y[y > thresh] = thresh
    y[y < -thresh] = -thresh
    return y

In [5]:
def AddMovingAvg(data):
    new_columns = ['x22_avg','x17_avg','x2_avg','x25_30_avg','x42_51_avg']
    new_data = pd.concat([data, pd.DataFrame(columns=new_columns)], axis=1)
    new_data['x25_30'] = new_data.x25+new_data.x30
    new_data['x42_51'] = new_data.x42+new_data.x51

    ids = new_data['id'].unique()
    for i in ids:
        s_i = new_data.loc[new_data.id==i, ['x22','x17','x2','x25_30','x42_51']]
        new_data.loc[data.id==i, 'x22_avg'] = s_i.x22.rolling(window=5, min_periods=1).mean()
        new_data.loc[data.id==i, 'x17_avg'] = s_i.x17.rolling(window=5, min_periods=1).mean()
        new_data.loc[data.id==i, 'x2_avg'] = s_i.x2.rolling(window=5, min_periods=1).mean()
        new_data.loc[data.id==i, 'x25_30_avg'] = s_i.x25_30.rolling(window=5, min_periods=1).mean()
        new_data.loc[data.id==i, 'x42_51_avg'] = s_i.x42_51.rolling(window=5, min_periods=1).mean()

    for f in new_columns:
        new_data[f] = pd.to_numeric(new_data[f])
        
    return new_data

def AddDiff(datain, cols):
    new_data = pd.concat([datain, pd.DataFrame(np.zeros([datain.shape[0], len(cols)]), columns=[col+'_diff' for col in cols])], axis=1)
    ids = new_data['id'].unique()
    for i in ids:
        s_i = datain.loc[datain.id==i, cols]
        new_data.loc[datain.id==i, [col+'_diff' for col in cols]] = s_i.diff().values
    return new_data

In [16]:
categorical = []
for i in rawdata.columns:
    if i!='timestamp' and rawdata[i][:1000].unique().shape[0]<20:
        categorical.append(i)
categorical
    
notin = ['id','y','weight','timestamp']
continum = [i for i in rawdata.columns if i not in categorical and i not in notin]
filt = ['x22', 'x25', 'x30', 'x42', 'x17','x0','x13']
syn = ['x22_avg','x17_avg','x2_avg','x25_30_avg','x42_51_avg']
feat = continum+categorical+syn
feat = [i for i in feat if i not in filt]

In [18]:
def preprocess(datain, add_diff):
    datain[continum].apply(winsorize)
    
    if add_diff:
        feat = feat + [i+'_diff' for i in continum]
        
    datain = AddMovingAvg(datain)
    if add_diff:
        data = AddDiff(datain, continum)
    else:
        data = datain
    return data

In [19]:
data = preprocess(rawdata, False)

In [20]:
def ave_predict(models, X, n):
    res = np.zeros(n)
    for model in models:
        res = res + model.predict(X)
    return res/len(models)

In [21]:
def grid_search(etas, gammas, max_depths, subsamples, colsample_bytrees, min_child_weight, rounds=[20], verbose=False):
    cut1 = 400
    cut2 = 430

    X_train = data[feat][data.timestamp <= cut1].values
    y_train = data.y[data.timestamp <= cut1].values
    weight_train = data.weight[data.timestamp <= cut1].values
    X_valid = data[feat][data.timestamp > cut2].values
    y_valid = data.y[data.timestamp > cut2].values
    weight_valid = data.weight[data.timestamp > cut2].values

    xgmat_train = xgb.DMatrix(X_train, label=y_train, weight=np.log(weight_train), feature_names=feat)
    xgmat_valid = xgb.DMatrix(X_valid, label=y_valid, weight=np.log(weight_valid), feature_names=feat)
    
    train_R2 = []
    valid_R2 = []
    for comb in itertools.product(etas, gammas, max_depths, subsamples, colsample_bytrees, min_child_weight, rounds):
        params_xgb = {'objective':'reg:linear',
                        'eta'             : comb[0],
                        'max_depth'       : comb[2],
                        'gamma'           : comb[1],
                        'subsample'       : comb[3],
                        'colsample_bytree': comb[4],
                        'min_child_weight': comb[5],
                        'rounds'          : comb[6],
#                         'lambda'          : ,
                       'base_score' : 0.0000,
                      'early_stopping_rounds' : True
                        }

        models = []
        for i in range(10):
            params_xgb['seed'] = 2333+i*100
            bst = xgb.train(params_xgb, xgmat_train)
            models.append(bst)
    
        print('combinations:')
        print(comb)
        
        if verbose:
            print('test r2', r2_score(y_valid, ave_predict(models, xgmat_valid, y_valid.shape[0])))
            print('test weighted r2', r2_score(y_valid, ave_predict(models, xgmat_valid, y_valid.shape[0]), weight_valid))
            print('train r2', r2_score(y_train, ave_predict(models, xgmat_train, y_train.shape[0])))
            print('train weighted r2', r2_score(y_train, ave_predict(models, xgmat_train, y_train.shape[0]), weight_train))
            train_R2.append(r2_score(y_train, ave_predict(models, xgmat_train, y_train.shape[0]), weight_train))
            valid_R2.append(r2_score(y_valid, ave_predict(models, xgmat_valid, y_valid.shape[0]), weight_valid))
        else:
            print("score is: ", r2_score(y_valid, ave_predict(models, xgmat_valid, y_valid.shape[0]), weight_valid))
            return trainR2, valid_R2

In [11]:
# Set up parameters to look at
etas = np.arange(0.05, 0.35, 0.05)
max_depths = np.arange(7, 15, 1)
gammas = (0.00001, 0.00005, 0.0001)
subsamples = (0.5, 0.6, 0.7, 0.8, 0.9)
colsample_bytrees = (0.5, 0.6, 0.7, 0.8, 0.9)
min_child_weight = (100, 150, 200)

In [22]:
# Grid search
a, b = grid_search(etas, gammas, max_depths, subsamples, colsample_bytrees, min_child_weight, verbose=True)

combinations:
(0.050000000000000003, 1e-05, 7, 0.5, 0.5, 100, 20)
test r2 0.00175234668911
test weighted r2 0.000974571933168
train r2 0.010092298712
train weighted r2 0.00781455383922
combinations:
(0.050000000000000003, 1e-05, 7, 0.5, 0.5, 150, 20)
test r2 0.00171658965057
test weighted r2 0.000958600330339
train r2 0.00972419909638
train weighted r2 0.00761236446939
combinations:
(0.050000000000000003, 1e-05, 7, 0.5, 0.5, 200, 20)
test r2 0.00180996233196
test weighted r2 0.00104444103394
train r2 0.00953825677468
train weighted r2 0.00751190277092
combinations:
(0.050000000000000003, 1e-05, 7, 0.5, 0.6, 100, 20)
test r2 0.00201093704014
test weighted r2 0.00134281050088
train r2 0.0109628092847
train weighted r2 0.00835185123154
combinations:
(0.050000000000000003, 1e-05, 7, 0.5, 0.6, 150, 20)
test r2 0.00196445385807
test weighted r2 0.00121892008638
train r2 0.0106516678174
train weighted r2 0.00816508869672
combinations:
(0.050000000000000003, 1e-05, 7, 0.5, 0.6, 200, 20)
test r

KeyboardInterrupt: 

### Optimal combinations:
(0.050000000000000003, 1e-05, 7, 0.5, 0.9, 200, 20)

test r2 0.00255791169279

test weighted r2 0.00165898939465

train r2 0.0116847637575

train weighted r2 0.00900018419573


(0.050000000000000003, 1e-05, 8, 0.5, 0.9, 200, 20)

test r2 0.00262989487873

test weighted r2 0.00167114005845

train r2 0.0156572129129

train weighted r2 0.0121584881498


(0.050000000000000003, 1e-05, 9, 0.5, 0.9, 200, 20)

test r2 0.00277923351012

test weighted r2 0.00177947156375

train r2 0.0203969457098

train weighted r2 0.0160713614076

(0.050000000000000003, 1e-05, 10, 0.5, 0.8, 200, 20)

test r2 0.0030381541574

test weighted r2 0.00209913363707

train r2 0.0249879096699

train weighted r2 0.0199297130826