In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [40]:
train = pd.read_csv("train.csv")
leaderboard = pd.read_csv("leaderboard_dataset.csv")
test = pd.read_csv("test.csv")

In [41]:
train = train.fillna(0)
leaderboard = leaderboard.fillna(0)
test = test.fillna(0)

In [42]:
y = train["VAR21"]
X = train.drop(["VAR21"], axis=1)

In [43]:
from sklearn.preprocessing import LabelEncoder

for c in X.columns:
    if X[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(X[c].values) + list(test[c].values) + list(leaderboard[c].values)) 
        X[c] = lbl.transform(list(X[c].values))
        test[c] = lbl.transform(list(test[c].values))
        leaderboard[c] = lbl.transform(list(leaderboard[c].values))

In [31]:
lbl = LabelEncoder() 
lbl.fit(list(y.values)) 
y = lbl.transform(list(y))

In [32]:
y

array([1, 0, 2, ..., 1, 1, 0])

In [33]:
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_X = tsvd.fit_transform(X)
tsvd_results_test = tsvd.transform(test)
tsvd_results_leaderboard = tsvd.transform(leaderboard)

# PCA
pca = PCA(n_components=12, random_state=420)
pca2_results_X = pca.fit_transform(X)
pca2_results_test = pca.transform(test)
pca2_results_leaderboard = pca.transform(leaderboard)

# ICA
ica = FastICA(n_components=12, random_state=420)
ica2_results_X = ica.fit_transform(X)
ica2_results_test = ica.transform(test)
ica2_results_leaderboard = ica.transform(leaderboard)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_X = grp.fit_transform(X)
grp_results_test = grp.transform(test)
grp_results_leaderboard = grp.transform(leaderboard)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_X = srp.fit_transform(X)
srp_results_test = srp.transform(test)
srp_results_leaderboard = srp.transform(leaderboard)

X_trans = X
leaderboard_trans = leaderboard
test_trans = test

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    X_trans['pca_' + str(i)] = pca2_results_X[:,i-1]
    test_trans['pca_' + str(i)] = pca2_results_test[:, i-1]
    leaderboard_trans['pca_' + str(i)] = pca2_results_leaderboard[:, i-1]
    
    X_trans['ica_' + str(i)] = ica2_results_X[:,i-1]
    test_trans['ica_' + str(i)] = ica2_results_test[:, i-1]
    leaderboard_trans['ica_' + str(i)] = ica2_results_leaderboard[:, i-1]

    X_trans['tsvd_' + str(i)] = tsvd_results_X[:,i-1]
    test_trans['tsvd_' + str(i)] = tsvd_results_test[:, i-1]
    leaderboard_trans['tsvd_' + str(i)] = tsvd_results_leaderboard[:, i-1]
    
    X_trans['grp_' + str(i)] = grp_results_X[:,i-1]
    test_trans['grp_' + str(i)] = grp_results_test[:, i-1]
    leaderboard_trans['grp_' + str(i)] = grp_results_leaderboard[:, i-1]
    
    X_trans['srp_' + str(i)] = srp_results_X[:,i-1]
    test_trans['srp_' + str(i)] = srp_results_test[:, i-1]
    leaderboard_trans['srp_' + str(i)] = srp_results_leaderboard[:, i-1]

In [36]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 5,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(leaderboard)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1000, # increase to have better results (~700)
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)

[0]	train-mlogloss:1.09566+4.15077e-05	test-mlogloss:1.09585+2.99592e-05
[50]	train-mlogloss:0.987976+0.0015813	test-mlogloss:0.996494+0.00139471
[100]	train-mlogloss:0.92762+0.00225673	test-mlogloss:0.943512+0.00259833
[150]	train-mlogloss:0.890774+0.00263562	test-mlogloss:0.913657+0.0036094
[200]	train-mlogloss:0.866443+0.00297708	test-mlogloss:0.895978+0.0043741
[250]	train-mlogloss:0.849421+0.00308606	test-mlogloss:0.885321+0.00500246
[300]	train-mlogloss:0.836785+0.0031878	test-mlogloss:0.878686+0.00552618
[350]	train-mlogloss:0.826849+0.00329083	test-mlogloss:0.874596+0.00591989
[400]	train-mlogloss:0.818416+0.00345825	test-mlogloss:0.871881+0.00615638
[450]	train-mlogloss:0.811031+0.00345992	test-mlogloss:0.870088+0.00639032
[500]	train-mlogloss:0.804297+0.00344186	test-mlogloss:0.868835+0.00650673
[550]	train-mlogloss:0.797968+0.00352956	test-mlogloss:0.867941+0.00659538
[600]	train-mlogloss:0.792011+0.0036081	test-mlogloss:0.867314+0.00673062
[650]	train-mlogloss:0.786379+0.00

KeyError: 'VAR1'

In [38]:
leaderboard

Unnamed: 0,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,...,pca_11,ica_11,tsvd_11,grp_11,srp_11,pca_12,ica_12,tsvd_12,grp_12,srp_12
0,878.823529,0.833333,821.428571,620.835806,5.181818,0.833974,197.500187,0.000000,58.632548,9.433609,...,-3.536202,-0.000987,-1.402654,226.656629,64.830000,0.529224,0.000046,0.305961,630.854061,-536.094258
1,891.764706,0.138889,357.142857,614.092215,3.909091,0.595187,197.016843,253.896073,58.632548,10.144612,...,-6.755607,-0.000733,-5.488303,195.292847,70.011218,-0.349585,-0.000215,-0.485189,753.741204,-409.721936
2,955.294118,0.055556,0.000000,611.574748,6.363636,0.918652,197.660051,22.086661,0.000000,0.583494,...,-0.944650,-0.000100,-3.496931,-99.527851,74.234224,-0.886917,-0.000448,-0.801769,549.895516,-13.663195
3,831.764706,0.111111,250.000000,617.740617,6.545455,2.383924,200.526288,0.000000,48.539365,0.000000,...,-10.407826,-0.001294,-9.423359,-29.126986,110.650078,-0.023365,-0.000233,-0.171930,509.190863,-179.928261
4,957.647059,0.166667,0.000000,623.426802,0.000000,0.711240,197.151458,5.013668,49.379748,0.255712,...,7.771680,-0.000527,7.792893,-101.465232,67.176734,-0.544748,-0.000332,-0.550136,547.992956,-32.936770
5,890.588235,0.138889,107.142857,611.574748,10.909091,0.681405,197.150320,5.933411,0.000000,12.406516,...,-4.108310,0.000030,-6.215257,65.860875,167.226536,-1.486900,-0.000507,-1.465241,617.921963,-75.627879
6,923.529412,0.013889,71.428571,648.165558,11.363636,1.092656,198.414715,107.066619,58.632548,0.317387,...,-2.171180,-0.000523,-2.894619,-47.940781,85.601656,-0.637181,-0.000362,-0.499184,609.737099,-143.107052
7,881.176471,0.083333,285.714286,599.039931,11.363636,0.720824,196.419063,16.995992,47.899686,10.879870,...,-6.345641,-0.000732,-4.488981,107.482723,123.735070,-0.719919,-0.000376,-0.993676,608.497569,-217.868210
8,960.000000,0.333333,0.000000,612.078941,11.818182,0.564360,196.932449,75.068125,58.632548,0.801784,...,-2.261552,-0.000724,-1.197004,-67.246162,77.385487,-0.239200,-0.000231,-0.355211,587.070203,-81.063993
9,876.470588,0.000000,0.000000,641.218888,6.363636,0.633561,197.253087,11.755597,58.632548,0.000000,...,-11.227097,-0.000494,-11.495108,-67.997853,205.850478,-1.134201,-0.000458,-1.081760,581.678775,-42.422601


In [37]:
d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_11"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Untitled-Copy1.ipynb", filename+".ipynb")

array([2., 1., 2., ..., 2., 2., 1.], dtype=float32)

In [None]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 5,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_trans, y)
dtest = xgb.DMatrix(leaderboard_trans)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1000, # increase to have better results (~700)
                   verbose_eval=50, 
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)


d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_12"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Untitled-Copy1.ipynb", filename+".ipynb")

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.scorer import make_scorer


def my_custom_accuracy(y_true, y_pred):
    total=0
    for t,p in zip(y_true, y_pred):
        if(t==p):
            total+=1000
        if(t==0 and p==1):
            total-=100
        if(t==0 and p==2):
            total-=50
        if(t==2 and p==1):
            total-=50
        if(t==2 and p==0):
            total-=100
        if(t==1 and p==2):
            total-=100
        if(t==1 and p==0):
            total-=200
    return float(sum(y_pred == y_true))

# Make a custom a scorer from the custom metric function
# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.
my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=30000, test_size=4000)

#tpot = TPOTClassifier(generations=10, population_size=20, verbosity=3, scoring=my_custom_scorer, cv=3)

In [None]:
tpot.fit(X_train, y_train)

30 operators have been imported by TPOT.


A Jupyter Widget

Skipped pipeline #14 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.
