In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
leaderboard = pd.read_csv("leaderboard_dataset.csv")
test = pd.read_csv("test.csv")

In [3]:
train = train.fillna(0)
leaderboard = leaderboard.fillna(0)
test = test.fillna(0)

In [4]:
y = train["VAR21"]
X = train.drop(["VAR21"], axis=1)

In [5]:
from sklearn.preprocessing import LabelEncoder

for c in X.columns:
    if X[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(X[c].values) + list(test[c].values) + list(leaderboard[c].values)) 
        X[c] = lbl.transform(list(X[c].values))
        test[c] = lbl.transform(list(test[c].values))
        leaderboard[c] = lbl.transform(list(leaderboard[c].values))

In [6]:
lbl = LabelEncoder() 
lbl.fit(list(y.values)) 
y = lbl.transform(list(y))

In [7]:
y

array([1, 0, 2, ..., 1, 1, 0])

In [8]:
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_X = tsvd.fit_transform(X)
tsvd_results_test = tsvd.transform(test)
tsvd_results_leaderboard = tsvd.transform(leaderboard)

# PCA
pca = PCA(n_components=12, random_state=420)
pca2_results_X = pca.fit_transform(X)
pca2_results_test = pca.transform(test)
pca2_results_leaderboard = pca.transform(leaderboard)

# ICA
ica = FastICA(n_components=12, random_state=420)
ica2_results_X = ica.fit_transform(X)
ica2_results_test = ica.transform(test)
ica2_results_leaderboard = ica.transform(leaderboard)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_X = grp.fit_transform(X)
grp_results_test = grp.transform(test)
grp_results_leaderboard = grp.transform(leaderboard)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_X = srp.fit_transform(X)
srp_results_test = srp.transform(test)
srp_results_leaderboard = srp.transform(leaderboard)

X_trans = X
leaderboard_trans = leaderboard
test_trans = test

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    X_trans['pca_' + str(i)] = pca2_results_X[:,i-1]
    test_trans['pca_' + str(i)] = pca2_results_test[:, i-1]
    leaderboard_trans['pca_' + str(i)] = pca2_results_leaderboard[:, i-1]
    
    X_trans['ica_' + str(i)] = ica2_results_X[:,i-1]
    test_trans['ica_' + str(i)] = ica2_results_test[:, i-1]
    leaderboard_trans['ica_' + str(i)] = ica2_results_leaderboard[:, i-1]

    X_trans['tsvd_' + str(i)] = tsvd_results_X[:,i-1]
    test_trans['tsvd_' + str(i)] = tsvd_results_test[:, i-1]
    leaderboard_trans['tsvd_' + str(i)] = tsvd_results_leaderboard[:, i-1]
    
    X_trans['grp_' + str(i)] = grp_results_X[:,i-1]
    test_trans['grp_' + str(i)] = grp_results_test[:, i-1]
    leaderboard_trans['grp_' + str(i)] = grp_results_leaderboard[:, i-1]
    
    X_trans['srp_' + str(i)] = srp_results_X[:,i-1]
    test_trans['srp_' + str(i)] = srp_results_test[:, i-1]
    leaderboard_trans['srp_' + str(i)] = srp_results_leaderboard[:, i-1]

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.metrics import log_loss
scoring = ['precision_macro', 'recall_macro']

clf = HistGradientBoostingClassifier()
print(cross_val_score(clf, X_trans, y, cv=3)) 

from sklearn.ensemble import BaggingClassifier

[0.53440974 0.54910439 0.36583429]


In [10]:
bag = BaggingClassifier(clf, n_estimators=5)
print(cross_val_score(bag, X_trans, y, cv=3)) 

[0.53449797 0.56234007 0.36565781]


In [16]:
bag = BaggingClassifier(clf, n_estimators=20)
print(cross_val_score(bag, X_trans, y, cv=3)) 

KeyboardInterrupt: 

In [14]:
from xgboost.sklearn import XGBClassifier
xg = XGBClassifier()
ada = AdaBoostClassifier(n_estimators=5, base_estimator=xg)

print(cross_val_score(ada, X_trans, y, cv=3)) 

[0.1101994  0.11020912 0.11020912]


In [33]:
xg = XGBClassifier()
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=4000, stratify=y, random_state=42)

def my_custom_accuracy(y_true, y_pred):
    total=0
    for t,p in zip(y_true, y_pred):
        if(t==p):
            total+=1000
        if(t==0 and p==1):
            total-=100
        if(t==0 and p==2):
            total-=50
        if(t==2 and p==1):
            total-=50
        if(t==2 and p==0):
            total-=100
        if(t==1 and p==2):
            total-=100
        if(t==1 and p==0):
            total-=200
    return total


my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)
two_scorer = make_scorer(multilabel_confusion_matrix)
print(cross_val_score(xg, X_train, y_train, cv=3, scoring=my_custom_scorer)) 

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size=10000, stratify=y, random_state=42)

xg.fit(X_test2, y_test2)
preds = xg.predict(leaderboard)


[5361400 5407350 5447650]


ValueError: Found input variables with inconsistent numbers of samples: [30000, 34000]

In [34]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size=10000, stratify=y_train, random_state=42)

xg.fit(X_test2, y_test2)
preds = xg.predict(leaderboard)

In [37]:
d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_35"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Quant404_IITGuwahati_14.ipynb", filename+".ipynb")

'Quant404_IITGuwahati_35.ipynb'

In [32]:
xg = XGBClassifier()
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=4000, stratify=y, random_state=42)

def my_custom_accuracy(y_true, y_pred):
    total=0
    for t,p in zip(y_true, y_pred):
        if(t==p):
            total+=1000
        if(t==0 and p==1):
            total-=100
        if(t==0 and p==2):
            total-=50
        if(t==2 and p==1):
            total-=50
        if(t==2 and p==0):
            total-=100
        if(t==1 and p==2):
            total-=100
        if(t==1 and p==0):
            total-=200
    return total


my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)
two_scorer = make_scorer(multilabel_confusion_matrix)
print(cross_val_score(clf, X_train, y_train, cv=3, scoring=my_custom_scorer)) 

[5323400 5345450 5372800]


In [26]:
y

array([1, 0, 2, ..., 1, 1, 0])

In [None]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3,
    'nthread':4
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_trans, y)
dtest = xgb.DMatrix(leaderboard_trans)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=2200, # increase to have better results (~700)
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)

d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_13"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Untitled-Copy1.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09546+5.12272e-05	test-mlogloss:1.09578+2.78129e-05
[50]	train-mlogloss:0.979724+0.00160848	test-mlogloss:0.994454+0.00167022
[100]	train-mlogloss:0.913761+0.00242372	test-mlogloss:0.940842+0.00299623
[150]	train-mlogloss:0.872216+0.00277071	test-mlogloss:0.910789+0.00406495
[200]	train-mlogloss:0.843639+0.00301682	test-mlogloss:0.893214+0.00481768
[250]	train-mlogloss:0.822686+0.00302518	test-mlogloss:0.882626+0.00544124
[300]	train-mlogloss:0.806804+0.00281609	test-mlogloss:0.876077+0.00591538
[350]	train-mlogloss:0.794175+0.00267161	test-mlogloss:0.872062+0.00633703
[400]	train-mlogloss:0.783294+0.00249531	test-mlogloss:0.869487+0.00669279
[450]	train-mlogloss:0.773591+0.00249327	test-mlogloss:0.867862+0.00691721
[500]	train-mlogloss:0.764751+0.00298162	test-mlogloss:0.866749+0.00702949
[550]	train-mlogloss:0.756276+0.0030981	test-mlogloss:0.866013+0.0071353
[600]	train-mlogloss:0.748279+0.00333062	test-mlogloss:0.865569+0.00725253
[650]	train-mlogloss:0.740742+

'Quant404_IITGuwahati_13.ipynb'

In [None]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 7,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3,
    'nthread':4
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_trans, y)
dtest = xgb.DMatrix(leaderboard_trans)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=2200, # increase to have better results (~700)
                   verbose_eval=50, 
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)


d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_14"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Untitled-Copy1.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.0952+5.63106e-05	test-mlogloss:1.09572+2.74024e-05
[50]	train-mlogloss:0.968323+0.00183996	test-mlogloss:0.992689+0.0016189
[100]	train-mlogloss:0.89432+0.00287841	test-mlogloss:0.938692+0.00306198
[150]	train-mlogloss:0.846123+0.00330994	test-mlogloss:0.908724+0.00417618
[200]	train-mlogloss:0.811663+0.00365102	test-mlogloss:0.891262+0.00498612
[250]	train-mlogloss:0.785642+0.00350469	test-mlogloss:0.880926+0.00562466
[300]	train-mlogloss:0.765406+0.00347686	test-mlogloss:0.874523+0.00604547
[350]	train-mlogloss:0.749053+0.00340089	test-mlogloss:0.870756+0.00645945
[400]	train-mlogloss:0.734978+0.00327302	test-mlogloss:0.868298+0.00673399
[450]	train-mlogloss:0.722338+0.00329753	test-mlogloss:0.866939+0.00688388
[500]	train-mlogloss:0.71087+0.00389402	test-mlogloss:0.866062+0.00705302
[550]	train-mlogloss:0.699879+0.00426086	test-mlogloss:0.865482+0.00710715
[600]	train-mlogloss:0.689449+0.00464952	test-mlogloss:0.865206+0.00720728
[650]	train-mlogloss:0.679448+0.

In [53]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 8,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3,
    'nthread':4
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_trans, y)
dtest = xgb.DMatrix(leaderboard_trans)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=2200, # increase to have better results (~700)
                   verbose_eval=50, 
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)


d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_15"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Untitled-Copy1.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09567+3.87843e-05	test-mlogloss:1.09586+3.23557e-05
[50]	train-mlogloss:0.988552+0.00141342	test-mlogloss:0.996762+0.00177267
[100]	train-mlogloss:0.928611+0.00219887	test-mlogloss:0.943884+0.00301666
[150]	train-mlogloss:0.892078+0.00264109	test-mlogloss:0.913998+0.00407081
[200]	train-mlogloss:0.86806+0.00286799	test-mlogloss:0.896348+0.00484156
[250]	train-mlogloss:0.851102+0.00298996	test-mlogloss:0.88564+0.00554985
[300]	train-mlogloss:0.83864+0.00296459	test-mlogloss:0.878977+0.00609341
[350]	train-mlogloss:0.828861+0.00287962	test-mlogloss:0.874807+0.00648909
[400]	train-mlogloss:0.820723+0.0027224	test-mlogloss:0.872068+0.00681436
[450]	train-mlogloss:0.813547+0.00268878	test-mlogloss:0.870177+0.00710087
[500]	train-mlogloss:0.807035+0.00286632	test-mlogloss:0.868848+0.00729202
[550]	train-mlogloss:0.800863+0.0029347	test-mlogloss:0.867879+0.00740911
[600]	train-mlogloss:0.795126+0.00307687	test-mlogloss:0.86715+0.00749668
[650]	train-mlogloss:0.789856+0.00

'Quant404_IITGuwahati_12.ipynb'

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.scorer import make_scorer


def my_custom_accuracy(y_true, y_pred):
    total=0
    for t,p in zip(y_true, y_pred):
        if(t==p):
            total+=1000
        if(t==0 and p==1):
            total-=100
        if(t==0 and p==2):
            total-=50
        if(t==2 and p==1):
            total-=50
        if(t==2 and p==0):
            total-=100
        if(t==1 and p==2):
            total-=100
        if(t==1 and p==0):
            total-=200
    return float(sum(y_pred == y_true))

# Make a custom a scorer from the custom metric function
# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.
my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=30000, test_size=4000)

#tpot = TPOTClassifier(generations=10, population_size=20, verbosity=3, scoring=my_custom_scorer, cv=3)

In [None]:
tpot.fit(X_train, y_train)

30 operators have been imported by TPOT.


A Jupyter Widget

Skipped pipeline #14 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.
