In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [16]:
sub = pd.read_csv("Quant404_IITGuwahati_27.csv")
sub["Medium"]= 'Medium'
sub.to_csv("Quant404_IITGuwahati_30.csv", index=None)

In [151]:
train = pd.read_csv("train.csv")
leaderboard = pd.read_csv("leaderboard_dataset.csv")
test = pd.read_csv("test.csv")

In [152]:
train = train.fillna(0)
leaderboard = leaderboard.fillna(0)
test = test.fillna(0)

In [153]:
y = train["VAR21"]
X = train.drop(["VAR21"], axis=1)

In [154]:
from sklearn.preprocessing import LabelEncoder

for c in X.columns:
    if X[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(X[c].values) + list(test[c].values) + list(leaderboard[c].values)) 
        X[c] = lbl.transform(list(X[c].values))
        test[c] = lbl.transform(list(test[c].values))
        leaderboard[c] = lbl.transform(list(leaderboard[c].values))

In [155]:
lbl = LabelEncoder() 
lbl.fit(list(y.values)) 
y = lbl.transform(list(y))

In [156]:
y

array([1, 0, 2, ..., 1, 1, 0])

In [159]:
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_X = tsvd.fit_transform(X)
tsvd_results_test = tsvd.transform(test)
tsvd_results_leaderboard = tsvd.transform(leaderboard)

# PCA
pca = PCA(n_components=12, random_state=420)
pca2_results_X = pca.fit_transform(X)
pca2_results_test = pca.transform(test)
pca2_results_leaderboard = pca.transform(leaderboard)

# ICA
ica = FastICA(n_components=12, random_state=420)
ica2_results_X = ica.fit_transform(X)
ica2_results_test = ica.transform(test)
ica2_results_leaderboard = ica.transform(leaderboard)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_X = grp.fit_transform(X)
grp_results_test = grp.transform(test)
grp_results_leaderboard = grp.transform(leaderboard)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_X = srp.fit_transform(X)
srp_results_test = srp.transform(test)
srp_results_leaderboard = srp.transform(leaderboard)

X_trans = X
leaderboard_trans = leaderboard
test_trans = test

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    X_trans['pca_' + str(i)] = pca2_results_X[:,i-1]
    test_trans['pca_' + str(i)] = pca2_results_test[:, i-1]
    leaderboard_trans['pca_' + str(i)] = pca2_results_leaderboard[:, i-1]
    
    X_trans['ica_' + str(i)] = ica2_results_X[:,i-1]
    test_trans['ica_' + str(i)] = ica2_results_test[:, i-1]
    leaderboard_trans['ica_' + str(i)] = ica2_results_leaderboard[:, i-1]

    X_trans['tsvd_' + str(i)] = tsvd_results_X[:,i-1]
    test_trans['tsvd_' + str(i)] = tsvd_results_test[:, i-1]
    leaderboard_trans['tsvd_' + str(i)] = tsvd_results_leaderboard[:, i-1]
    
    X_trans['grp_' + str(i)] = grp_results_X[:,i-1]
    test_trans['grp_' + str(i)] = grp_results_test[:, i-1]
    leaderboard_trans['grp_' + str(i)] = grp_results_leaderboard[:, i-1]
    
    X_trans['srp_' + str(i)] = srp_results_X[:,i-1]
    test_trans['srp_' + str(i)] = srp_results_test[:, i-1]
    leaderboard_trans['srp_' + str(i)] = srp_results_leaderboard[:, i-1]

In [160]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 7,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3,
    'nthread':4
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_trans, y)
dtest = xgb.DMatrix(leaderboard_trans)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=2200, # increase to have better results (~700)
                   verbose_eval=50, 
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)


d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_17"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Untitled.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09522+5.63462e-05	test-mlogloss:1.09573+3.73304e-05
[50]	train-mlogloss:0.969591+0.0013568	test-mlogloss:0.992859+0.00168788
[100]	train-mlogloss:0.896136+0.00226662	test-mlogloss:0.93869+0.00283176
[150]	train-mlogloss:0.848233+0.00233968	test-mlogloss:0.908668+0.00384027
[200]	train-mlogloss:0.814063+0.00221618	test-mlogloss:0.891227+0.00462623
[250]	train-mlogloss:0.788404+0.00197743	test-mlogloss:0.880964+0.00526465
[300]	train-mlogloss:0.768781+0.00194333	test-mlogloss:0.874743+0.00562563
[350]	train-mlogloss:0.753344+0.00234524	test-mlogloss:0.871065+0.00605806
[400]	train-mlogloss:0.739905+0.00261448	test-mlogloss:0.86878+0.00628787
[450]	train-mlogloss:0.727711+0.00277675	test-mlogloss:0.867463+0.00651507
[500]	train-mlogloss:0.71657+0.00295327	test-mlogloss:0.866686+0.00660897
[550]	train-mlogloss:0.705943+0.00295813	test-mlogloss:0.866245+0.00661186
[600]	train-mlogloss:0.695807+0.00313774	test-mlogloss:0.866032+0.00677635
[650]	train-mlogloss:0.686194+0.

KeyboardInterrupt: 

In [109]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(leaderboard)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=3000, # increase to have better results (~700)
                   verbose_eval=50, 
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)


[0]	train-mlogloss:1.09737+1.94422e-05	test-mlogloss:1.09741+1.74611e-05
[50]	train-mlogloss:1.04457+0.00076767	test-mlogloss:1.0461+0.00064087
[100]	train-mlogloss:1.00521+0.00122188	test-mlogloss:1.00815+0.00120716
[150]	train-mlogloss:0.975235+0.0014886	test-mlogloss:0.979499+0.00178054
[200]	train-mlogloss:0.95203+0.00171672	test-mlogloss:0.957629+0.00231829
[250]	train-mlogloss:0.933833+0.00190879	test-mlogloss:0.940665+0.00279052
[300]	train-mlogloss:0.919273+0.00206486	test-mlogloss:0.927352+0.00321739
[350]	train-mlogloss:0.907547+0.00220922	test-mlogloss:0.916894+0.00361833
[400]	train-mlogloss:0.897991+0.0023502	test-mlogloss:0.908535+0.00390237
[450]	train-mlogloss:0.8901+0.00243324	test-mlogloss:0.901824+0.00424849
[500]	train-mlogloss:0.883516+0.00250287	test-mlogloss:0.896431+0.00450969
[550]	train-mlogloss:0.877957+0.00254501	test-mlogloss:0.892022+0.00476006
[600]	train-mlogloss:0.873197+0.00260457	test-mlogloss:0.888405+0.00495477
[650]	train-mlogloss:0.869073+0.002630

KeyboardInterrupt: 

In [146]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.scorer import make_scorer


def my_custom_accuracy(y_true, y_pred):
    total=0
    for t,p in zip(y_true, y_pred):
        if(t==p):
            total+=1000
        if(t==0 and p==1):
            total-=100
        if(t==0 and p==2):
            total-=50
        if(t==2 and p==1):
            total-=50
        if(t==2 and p==0):
            total-=100
        if(t==1 and p==2):
            total-=100
        if(t==1 and p==0):
            total-=200
    return float(sum(y_pred == y_true))

# Make a custom a scorer from the custom metric function
# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.
my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=30000, test_size=4000)

tpot = TPOTClassifier(generations=10, population_size=20, verbosity=3,
                      scoring=my_custom_scorer, cv=3)

In [147]:
tpot.fit(X_train, y_train)

30 operators have been imported by TPOT.


A Jupyter Widget

Skipped pipeline #14 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.
Skipped pipeline #21 due to time out. Continuing to the next pipeline.
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False.
Skipped pipeline #25 due to time out. Continuing to the next pipeline.
Skipped pipeline #30 due to time out. Continuing to the next pipeline.


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=3,
        disable_update_check=False, early_stop=None, generations=10,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=20,
        random_state=None, scoring=make_scorer(my_custom_accuracy),
        subsample=1.0, template=None, use_dask=False, verbosity=3,
        warm_start=False)