In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [46]:
# Load the three competition files in one pass: training data, the
# leaderboard set that gets scored, and the final test set.
train, leaderboard, test = [
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "leaderboard_dataset.csv", "test.csv")
]

In [47]:
# Replace every missing value with 0 in all three datasets.
train, leaderboard, test = [
    frame.fillna(0) for frame in (train, leaderboard, test)
]

In [48]:
# Separate the feature columns from the target column "VAR21".
X, y = train.drop("VAR21", axis=1), train["VAR21"]

In [49]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-typed feature column. One encoder per column is
# fitted on the values from all three datasets together, so a category that
# only appears in test/leaderboard still receives a code.
frames = (X, test, leaderboard)
object_columns = [name for name in X.columns if X[name].dtype == 'object']

for name in object_columns:
    encoder = LabelEncoder()
    # Values are passed as plain Python lists, exactly as before.
    encoder.fit([value for frame in frames for value in frame[name].values])
    for frame in frames:
        frame[name] = encoder.transform(list(frame[name].values))

In [50]:
# Encode the target labels as integer class codes; `lbl` keeps the fitted
# mapping so codes can be translated back to labels later.
lbl = LabelEncoder()
y = lbl.fit_transform(list(y.values))

In [51]:
# Display the label-encoded target (integer class codes) as a sanity check.
y

array([1, 0, 2, ..., 1, 1, 0])

In [52]:
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
# Number of components extracted by every decomposition below.
n_comp = 12

# Each transformer is fitted on the training features only and then applied
# unchanged to the test and leaderboard features.

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_X = tsvd.fit_transform(X)
tsvd_results_test = tsvd.transform(test)
tsvd_results_leaderboard = tsvd.transform(leaderboard)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_X = pca.fit_transform(X)
pca2_results_test = pca.transform(test)
pca2_results_leaderboard = pca.transform(leaderboard)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_X = ica.fit_transform(X)
ica2_results_test = ica.transform(test)
ica2_results_leaderboard = ica.transform(leaderboard)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_X = grp.fit_transform(X)
grp_results_test = grp.transform(test)
grp_results_leaderboard = grp.transform(leaderboard)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_X = srp.fit_transform(X)
srp_results_test = srp.transform(test)
srp_results_leaderboard = srp.transform(leaderboard)

# Work on copies. Plain assignment (`X_trans = X`) only creates a second name
# for the same DataFrame, so the component columns appended below would also
# appear in X / test / leaderboard, and re-running this cell would fit the
# decompositions on already-augmented frames.
X_trans = X.copy()
leaderboard_trans = leaderboard.copy()
test_trans = test.copy()

# (column prefix, train components, test components, leaderboard components)
component_sets = [
    ('pca', pca2_results_X, pca2_results_test, pca2_results_leaderboard),
    ('ica', ica2_results_X, ica2_results_test, ica2_results_leaderboard),
    ('tsvd', tsvd_results_X, tsvd_results_test, tsvd_results_leaderboard),
    ('grp', grp_results_X, grp_results_test, grp_results_leaderboard),
    ('srp', srp_results_X, srp_results_test, srp_results_leaderboard),
]

# Append decomposition components to datasets. Columns are named
# "<prefix>_<i>" with i starting at 1 and are added component by component
# (pca_1, ica_1, tsvd_1, grp_1, srp_1, pca_2, ...).
for i in range(1, n_comp + 1):
    for prefix, comp_X, comp_test, comp_leaderboard in component_sets:
        column = prefix + '_' + str(i)
        X_trans[column] = comp_X[:, i - 1]
        test_trans[column] = comp_test[:, i - 1]
        leaderboard_trans[column] = comp_leaderboard[:, i - 1]

In [None]:
import shutil

import xgboost as xgb

# Train a 3-class XGBoost model on X / y, predict the leaderboard set and
# write the submission file (plus a snapshot copy of the notebook).

# prepare dict of params for xgboost to run with
# NOTE(review): 'n_trees' does not appear to be an XGBoost parameter (the
# number of rounds is controlled by num_boost_round below) -- confirm whether
# it can be dropped.
xgb_params = {
    'n_trees': 700,
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class': 3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(leaderboard)

# xgboost, cross-validation with early stopping; the number of rows in the
# returned result is used as the number of boosting rounds for the final fit.
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=1200,  # upper bound on boosting rounds
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model on the full training data
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# make predictions and save results
y_preds = model.predict(dtest)

# Translate the predicted class codes into the submission labels. Series.map
# replaces the chained assignment `df["col2"][mask] = ...`, which raises
# SettingWithCopyWarning and does not modify `df` under pandas Copy-on-Write.
class_names = {0: "High", 1: "Low", 2: "Medium"}
d = {'col1': leaderboard["VAR1"], 'col2': y_preds.astype(int)}
df = pd.DataFrame(data=d)
df["col2"] = df["col2"].map(class_names)

filename = "Quant404_IITGuwahati_13"
df.to_csv(filename+'.csv', index=False, header=False)
# Keep a copy of the notebook next to the submission it produced.
shutil.copyfile("Untitled-Copy1.ipynb", filename+".ipynb")

In [None]:
import shutil

import xgboost as xgb

# Train a 3-class XGBoost model on the decomposition-augmented features
# (X_trans / y), predict the augmented leaderboard set and write the
# submission file (plus a snapshot copy of the notebook).

# prepare dict of params for xgboost to run with
# NOTE(review): 'n_trees' does not appear to be an XGBoost parameter (the
# number of rounds is controlled by num_boost_round below) -- confirm whether
# it can be dropped.
xgb_params = {
    'n_trees': 700,
    'eta': 0.01,
    'max_depth': 5,
    'subsample': 0.93,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class': 3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_trans, y)
dtest = xgb.DMatrix(leaderboard_trans)

# xgboost, cross-validation with early stopping; the number of rows in the
# returned result is used as the number of boosting rounds for the final fit.
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=1000,  # upper bound on boosting rounds
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model on the full training data
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

# make predictions and save results
y_preds = model.predict(dtest)

# Translate the predicted class codes into the submission labels. Series.map
# replaces the chained assignment `df["col2"][mask] = ...`, which raises
# SettingWithCopyWarning and does not modify `df` under pandas Copy-on-Write.
class_names = {0: "High", 1: "Low", 2: "Medium"}
d = {'col1': leaderboard["VAR1"], 'col2': y_preds.astype(int)}
df = pd.DataFrame(data=d)
df["col2"] = df["col2"].map(class_names)

filename = "Quant404_IITGuwahati_12"
df.to_csv(filename+'.csv', index=False, header=False)
# Keep a copy of the notebook next to the submission it produced.
shutil.copyfile("Untitled-Copy1.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09567+3.87843e-05	test-mlogloss:1.09586+3.23557e-05
[50]	train-mlogloss:0.988552+0.00141342	test-mlogloss:0.996762+0.00177267


In [23]:
from sklearn.model_selection import train_test_split
# make_scorer is part of the public sklearn.metrics API; the private
# sklearn.metrics.scorer module path is not available in newer releases.
from sklearn.metrics import make_scorer


def my_custom_accuracy(y_true, y_pred):
    """Cost-weighted score for 3-class predictions (higher is better).

    Every correct prediction adds 1000 to the score. Every misclassification
    subtracts a penalty that depends on the (true, predicted) pair of class
    codes; pairs that are not listed in the penalty table cost nothing.

    Parameters
    ----------
    y_true : sequence of class codes (list or 1-D array)
        Ground-truth labels.
    y_pred : sequence of class codes (list or 1-D array)
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        The accumulated score over all (true, predicted) pairs; 0.0 for
        empty input.
    """
    reward_correct = 1000
    # (true, predicted) -> penalty
    penalties = {
        (0, 1): 100,
        (0, 2): 50,
        (1, 0): 200,
        (1, 2): 100,
        (2, 0): 100,
        (2, 1): 50,
    }

    total = 0
    for t, p in zip(y_true, y_pred):
        if t == p:
            total += reward_correct
        else:
            total -= penalties.get((t, p), 0)

    # Return the weighted total. Previously it was computed and then thrown
    # away in favour of the raw count of exact matches.
    return float(total)

# Wrap the custom metric as a scikit-learn scorer.
# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.
my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)

# Hold out 4000 rows for evaluation and keep 30000 for fitting. The split is
# seeded (same seed as the decomposition cell) so it is reproducible after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=30000, test_size=4000, random_state=420
)

# NOTE(review): the TPOT search is disabled here and TPOTClassifier is not
# imported in the visible cells -- restore both before calling tpot.fit.
#tpot = TPOTClassifier(generations=10, population_size=20, verbosity=3, scoring=my_custom_scorer, cv=3)

In [None]:
# Run the TPOT pipeline search on the training split.
# NOTE(review): `tpot` is not assigned in any visible cell -- the only
# TPOTClassifier construction is commented out in the preceding cell -- so
# this raises NameError on a fresh kernel unless it is defined elsewhere.
# TODO confirm.
tpot.fit(X_train, y_train)

30 operators have been imported by TPOT.


A Jupyter Widget

Skipped pipeline #14 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.
