In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
leaderboard = pd.read_csv("leaderboard_dataset.csv")
test = pd.read_csv("test.csv")

In [3]:
y = train["VAR21"]
X = train.drop(["VAR21"], axis=1)

In [4]:
from sklearn.preprocessing import LabelEncoder

for c in X.columns:
    if X[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(X[c].values) + list(test[c].values) + list(leaderboard[c].values)) 
        X[c] = lbl.transform(list(X[c].values))
        test[c] = lbl.transform(list(test[c].values))
        leaderboard[c] = lbl.transform(list(leaderboard[c].values))

In [5]:
lbl = LabelEncoder() 
lbl.fit(list(y.values)) 
y = lbl.transform(list(y))

In [6]:
X["Target"] = y

In [7]:
# High
X[X["Target"] == 0]=
# Low
X[X["Target"] == 1]
# Medium
X[X["Target"] == 2]

Unnamed: 0,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,...,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19,VAR20,Target
1,2,911.764706,0.027778,,611.574748,8.181818,1.344479,198.600020,22.086661,,...,15.012510,,2,,1.614613,,0,1,146.654045,0
4,5,914.117647,0.083333,,626.514988,5.181818,1.372928,198.790477,85.938202,58.632548,...,210.175146,,0,,1.558341,,1,0,101.268503,0
13,14,882.352941,0.833333,178.571429,611.574748,6.636364,0.655025,197.075291,22.086661,,...,,,2,,0.984960,,0,1,342.450947,0
15,16,808.235294,8.333333,678.571429,611.574748,,0.585166,196.987351,22.206442,,...,300.250208,4.000000,2,4.000000,0.966037,1000.00,0,1,57.633391,0
30,31,845.882353,1.944444,107.142857,690.263984,11.363636,1.166687,197.809000,3.306796,53.158788,...,,,10,,1.552790,,0,1,133.807229,0
33,34,883.529412,0.138889,642.857143,733.297270,4.545455,3.816744,212.605150,,48.981559,...,210.175146,16.613333,10,16.613333,2.779121,1000.00,0,1,138.414802,0
34,35,940.000000,0.138889,35.714286,620.455910,39.090909,0.616254,197.023002,10.887188,58.632548,...,,20.000000,10,20.000000,0.970746,,0,1,103.497418,0
36,37,887.058823,0.111111,285.714286,662.438792,4.545455,0.769814,197.408885,,58.632548,...,,,2,,1.145384,,1,0,120.220310,0
44,45,880.000000,0.111111,678.571429,679.870240,22.727273,6.011623,203.296665,,58.632548,...,250.208507,9.066667,15,9.066667,4.754492,617.21,1,0,386.598967,0
57,58,994.117647,1.216667,35.714286,629.475376,9.090909,1.284523,198.479924,243.907239,58.632548,...,200.166806,12.666667,2,12.666667,1.491383,,1,0,96.041308,0


In [8]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 5,
    'subsample': 0.92,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(leaderboard)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1200, # increase to have better results (~700)
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)

d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_24"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Quant404_IITGuwahati_12.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09568+3.58267e-05	test-mlogloss:1.09582+2.29541e-05
[50]	train-mlogloss:0.989505+0.0014094	test-mlogloss:0.996191+0.00125144
[100]	train-mlogloss:0.9306+0.00187451	test-mlogloss:0.943169+0.0023004
[150]	train-mlogloss:0.895134+0.00215201	test-mlogloss:0.913224+0.00322606
[200]	train-mlogloss:0.87213+0.00231356	test-mlogloss:0.895548+0.00407053
[250]	train-mlogloss:0.856005+0.00246	test-mlogloss:0.884611+0.00459239
[300]	train-mlogloss:0.844014+0.00258351	test-mlogloss:0.87768+0.00508668
[350]	train-mlogloss:0.834707+0.00248425	test-mlogloss:0.873327+0.00539159
[400]	train-mlogloss:0.826983+0.00240359	test-mlogloss:0.870246+0.00556645
[450]	train-mlogloss:0.820524+0.00233567	test-mlogloss:0.86826+0.00570957
[500]	train-mlogloss:0.814665+0.00221058	test-mlogloss:0.86689+0.00582958
[550]	train-mlogloss:0.809282+0.00225323	test-mlogloss:0.865891+0.00584704
[600]	train-mlogloss:0.804306+0.00220314	test-mlogloss:0.86511+0.00592121
[650]	train-mlogloss:0.799675+0.00230723

'Quant404_IITGuwahati_24.ipynb'

In [None]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.90,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(leaderboard)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1200, # increase to have better results (~700)
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)

d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_25"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Quant404_IITGuwahati_23.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09549+4.03072e-05	test-mlogloss:1.09575+1.84451e-05
[50]	train-mlogloss:0.98153+0.00135223	test-mlogloss:0.993748+0.0012607
[100]	train-mlogloss:0.917237+0.0018322	test-mlogloss:0.939922+0.00238902
[150]	train-mlogloss:0.877228+0.00194761	test-mlogloss:0.909662+0.00337444
[200]	train-mlogloss:0.850482+0.00215229	test-mlogloss:0.892066+0.00420626
[250]	train-mlogloss:0.831015+0.00234337	test-mlogloss:0.881289+0.0047568
[300]	train-mlogloss:0.816068+0.00247658	test-mlogloss:0.87457+0.00522031
[350]	train-mlogloss:0.804266+0.00223424	test-mlogloss:0.870389+0.00549063
[400]	train-mlogloss:0.79421+0.00229057	test-mlogloss:0.86764+0.00568867
[450]	train-mlogloss:0.785585+0.00222746	test-mlogloss:0.865839+0.00586117
[500]	train-mlogloss:0.777664+0.00221564	test-mlogloss:0.864716+0.00596999
[550]	train-mlogloss:0.770397+0.00247084	test-mlogloss:0.863956+0.0060063
[600]	train-mlogloss:0.763576+0.0026513	test-mlogloss:0.863403+0.00610004
[650]	train-mlogloss:0.757198+0.00279

In [None]:
tpot.fit(X_train, y_train)

30 operators have been imported by TPOT.


A Jupyter Widget

Skipped pipeline #14 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.


In [None]:
from sklearn.ensemble import GradientBoostingClassifier