In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
leaderboard = pd.read_csv("leaderboard_dataset.csv")
test = pd.read_csv("test.csv")

In [3]:
y = train["VAR21"]
X = train.drop(["VAR21"], axis=1)

In [4]:
from sklearn.preprocessing import LabelEncoder

for c in X.columns:
    if X[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(X[c].values) + list(test[c].values) + list(leaderboard[c].values)) 
        X[c] = lbl.transform(list(X[c].values))
        test[c] = lbl.transform(list(test[c].values))
        leaderboard[c] = lbl.transform(list(leaderboard[c].values))

In [5]:
lbl = LabelEncoder() 
lbl.fit(list(y.values)) 
y = lbl.transform(list(y))

In [6]:
y

array([1, 0, 2, ..., 1, 1, 0])

In [8]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 5,
    'subsample': 0.92,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(leaderboard)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1200, # increase to have better results (~700)
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)

d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_24"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Quant404_IITGuwahati_12.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09568+3.58267e-05	test-mlogloss:1.09582+2.29541e-05
[50]	train-mlogloss:0.989505+0.0014094	test-mlogloss:0.996191+0.00125144
[100]	train-mlogloss:0.9306+0.00187451	test-mlogloss:0.943169+0.0023004
[150]	train-mlogloss:0.895134+0.00215201	test-mlogloss:0.913224+0.00322606
[200]	train-mlogloss:0.87213+0.00231356	test-mlogloss:0.895548+0.00407053
[250]	train-mlogloss:0.856005+0.00246	test-mlogloss:0.884611+0.00459239
[300]	train-mlogloss:0.844014+0.00258351	test-mlogloss:0.87768+0.00508668
[350]	train-mlogloss:0.834707+0.00248425	test-mlogloss:0.873327+0.00539159
[400]	train-mlogloss:0.826983+0.00240359	test-mlogloss:0.870246+0.00556645
[450]	train-mlogloss:0.820524+0.00233567	test-mlogloss:0.86826+0.00570957
[500]	train-mlogloss:0.814665+0.00221058	test-mlogloss:0.86689+0.00582958
[550]	train-mlogloss:0.809282+0.00225323	test-mlogloss:0.865891+0.00584704
[600]	train-mlogloss:0.804306+0.00220314	test-mlogloss:0.86511+0.00592121
[650]	train-mlogloss:0.799675+0.00230723

'Quant404_IITGuwahati_24.ipynb'

In [11]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 700, 
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.90,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'silent': 1,
    'num_class' :3
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(leaderboard)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1200, # increase to have better results (~700)
                   verbose_eval=50,
                   early_stopping_rounds=50
                  )

num_boost_rounds = len(cv_result)
print('num_boost_rounds=' + str(num_boost_rounds))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


# check f2-score (to get higher score - increase num_boost_round in previous cell)

# make predictions and save results
y_preds = model.predict(dtest)

d = {'col1': leaderboard["VAR1"], 'col2': [int(i) for i in y_preds]}
df = pd.DataFrame(data=d)
df["col2"][df["col2"] == 0] = "High"
df["col2"][df["col2"] == 1] = "Low"
df["col2"][df["col2"] == 2] = "Medium"

import shutil 

filename = "Quant404_IITGuwahati_25"
df.to_csv(filename+'.csv', index=False, header=False)
shutil.copyfile("Quant404_IITGuwahati_23.ipynb", filename+".ipynb")

[0]	train-mlogloss:1.09549+4.03072e-05	test-mlogloss:1.09575+1.84451e-05
[50]	train-mlogloss:0.98153+0.00135223	test-mlogloss:0.993748+0.0012607
[100]	train-mlogloss:0.917237+0.0018322	test-mlogloss:0.939922+0.00238902
[150]	train-mlogloss:0.877228+0.00194761	test-mlogloss:0.909662+0.00337444
[200]	train-mlogloss:0.850482+0.00215229	test-mlogloss:0.892066+0.00420626
[250]	train-mlogloss:0.831015+0.00234337	test-mlogloss:0.881289+0.0047568
[300]	train-mlogloss:0.816068+0.00247658	test-mlogloss:0.87457+0.00522031
[350]	train-mlogloss:0.804266+0.00223424	test-mlogloss:0.870389+0.00549063
[400]	train-mlogloss:0.79421+0.00229057	test-mlogloss:0.86764+0.00568867
[450]	train-mlogloss:0.785585+0.00222746	test-mlogloss:0.865839+0.00586117
[500]	train-mlogloss:0.777664+0.00221564	test-mlogloss:0.864716+0.00596999
[550]	train-mlogloss:0.770397+0.00247084	test-mlogloss:0.863956+0.0060063
[600]	train-mlogloss:0.763576+0.0026513	test-mlogloss:0.863403+0.00610004
[650]	train-mlogloss:0.757198+0.00279

'Quant404_IITGuwahati_25.ipynb'

In [None]:
tpot.fit(X_train, y_train)

30 operators have been imported by TPOT.


A Jupyter Widget

Skipped pipeline #14 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.


In [33]:
from xgboost.sklearn import XGBClassifier

def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=y)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [34]:
predictors = [x for x in X.columns]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27,
 num_class =3)
modelfit(xgb1, X, predictors)

KeyError: 'Disbursed'

In [None]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_