In [2]:
# General
import os
import pickle
from os import path
from random import randrange

import joblib
import lazypredict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from lazypredict.Supervised import LazyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV #cross validation
from sklearn.preprocessing import LabelEncoder

## TRAIN SET

In [3]:
# Load the training set (features plus the `target` column) from CSV and
# preview its first three rows.
trainDataFull = pd.read_csv("trainData.csv")
trainDataFull.head(3)

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v91,v92,v93,v94,v95,v96,v97,v98,v99,target
0,1.4,0.0,0.2,1.0,4.2,0.4,0.0,0.0,0.0,1.2,...,0.6,0.2,0.0,3.2,1.0,0.2,0.0,1.6,0.4,9
1,0.0,0.0,0.0,2.8,0.0,0.8,0.0,0.2,1.2,1.4,...,0.0,0.0,1.2,0.0,1.2,0.2,0.2,2.6,2.2,6
2,0.0,0.0,0.0,0.4,0.0,0.6,0.8,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.8,0.2,0.8,1.4,0.0,3


In [4]:
# Print the index range, column count, dtypes and memory usage of the raw frame.
trainDataFull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61878 entries, 0 to 61877
Columns: 104 entries, v1 to target
dtypes: float64(103), int64(1)
memory usage: 49.1 MB


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
trainDataFull.describe()

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v91,v92,v93,v94,v95,v96,v97,v98,v99,target
count,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,...,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0
mean,0.08,0.03,0.19,0.56,0.16,0.51,0.34,0.06,0.13,0.53,...,0.05,0.32,0.04,0.29,0.62,0.3,0.43,0.84,0.28,4.84
std,0.52,0.33,0.74,1.0,0.48,0.79,0.63,0.39,0.64,0.93,...,0.15,0.52,0.21,0.74,0.93,0.92,0.8,1.2,0.69,2.51
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,2.0
50%,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.2,...,0.0,0.2,0.0,0.0,0.2,0.0,0.2,0.4,0.0,5.0
75%,0.0,0.0,0.2,0.6,0.2,0.6,0.4,0.0,0.0,0.6,...,0.0,0.4,0.0,0.2,0.8,0.2,0.4,1.0,0.2,7.0
max,19.4,17.4,17.0,15.2,13.4,15.2,6.6,12.4,21.0,52.6,...,4.4,9.8,9.6,20.0,16.6,15.2,17.4,53.2,18.2,9.0


In [6]:
# Feature matrix: every column except the label.
# Dropping `target` by name (rather than label-slicing 'v1':'v99') keeps all
# feature columns no matter how the CSV happens to order them, and avoids
# suggesting that only 99 of the 103 features are selected.
trainData = trainDataFull.drop(columns='target')
trainData.head(3)

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99
0,1.4,0.0,0.2,1.0,4.2,0.4,0.0,0.0,0.0,1.2,...,0.2,0.6,0.2,0.0,3.2,1.0,0.2,0.0,1.6,0.4
1,0.0,0.0,0.0,2.8,0.0,0.8,0.0,0.2,1.2,1.4,...,0.0,0.0,0.0,1.2,0.0,1.2,0.2,0.2,2.6,2.2
2,0.0,0.0,0.0,0.4,0.0,0.6,0.8,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.8,0.2,0.8,1.4,0.0


In [7]:
# Pull the label column out as a Series and list the distinct class ids it holds.
trainLabels = trainDataFull['target']
trainLabels.unique()

array([9, 6, 3, 4, 2, 8, 7, 1, 5])

In [8]:
# Re-index the class ids onto a contiguous, zero-based range.
# The raw targets are already integers (1-9, see the unique() output above);
# the encoder maps them to 0-8. The fitted encoder stays bound to
# `label_encoder` so its inverse_transform can map encoded ids back.
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(trainLabels)
label_encoded_y

array([8, 5, 2, ..., 7, 4, 2])

In [9]:
# Hold out 30% of the rows, shuffled with a fixed seed and stratified on the
# encoded label so both parts keep the same class proportions.
split_options = dict(test_size=0.3,
                     random_state=33,
                     shuffle=True,
                     stratify=label_encoded_y)
X_train, X_test, y_train, y_test = train_test_split(trainData.values,
                                                    label_encoded_y,
                                                    **split_options)

In [10]:
# LazyPredict harness used to benchmark many classifiers in one call:
# verbose output on, library warnings suppressed, no extra custom metric.
clf = LazyClassifier(
    verbose=1,
    ignore_warnings=True,
    custom_metric=None,
)

In [None]:
# Fit the LazyPredict candidates on the training split and score them on the
# held-out split; `models` is shown as the cell output.
# NOTE(review): `predictions` is reassigned by later cells in this notebook.
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

  3%|▎         | 1/30 [00:04<02:24,  5.00s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.6926308985132514, 'Balanced Accuracy': 0.5803213894062684, 'ROC AUC': None, 'F1 Score': 0.6667560861746957, 'Time taken': 4.998732566833496}


  7%|▋         | 2/30 [00:12<02:40,  5.74s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.7720857573798751, 'Balanced Accuracy': 0.6970010545877124, 'ROC AUC': None, 'F1 Score': 0.7663470645464825, 'Time taken': 7.457876443862915}


 10%|█         | 3/30 [00:12<01:50,  4.11s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.6018099547511312, 'Balanced Accuracy': 0.5496323173329816, 'ROC AUC': None, 'F1 Score': 0.6043789686431189, 'Time taken': 0.3032815456390381}


 13%|█▎        | 4/30 [04:49<37:15, 86.00s/it]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.7518853695324283, 'Balanced Accuracy': 0.6427035541158805, 'ROC AUC': None, 'F1 Score': 0.7296418287614819, 'Time taken': 277.0766348838806}


 20%|██        | 6/30 [04:50<16:54, 42.27s/it]

{'Model': 'CheckingClassifier', 'Accuracy': 0.03118939883645766, 'Balanced Accuracy': 0.1111111111111111, 'ROC AUC': None, 'F1 Score': 0.0018867117929592006, 'Time taken': 0.16010332107543945}


 27%|██▋       | 8/30 [04:51<07:42, 21.01s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.7039969834087482, 'Balanced Accuracy': 0.6448096590046344, 'ROC AUC': None, 'F1 Score': 0.7052976657835813, 'Time taken': 1.207240343093872}
{'Model': 'DummyClassifier', 'Accuracy': 0.170706744236156, 'Balanced Accuracy': 0.11465220639916504, 'ROC AUC': None, 'F1 Score': 0.17077003402720356, 'Time taken': 0.15891361236572266}


 30%|███       | 9/30 [04:51<05:10, 14.78s/it]

{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.6428571428571429, 'Balanced Accuracy': 0.5679207025244943, 'ROC AUC': None, 'F1 Score': 0.6436871472577306, 'Time taken': 0.24262738227844238}


 33%|███▎      | 10/30 [05:00<04:18, 12.95s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.8083925878043525, 'Balanced Accuracy': 0.7230225063019105, 'ROC AUC': None, 'F1 Score': 0.7979380140484333, 'Time taken': 8.658900022506714}


 37%|███▋      | 11/30 [05:00<02:53,  9.13s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.6291208791208791, 'Balanced Accuracy': 0.5936866408208042, 'ROC AUC': None, 'F1 Score': 0.6351752920698441, 'Time taken': 0.23978519439697266}


 40%|████      | 12/30 [07:00<12:39, 42.19s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.771870286576169, 'Balanced Accuracy': 0.6997055760852897, 'ROC AUC': None, 'F1 Score': 0.7656932555446602, 'Time taken': 119.33744549751282}


## MODEL-1 (XGBOOST)

In [None]:
# Hyper-parameters for the first XGBoost model, collected in one mapping and
# unpacked into the constructor.
xgb_params = {
    'n_estimators': 10000,     # upper bound; the fit cell relies on early stopping
    'learning_rate': 0.01,     # original note: "Default 0.05"
    'reg_lambda': 10,
    'max_depth': 8,
    'gamma': 0.25,
    # 'subsample': 1,          # left disabled, as in the original cell
    # 'colsample_bytree': 1,   # left disabled, as in the original cell
    'n_jobs': 2,
    'seed': 33,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

print(xgb_model)

In [None]:
# Train while logging multi-class log-loss on both the training and held-out
# splits; early stopping is set to 100 rounds.
eval_pairs = [(X_train, y_train), (X_test, y_test)]
xgb_model.fit(X_train,
              y_train,
              eval_set=eval_pairs,
              eval_metric='mlogloss',
              early_stopping_rounds=100,
              verbose=True)

In [None]:
# make predictions for test data
y_pred = xgb_model.predict(X_test)
y_pred

In [None]:
# XGBClassifier.predict already returns discrete class labels for this
# multi-class model, so the per-element round() pass was a no-op left over
# from a probability-thresholding workflow. Use the labels directly.
predictions = y_pred

In [None]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Default 82.15

In [None]:
# Confusion matrix of the fitted model on the held-out split, with cell values
# formatted as integers ('d').
#fig = plt.figure(figsize=(10,10))
plot_confusion_matrix(estimator=xgb_model, X=X_test, y_true=y_test, values_format='d')

## Save Validation Scores

In [None]:
# Class-probability matrix for the held-out split; display the first row as a
# quick sanity check.
y_score = xgb_model.predict_proba(X_test)
y_score[0]

In [None]:
# Wrap the held-out probabilities in a DataFrame with one named column per class.
class_columns = ['c1','c2','c3','c4','c5','c6','c7','c8','c9']
valid_score = pd.DataFrame(data=y_score, columns=class_columns)
valid_score

In [None]:
# Create the output folder if it is missing (e.g. on a fresh checkout) so the
# write below cannot fail on a non-existent directory, then save the held-out
# probabilities without the index column.
os.makedirs('./results', exist_ok=True)
valid_score.to_csv('./results/valid-submission-xgboost.csv', index = False)

## Save & Load Model

In [None]:
# Create the model folder if it is missing (e.g. on a fresh checkout) before
# persisting the trained booster in XGBoost's own format.
os.makedirs('./model', exist_ok=True)
xgb_model.save_model('./model/xgboost-1')

In [None]:
# Round-trip check: restore the saved booster into a fresh classifier and
# re-score it on the held-out split.
# Note that this rebinds `xgb_model` to the reloaded instance.
xgb_model = xgb.XGBClassifier()
xgb_model.load_model('./model/xgboost-1')

predictions = xgb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

## Save & Load with joblib

#### xgboost model

In [None]:
# joblib round-trip: dump the classifier to a pickle file, read it back, and
# re-score the restored copy on the held-out split.
model_pickle = './model/model_xgboost.pkl'
joblib.dump(xgb_model, model_pickle)
xgboost_from_joblib = joblib.load(model_pickle)

# Predict with the restored model and report its accuracy.
xgboost_predictions = xgboost_from_joblib.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=xgboost_predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

## GridSearchCV 

In [None]:
# Small grid search over tree depth and number of boosting rounds with 2-fold
# cross-validation. Note that `clf` is rebound here (it previously held the
# LazyClassifier).
param_grid = {'max_depth': [4, 6],
              'n_estimators': [100, 200]}

# Extra keyword arguments handed to fit() alongside the training data.
fit_params = dict(early_stopping_rounds=10,
                  eval_metric='mlogloss',
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  verbose=True)

clf = GridSearchCV(xgb_model, param_grid, verbose=1, cv=2)
clf.fit(X_train, y_train, **fit_params)

# Best mean cross-validated score and the parameter combination that achieved it.
print(clf.best_score_)
print(clf.best_params_)

In [None]:
# joblib round-trip for the grid search winner: dump the best estimator, read
# it back, and re-score the restored copy on the held-out split.
best_model_pickle = './model/clf.pkl'
joblib.dump(clf.best_estimator_, best_model_pickle)
clf_from_joblib = joblib.load(best_model_pickle)

# Predict with the restored estimator and report its accuracy.
clf_predictions = clf_from_joblib.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=clf_predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

# TEST

In [None]:
# Load the test set from CSV and display it.
# NOTE(review): the later cells pass testData.values straight to the model, so
# this file is assumed to hold the same feature columns, in the same order, as
# the training features — confirm.
testData = pd.read_csv("testData.csv")
testData

In [None]:
# Use the loaded model to make predictions 
test_predictions = xgb_model.predict(testData.values)
test_predictions

In [None]:
# Class probabilities for each test row. This rebinds `test_predictions`,
# replacing the value assigned in the previous cell.
test_predictions = xgb_model.predict_proba(testData.values)
test_predictions

In [None]:
# Wrap the test-set probabilities in a DataFrame with one named column per class.
class_columns = ['c1','c2','c3','c4','c5','c6','c7','c8','c9']
result = pd.DataFrame(data=test_predictions, columns=class_columns)
result

In [None]:
# Write the test-set probabilities to the submission CSV, omitting the index column.
result.to_csv('./results/submission-xgboost.csv', index = False)

## REFERENCES

1- https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

2- https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py

3- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

4- https://www.datacamp.com/community/tutorials/xgboost-in-python

5- https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier