In [1]:
# General
from os import path
from random import randrange

from sklearn.model_selection import train_test_split, GridSearchCV #cross validation
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, make_scorer
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score

from sklearn.preprocessing import LabelEncoder, StandardScaler

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

import pickle
import joblib 


## TRAIN SET

In [2]:
trainDataFull = pd.read_csv("trainData.csv")
trainDataFull.head(3)

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v91,v92,v93,v94,v95,v96,v97,v98,v99,target
0,1.4,0.0,0.2,1.0,4.2,0.4,0.0,0.0,0.0,1.2,...,0.6,0.2,0.0,3.2,1.0,0.2,0.0,1.6,0.4,9
1,0.0,0.0,0.0,2.8,0.0,0.8,0.0,0.2,1.2,1.4,...,0.0,0.0,1.2,0.0,1.2,0.2,0.2,2.6,2.2,6
2,0.0,0.0,0.0,0.4,0.0,0.6,0.8,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.8,0.2,0.8,1.4,0.0,3


In [3]:
trainDataFull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61878 entries, 0 to 61877
Columns: 104 entries, v1 to target
dtypes: float64(103), int64(1)
memory usage: 49.1 MB


In [4]:
trainDataFull.describe()

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v91,v92,v93,v94,v95,v96,v97,v98,v99,target
count,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,...,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0,61878.0
mean,0.077721,0.030062,0.194955,0.561411,0.157148,0.510892,0.339339,0.060755,0.133253,0.528776,...,0.052726,0.316979,0.042697,0.290546,0.617066,0.304186,0.430156,0.837012,0.279162,4.842335
std,0.515539,0.328176,0.737317,1.003366,0.479018,0.786976,0.632642,0.385161,0.639593,0.925803,...,0.151387,0.515414,0.208958,0.742261,0.928514,0.923758,0.804039,1.204724,0.692345,2.510794
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,2.0
50%,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.2,...,0.0,0.2,0.0,0.0,0.2,0.0,0.2,0.4,0.0,5.0
75%,0.0,0.0,0.2,0.6,0.2,0.6,0.4,0.0,0.0,0.6,...,0.0,0.4,0.0,0.2,0.8,0.2,0.4,1.0,0.2,7.0
max,19.4,17.4,17.0,15.2,13.4,15.2,6.6,12.4,21.0,52.6,...,4.4,9.8,9.6,20.0,16.6,15.2,17.4,53.2,18.2,9.0


In [5]:
trainData = trainDataFull.loc[:,'v1':'v99']
trainData.head(3)

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99
0,1.4,0.0,0.2,1.0,4.2,0.4,0.0,0.0,0.0,1.2,...,0.2,0.6,0.2,0.0,3.2,1.0,0.2,0.0,1.6,0.4
1,0.0,0.0,0.0,2.8,0.0,0.8,0.0,0.2,1.2,1.4,...,0.0,0.0,0.0,1.2,0.0,1.2,0.2,0.2,2.6,2.2
2,0.0,0.0,0.0,0.4,0.0,0.6,0.8,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.8,0.2,0.8,1.4,0.0


In [6]:
trainLabels = trainDataFull.loc[:,'target']
trainLabels.unique()

array([9, 6, 3, 4, 2, 8, 7, 1, 5])

In [7]:
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(trainLabels)
label_encoded_y = label_encoder.transform(trainLabels)
label_encoded_y

array([8, 5, 2, ..., 7, 4, 2])

## Normalize

In [8]:
scaler = StandardScaler()

In [9]:
scaler.fit(trainData.values)

StandardScaler()

In [10]:
scaler.mean_

array([0.07772068, 0.03006238, 0.19495459, 0.56141116, 0.15714794,
       0.5108924 , 0.3393387 , 0.06075503, 0.13325253, 0.52877598,
       0.17498303, 0.04794919, 0.20598597, 0.02522706, 0.0281748 ,
       0.0732215 , 0.06966289, 0.09711691, 0.05261321, 0.09430492,
       0.41449303, 0.04993374, 0.07870972, 0.02850448, 0.13248651,
       0.04779405, 0.13960374, 0.33061185, 0.05278128, 0.01065969,
       0.04444552, 0.05515369, 0.13932254, 0.14599373, 0.06182165,
       0.08841268, 0.06069362, 0.17855781, 0.09934387, 0.07602379,
       0.09619574, 0.22571512, 0.30690391, 0.03759656, 0.06008921,
       0.1717444 , 0.00513914, 0.25057371, 0.15581628, 0.09155435,
       0.13961343, 0.14969133, 0.18928214, 0.13959404, 0.03371796,
       0.20225928, 0.02973593, 0.01569217, 0.11821003, 0.28269175,
       0.0248392 , 0.11508452, 0.25135913, 0.07858043, 0.01415043,
       0.20875917, 0.25687967, 0.06485665, 0.11262161, 0.09022916,
       0.16222567, 0.11216587, 0.18029348, 0.16248424, 0.07328

In [11]:
normalized_standart = scaler.transform(trainData.values)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(normalized_standart, 
                                                    label_encoded_y, 
                                                    test_size = 0.05, 
                                                    random_state = 33,
                                                    shuffle = True,
                                                    stratify = label_encoded_y)

In [28]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

## MODEL-1 (XGBOOST)

In [13]:
# xgb_model = xgb.XGBClassifier(n_estimators=10000,
#                           learning_rate=0.05, #Default 0.05
#                           reg_lambda=10,
#                           max_depth=32, #8
#                           gamma=0.25,
# #                           subsample=1,
# #                           colsample_bytree=1,
#                           n_jobs=2,
#                           seed=33)

# print(xgb_model)

In [14]:
xgb_model = xgb.XGBClassifier(objective='multi:softmax',
                              n_estimators=10000,
                              learning_rate=0.03,
                              max_depth=16,   
                              reg_lambda=10,
                              reg_alpha=0,
                              gamma=0.25,
                              n_jobs=12,
                              random_state=5,
                              subsample=0.8,
                              colsample_bytree=0.9,
                              colsample_bynode=0.8,
                              eta = 1,
                              min_child_weight=1,
                              booster= 'gbtree',
                              tree_method='hist',
                              num_parallel_tree= 10)

print(xgb_model)

XGBClassifier(base_score=None, booster='gbtree', colsample_bylevel=None,
              colsample_bynode=0.8, colsample_bytree=0.9, eta=1, gamma=0.25,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.03, max_delta_step=None, max_depth=16,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=10000, n_jobs=12, num_parallel_tree=10,
              objective='multi:softmax', random_state=5, reg_alpha=0,
              reg_lambda=10, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', validate_parameters=None, verbosity=None)


In [15]:
xgb_model.fit(X_train, 
              y_train, 
              early_stopping_rounds=30,
              eval_metric='mlogloss',
              eval_set=[(X_train, y_train), (X_test, y_test)], 
              verbose=True)

[0]	validation_0-mlogloss:2.09461	validation_1-mlogloss:2.09965
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 30 rounds.
[1]	validation_0-mlogloss:2.00170	validation_1-mlogloss:2.01181
[2]	validation_0-mlogloss:1.90915	validation_1-mlogloss:1.92450
[3]	validation_0-mlogloss:1.81629	validation_1-mlogloss:1.83630
[4]	validation_0-mlogloss:1.71941	validation_1-mlogloss:1.74441
[5]	validation_0-mlogloss:1.64198	validation_1-mlogloss:1.67207
[6]	validation_0-mlogloss:1.57453	validation_1-mlogloss:1.61005
[7]	validation_0-mlogloss:1.51697	validation_1-mlogloss:1.55753
[8]	validation_0-mlogloss:1.45534	validation_1-mlogloss:1.50070
[9]	validation_0-mlogloss:1.39776	validation_1-mlogloss:1.44779
[10]	validation_0-mlogloss:1.34486	validation_1-mlogloss:1.39985
[11]	validation_0-mlogloss:1.29796	validation_1-mlogloss:1.35767
[12]	validation_0-mlogloss:1.25144	validation_1-mlogloss:1.3155

[124]	validation_0-mlogloss:0.11199	validation_1-mlogloss:0.44853
[125]	validation_0-mlogloss:0.11090	validation_1-mlogloss:0.44837
[126]	validation_0-mlogloss:0.11003	validation_1-mlogloss:0.44823
[127]	validation_0-mlogloss:0.10914	validation_1-mlogloss:0.44809
[128]	validation_0-mlogloss:0.10817	validation_1-mlogloss:0.44785
[129]	validation_0-mlogloss:0.10727	validation_1-mlogloss:0.44773
[130]	validation_0-mlogloss:0.10631	validation_1-mlogloss:0.44757
[131]	validation_0-mlogloss:0.10548	validation_1-mlogloss:0.44748
[132]	validation_0-mlogloss:0.10456	validation_1-mlogloss:0.44746
[133]	validation_0-mlogloss:0.10368	validation_1-mlogloss:0.44720
[134]	validation_0-mlogloss:0.10281	validation_1-mlogloss:0.44705
[135]	validation_0-mlogloss:0.10193	validation_1-mlogloss:0.44697
[136]	validation_0-mlogloss:0.10110	validation_1-mlogloss:0.44681
[137]	validation_0-mlogloss:0.10032	validation_1-mlogloss:0.44675
[138]	validation_0-mlogloss:0.09960	validation_1-mlogloss:0.44667
[139]	vali

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.8, colsample_bytree=0.9, eta=1, gamma=0.25,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=16,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10000, n_jobs=12, num_parallel_tree=10,
              objective='multi:softprob', random_state=5, reg_alpha=0,
              reg_lambda=10, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', validate_parameters=1, verbosity=None)

In [16]:
# make predictions for test data
y_pred = xgb_model.predict(X_test)
y_pred

array([7, 7, 7, ..., 5, 2, 7])

In [17]:
# # A parameter grid for XGBoost
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }

In [18]:
predictions = [round(value) for value in y_pred]

In [19]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Default 83.48

Accuracy: 83.48%


In [None]:
#fig = plt.figure(figsize=(10,10))
plot_confusion_matrix(xgb_model,
                     X_test,
                     y_test,
                     values_format='d')

## Save Valid Score

In [20]:
y_score = xgb_model.predict_proba(X_test)
y_score[0]

array([1.01462225e-04, 1.14787923e-04, 2.21365670e-04, 4.21350305e-05,
       1.71775537e-05, 6.60826045e-04, 3.29457725e-05, 9.98791158e-01,
       1.82002841e-05], dtype=float32)

In [21]:
valid_score = pd.DataFrame(y_score, columns=['c1','c2','c3','c4','c5','c6','c7','c8','c9'])
valid_score

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.000101,0.000115,0.000221,0.000042,0.000017,0.000661,0.000033,0.998791,0.000018
1,0.000051,0.000007,0.000009,0.000002,0.000012,0.000577,0.000098,0.999205,0.000038
2,0.004228,0.000037,0.000045,0.000079,0.000183,0.002042,0.006516,0.986677,0.000193
3,0.001666,0.000080,0.000136,0.000307,0.000060,0.002291,0.000077,0.000434,0.994950
4,0.001465,0.000501,0.000218,0.002832,0.000322,0.967463,0.003802,0.021743,0.001654
...,...,...,...,...,...,...,...,...,...
3089,0.000947,0.734892,0.114818,0.042715,0.000565,0.002262,0.101421,0.002206,0.000174
3090,0.000137,0.088631,0.047648,0.858327,0.000906,0.000455,0.000914,0.000442,0.002541
3091,0.001477,0.000007,0.000009,0.000139,0.000034,0.996841,0.000672,0.000785,0.000036
3092,0.000865,0.080111,0.892771,0.022512,0.000755,0.001008,0.000440,0.000313,0.001224


In [22]:
valid_score.to_csv('./results/valid-submission-xgboost-2-optimal.csv', index = False)

## Save & Load Model

In [None]:
xgb_model.save_model('./model/xgboost-more')

In [None]:
xgb_model = xgb.XGBClassifier()
xgb_model.load_model('./model/xgboost-more')
predictions = xgb_model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## joblib

#### xgboost model

In [None]:
# Save the model as a pickle in a file 
joblib.dump(xgb_model, './model/xgboost-3-standart-norm-3.pkl') 
  
# Load the model from the file 
xgboost_from_joblib = joblib.load('./model/xgboost-3-standart-norm-3.pkl')  
  
# Use the loaded model to make predictions 
xgboost_predictions = xgboost_from_joblib.predict(X_test) 

# evaluate predictions
accuracy = accuracy_score(y_test, xgboost_predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## GridSearchCV 

In [None]:
clf = GridSearchCV(xgb_model,
                   {'max_depth': [4, 6],
                    'n_estimators': [100, 200]}, 
                    verbose=1,
                    cv=2)
clf.fit(X_train, 
        y_train, 
        early_stopping_rounds=10,
        eval_metric='mlogloss',
        eval_set=[(X_train, y_train), (X_test, y_test)], 
        verbose=True)
print(clf.best_score_)
print(clf.best_params_)

In [None]:
# Save the model as a pickle in a file 
joblib.dump(clf.best_estimator_, './model/clf.pkl')

# Load the model from the file 
clf_from_joblib = joblib.load('./model/clf.pkl')  

# Use the loaded model to make predictions 
clf_predictions = clf_from_joblib.predict(X_test) 

# evaluate predictions
accuracy = accuracy_score(y_test, clf_predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# TEST

In [23]:
testData = pd.read_csv("testData.csv")
testData

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99
0,0.0,0.0,0.0,0.0,0.2,0.2,0.6,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.6,5.0,0.0
1,0.0,0.0,0.0,5.2,0.0,0.0,0.0,0.0,1.6,0.8,...,0.0,0.0,0.8,0.0,0.0,3.6,0.4,0.0,2.4,0.0
2,0.0,0.0,0.8,2.2,0.0,0.4,0.0,0.0,0.2,0.0,...,0.0,0.0,0.4,0.0,0.2,1.6,0.6,0.0,0.6,1.6
3,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4,0.0,0.6,0.2,0.0,0.0,0.4,0.0,0.0,0.0,0.8,...,0.0,0.0,0.6,0.0,0.0,0.2,0.2,0.6,0.8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144363,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.4,...,0.0,0.0,0.0,0.0,1.2,0.0,0.4,0.0,0.6,0.0
144364,0.0,0.0,0.0,0.0,0.4,0.0,0.6,0.0,0.0,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,2.6,0.0
144365,0.0,0.0,0.0,0.4,0.2,1.4,1.4,0.0,0.2,0.2,...,0.0,0.6,0.2,0.0,0.2,1.2,0.0,1.4,0.6,0.2
144366,0.0,0.0,0.0,0.2,0.0,0.8,1.2,0.0,0.0,0.4,...,0.0,0.0,0.4,0.0,0.0,0.2,0.2,1.2,1.4,0.0


### Normalize

In [24]:
test_normalized_standart = scaler.transform(testData.values)
test_normalized_standart

array([[-0.15075752, -0.09160521, -0.2644128 , ...,  0.21123996,
         3.45558241, -0.40321577],
       [-0.15075752, -0.09160521, -0.2644128 , ..., -0.53499811,
         1.2973938 , -0.40321577],
       [-0.15075752, -0.09160521,  0.82061035, ..., -0.53499811,
        -0.19673677,  1.90778902],
       ...,
       [-0.15075752, -0.09160521, -0.2644128 , ...,  1.20622404,
        -0.19673677, -0.11434017],
       [-0.15075752, -0.09160521, -0.2644128 , ...,  0.95747802,
         0.46732126, -0.40321577],
       [-0.15075752, -0.09160521,  0.00684299, ..., -0.28625209,
         0.96536479, -0.40321577]])

In [None]:
# Use the loaded model to make predictions 
# test_predictions = xgb_model.predict(test_normalized_standart)
# test_predictions

In [25]:
# Use the loaded model to make predictions probability
test_predictions = xgb_model.predict_proba(test_normalized_standart)
test_predictions

array([[4.46073158e-04, 7.86947235e-02, 1.08564414e-01, ...,
        3.25177121e-03, 2.57292559e-04, 5.98593178e-05],
       [4.29020030e-03, 2.53191981e-02, 1.98254664e-03, ...,
        3.49035906e-03, 8.23050261e-01, 1.97169185e-03],
       [5.81149907e-05, 1.40903694e-05, 4.09782406e-05, ...,
        6.31143339e-05, 7.09893764e-04, 3.10091782e-05],
       ...,
       [5.65026246e-04, 5.92487335e-01, 3.23780417e-01, ...,
        3.98256723e-03, 3.85089981e-04, 2.96472572e-04],
       [3.69621732e-04, 4.29249287e-01, 1.90122854e-02, ...,
        4.13576141e-04, 1.64441852e-04, 3.70325834e-05],
       [6.09783456e-04, 5.01633704e-01, 4.56383735e-01, ...,
        2.52902359e-02, 6.14306773e-04, 2.17926732e-04]], dtype=float32)

In [26]:
result = pd.DataFrame(test_predictions, columns=['c1','c2','c3','c4','c5','c6','c7','c8','c9'])
result

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.000446,0.078695,0.108564,0.808224,0.000431,0.000071,0.003252,0.000257,0.000060
1,0.004290,0.025319,0.001983,0.001405,0.001005,0.137486,0.003490,0.823050,0.001972
2,0.000058,0.000014,0.000041,0.000022,0.000022,0.999038,0.000063,0.000710,0.000031
3,0.000563,0.763614,0.221859,0.008738,0.000502,0.000344,0.000450,0.001220,0.002711
4,0.042744,0.000797,0.000350,0.000127,0.001041,0.002581,0.000740,0.029084,0.922536
...,...,...,...,...,...,...,...,...,...
144363,0.731872,0.002439,0.000574,0.021166,0.001195,0.204021,0.009010,0.013791,0.015932
144364,0.001058,0.183415,0.669365,0.090108,0.001116,0.000877,0.053317,0.000542,0.000202
144365,0.000565,0.592487,0.323780,0.076721,0.001283,0.000499,0.003983,0.000385,0.000296
144366,0.000370,0.429249,0.019012,0.549869,0.000820,0.000065,0.000414,0.000164,0.000037


In [27]:
result.to_csv('./results/submission-model-2-optimal', index = False)

## REFERENCES

1- https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

2- https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py

3- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

4- https://www.datacamp.com/community/tutorials/xgboost-in-python

5- https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier