# Multi Class Classification using CatBoost

## Import the required libraries

In [None]:
!pip install --user --upgrade catboost
!pip install --user --upgrade ipywidgets
!pip install shap
!pip install sklearn
!pip install --upgrade numpy
!jupyter nbextension enable --py widgetsnbextension

In [None]:
# Record the CatBoost version available to this kernel.
import catboost
print(catboost.__version__)
# Version of the `python` executable found on the shell PATH.
# NOTE(review): this may differ from the kernel's interpreter — confirm if it matters.
!python --version

In [2]:
import pandas as pd
import numpy as np
#import requests
#import json
import re
from io import StringIO
from string import digits
import itertools
import os
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics
from datetime import datetime
import math

## Data Import and Cleanup

In [3]:
# Path of the input file; replace the placeholder with a real location before running.
fname = "<<your path and input excel file name with extension>>"
# Read as comma-separated text with the column names inferred from the first row
# (these are the pandas defaults).
df = pd.read_csv(fname)

## Print all the columns. This method helps when the list of columns is large.

In [None]:
# Print one column name per line. A plain loop is used instead of a list
# comprehension so the cell does not also echo a list of None values as its output.
for col in df.columns:
    print(col)

## Derive the label column based on values in the other columns

In [27]:
def create_label(row):
    """Classify one booking row as under-, exactly- or over-booked.

    Label values: 0 = under booking, 1 = no over or under booking, 2 = over booking.

    Rules, evaluated in this order:
      1. Missing BOOKED_QUANTITY or BOOKED_VOLUME            -> 0
      2. Missing ACTUAL_QUANTITY or ACTUAL_VOLUME            -> 2
      3. Booked exceeds actual on quantity OR volume         -> 2
      4. Booked is below actual on quantity OR volume        -> 0
      5. Otherwise (booked equals actual on both measures)   -> 1

    Because rule 3 is checked before rule 4, a row that is over on one measure
    and under on the other is labelled 2.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the four
            column names above.

    Returns:
        int: 0, 1 or 2.
    """
    UNDER, EXACT, OVER = 0, 1, 2

    # Guard clauses for missing inputs; the booked columns take precedence.
    if pd.isna(row['BOOKED_QUANTITY']) or pd.isna(row['BOOKED_VOLUME']):
        return UNDER
    if pd.isna(row['ACTUAL_QUANTITY']) or pd.isna(row['ACTUAL_VOLUME']):
        return OVER

    booked_qty, actual_qty = row['BOOKED_QUANTITY'], row['ACTUAL_QUANTITY']
    booked_vol, actual_vol = row['BOOKED_VOLUME'], row['ACTUAL_VOLUME']

    if booked_qty > actual_qty or booked_vol > actual_vol:
        return OVER
    if booked_qty < actual_qty or booked_vol < actual_vol:
        return UNDER
    return EXACT

In [28]:
# Run create_label on every row (axis=1) and store the result in a new 'label' column.
df["label"] = df.apply(create_label, axis=1)

## Check if the data is balanced. If it is not balanced, CatBoost has a way to handle the imbalance, as we will see during the model training process later in this script. Note that, as a best practice, the balancing needs to happen after the test data (hold out data) is separated.

In [None]:
# Number of rows for each label value.
df['label'].value_counts()

## Derive features such as year, month, day, day of the week, hour, minute, second, etc from the time stamp columns

In [32]:
def convert_dateid_to_date(dateid):
    """Parse a compact ``YYYYMMDD`` date-id string into a ``datetime``.

    Args:
        dateid: String such as ``'20210315'``.

    Returns:
        datetime: The parsed date (time components are zero).

    Raises:
        ValueError: If ``dateid`` does not match the ``%Y%m%d`` format.
    """
    dateid_format = '%Y%m%d'
    parsed = datetime.strptime(dateid, dateid_format)
    return parsed

def convert_timestampstr_to_date(timestamp):
    """Parse a ``'%Y-%m-%d %H:%M:%S.%f'`` timestamp string into a ``datetime``.

    Missing values are returned as ``pd.NaT`` so that a column built from this
    function stays datetime-typed and can be used with the ``.dt`` accessor.
    (Returning an empty string, as before, turns the column into object dtype
    and makes ``.dt`` fail.)

    Args:
        timestamp: Timestamp rendered as a string. The string forms of missing
            values (``'nan'``, ``''``, ``'NaT'``, ``'None'``) are treated as missing.

    Returns:
        datetime | NaT: The parsed timestamp, or ``pd.NaT`` for a missing value.

    Raises:
        ValueError: If a non-missing string does not match the expected format
            (including timestamps that lack the fractional-seconds part).
    """
    # The previous test `timestamp == 'nan' or ''` never matched the empty
    # string because `or ''` is always falsy; membership covers every sentinel.
    if timestamp in ('nan', '', 'NaT', 'None'):
        return pd.NaT
    return datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')

In [None]:
# Parse the raw TIMESTAMP values: each one is rendered with str() and then passed
# to convert_timestampstr_to_date. This overwrites the TIMESTAMP column in place.
df['TIMESTAMP'] = df['TIMESTAMP'].apply(str).apply(convert_timestampstr_to_date)
# Expose the calendar/clock components as separate columns. The .dt accessor
# requires a datetime-like column, so every value must have parsed successfully.
df['TIMESTAMP_YEAR'] = df['TIMESTAMP'].dt.year
df['TIMESTAMP_MONTH'] = df['TIMESTAMP'].dt.month
df['TIMESTAMP_DAYOFTHEMONTH'] = df['TIMESTAMP'].dt.day
df['TIMESTAMP_HOUR'] = df['TIMESTAMP'].dt.hour
df['TIMESTAMP_MINUTE'] = df['TIMESTAMP'].dt.minute
df['TIMESTAMP_SECOND'] = df['TIMESTAMP'].dt.second
# ISO calendar week number.
df['TIMESTAMP_WEEK'] = df['TIMESTAMP'].dt.isocalendar().week
# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
df['TIMESTAMP_WEEKDAY'] = df['TIMESTAMP'].dt.dayofweek

## Drop columns that are not useful for the training and prediction

In [42]:
# Remove columns that must not reach the model: ACTUAL_QUANTITY / ACTUAL_VOLUME
# were used to derive the label, and TIMESTAMP has already been expanded into the
# TIMESTAMP_* feature columns. The frame is modified in place.
df.drop(
    columns=['ID', 'ACTUAL_QUANTITY', 'ACTUAL_VOLUME', 'TIMESTAMP'],
    inplace=True,
)

## Identify the non categorical columns

In [43]:
# Non-categorical feature columns. Only a couple are listed here for the sake of
# illustration; add any other non-categorical columns to this list.
NON_CAT_COLS = ['BOOKED_QUANTITY', 'BOOKED_VOLUME']

## Rest of the columns (except label) are categoricals

In [44]:
# Every column that is neither the target nor listed in NON_CAT_COLS is treated
# as categorical; the order of df.columns is preserved.
CAT_COLS = [
    name
    for name in df.columns
    if name != 'label' and name not in NON_CAT_COLS
]

In [None]:
# Print one categorical column name per line. A plain loop is used instead of a
# list comprehension so the cell does not also echo a list of None values.
for col_name in CAT_COLS:
    print(col_name)

## Split the data into categorical and non-categorical. This is required as all the categorical features need to be converted into string as per the requirements of CatBoost.

In [46]:
# Non-categorical features only; values are left in their original dtypes.
df_non_cat = df.loc[:, NON_CAT_COLS]

In [47]:
# Categorical features plus the target, with every value rendered as a string and
# stored in object-dtype columns. Note that the 'label' column becomes a string
# here as well.
df_cat = (
    df[[*CAT_COLS, 'label']]
    .astype(str)
    .astype(object)
)

In [52]:
# df_cat also carries the 'label' column, hence the "- 1" wherever its width is used.
print(f"#Categorical Features: {df_cat.shape[1] - 1}")
print(f"#Non Categorical Features: {df_non_cat.shape[1]}")
print(f"Total #Features: {df_cat.shape[1] + df_non_cat.shape[1] - 1}")

#Categorical Features: 113
#Non Categorical Features: 51
Total #Features: 164


## Free up some of the pandas dataframes that are not needed anymore

In [None]:
import gc

# Release only the full input frame. It has already been split into df_cat and
# df_non_cat, and the next cell still joins those two into X, so they must stay alive.
# The previous `del [[df, df_join, df_non_cat, df_cat]]` raised NameError (df_join is
# not defined anywhere in this notebook) and would also have removed the two frames
# the following cell needs. Popping from globals() tolerates names that do not exist
# and makes the cell safe to run more than once.
for _stale_name in ('df', 'df_join'):
    globals().pop(_stale_name, None)
gc.collect()

# Empty placeholder frames kept from the original cell.
# NOTE(review): nothing visible in this notebook reads them — presumably left over
# from a version that built charts; remove if confirmed unused.
df_chart1 = pd.DataFrame()
df_chart2 = pd.DataFrame()
df_chart3 = pd.DataFrame()
df_chart4 = pd.DataFrame()

## Join the categorical and non-categorical features into X and separate the labels into y

In [53]:
# Re-assemble the full feature table (index-aligned join), then separate the
# target column from the features.
X = df_cat.join(df_non_cat)
y = X['label']
X = X.drop(columns='label')

## Split the data into train, validation and test data.

In [54]:
# Split the dataset into a training/validation portion and a hold-out test portion.
# stratify=y keeps the class proportions the same in both parts; with no explicit
# size, scikit-learn holds out 25% of the rows by default.
# random_state is fixed (same seed as the model) so the hold-out set, and therefore
# every metric reported below, is reproducible across runs.
X_train_validation, X_test, y_train_validation, y_test = model_selection.train_test_split(
    X, y, stratify=y, random_state=43
)

In [None]:
# X_train_validation.head()
# y_train_validation.head()

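## Check the class distribution of the training/validation data. Nothing has been rebalanced at this point: the stratified split preserves the original class proportions, so the imbalance is still present and is handled later through the CatBoost class weights.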

In [None]:
# Number of rows per label value in the training/validation portion.
y_train_validation.value_counts()

## Split the data into train and validation sets. 

In [56]:
# Split the training/validation portion into separate training and validation sets,
# again stratified on the label so both keep the same class proportions.
# random_state is fixed (same seed as the model) so the split is reproducible.
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    X_train_validation,
    y_train_validation,
    stratify=y_train_validation,
    random_state=43,
)

## Create and fit the CatBoost model for multi-class classification. The number of iterations here is 100, but can be set to whatever is appropriate in a given scenario. The class weights are important as CatBoost uses them to handle the unbalanced data.

## The weights are calculated by first dividing the number of records per class in the training data set by the total number of records in the training data set. Then subtract each value from 1 to arrive at the weights for each class.

In [57]:
from catboost import CatBoostClassifier
# Multi-class CatBoost classifier with a fixed seed and 100 boosting iterations.
model = CatBoostClassifier(
    iterations=100,
    random_seed=43,
    loss_function='MultiClass',
    # Hard-coded per-class weights; per the notebook text they are
    # 1 - (class row count / total row count) for the training data, so they
    # need to be recomputed whenever the data changes.
    # NOTE(review): the keys are ints, while the labels come from a column that
    # was cast to str earlier in the notebook — confirm CatBoost matches them
    # to the intended classes.
    class_weights={0: 0.989352004, 1: 0.078047624, 2: 0.932600372}
)
# Fit on the training split and evaluate on the validation split after each iteration.
model.fit(
    X_train, y_train,
    # Columns CatBoost should treat as categorical features.
    cat_features=CAT_COLS,
    eval_set=(X_validation, y_validation),
    # Print the per-iteration learn/test metric log.
    verbose=True,
    # Render the interactive training chart widget.
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.293503
0:	learn: 0.8953525	test: 0.8943565	best: 0.8943565 (0)	total: 25.5s	remaining: 42m 7s
1:	learn: 0.7932036	test: 0.7915256	best: 0.7915256 (1)	total: 48.9s	remaining: 39m 54s
2:	learn: 0.7137698	test: 0.7121791	best: 0.7121791 (2)	total: 1m 11s	remaining: 38m 30s
3:	learn: 0.6617851	test: 0.6616413	best: 0.6616413 (3)	total: 1m 28s	remaining: 35m 22s
4:	learn: 0.6311685	test: 0.6315308	best: 0.6315308 (4)	total: 1m 43s	remaining: 32m 55s
5:	learn: 0.6014221	test: 0.6009598	best: 0.6009598 (5)	total: 2m 1s	remaining: 31m 41s
6:	learn: 0.5620689	test: 0.5613036	best: 0.5613036 (6)	total: 2m 18s	remaining: 30m 44s
7:	learn: 0.5461100	test: 0.5457544	best: 0.5457544 (7)	total: 2m 34s	remaining: 29m 35s
8:	learn: 0.5335934	test: 0.5325835	best: 0.5325835 (8)	total: 2m 52s	remaining: 29m 2s
9:	learn: 0.5229896	test: 0.5219737	best: 0.5219737 (9)	total: 3m 10s	remaining: 28m 38s
10:	learn: 0.5154749	test: 0.5145894	best: 0.5145894 (10)	total: 3m 31s	remaining: 28

91:	learn: 0.2873832	test: 0.2975971	best: 0.2975971 (91)	total: 26m 7s	remaining: 2m 16s
92:	learn: 0.2871506	test: 0.2975546	best: 0.2975546 (92)	total: 26m 22s	remaining: 1m 59s
93:	learn: 0.2866048	test: 0.2973057	best: 0.2973057 (93)	total: 26m 37s	remaining: 1m 41s
94:	learn: 0.2862997	test: 0.2971717	best: 0.2971717 (94)	total: 26m 53s	remaining: 1m 24s
95:	learn: 0.2860989	test: 0.2970655	best: 0.2970655 (95)	total: 27m 10s	remaining: 1m 7s
96:	learn: 0.2858406	test: 0.2969869	best: 0.2969869 (96)	total: 27m 26s	remaining: 50.9s
97:	learn: 0.2856983	test: 0.2969519	best: 0.2969519 (97)	total: 27m 42s	remaining: 33.9s
98:	learn: 0.2845121	test: 0.2958672	best: 0.2958672 (98)	total: 27m 59s	remaining: 17s
99:	learn: 0.2843772	test: 0.2958216	best: 0.2958216 (99)	total: 28m 16s	remaining: 0us

bestTest = 0.2958215783
bestIteration = 99



<catboost.core.CatBoostClassifier at 0x1d4f4f52c40>

## Print the feature importance (in this case I am filtering out the features that contribute less than 1% of the predictive signal).

In [None]:
# Feature importances as a table, keeping only rows whose 'Importances' value is
# above 1 and listing the most important features first.
feature_imp = model.get_feature_importance(prettified=True)
(
    feature_imp
    .loc[feature_imp['Importances'] > 1]
    .sort_values(by='Importances', ascending=False)
)

## Predict the outputs for the test data (hold out data).

In [59]:
# Predicted class for every row of the hold-out test set.
predictions = model.predict(X_test)

## Convert the y_test back to int as both y_test and predictions need to be of the same type to be compared to determine model performance. Note that y_test was converted to string along with all the other categorical columns earlier in the script.

In [60]:
# y_test holds string labels because 'label' was cast with astype(str) together
# with the categorical columns; convert it back to int for comparison with the
# predictions.
y_test = y_test.astype(int)

## Print the classification model performance report.

In [61]:
# Per-class precision, recall, F1 and support on the hold-out test set.
print('Classification Report for test data')
print(metrics.classification_report(y_test, predictions))

Classification Report for test data
              precision    recall  f1-score   support

           0       0.39      0.42      0.41      1947
           1       0.99      0.98      0.98    168552
           2       0.81      0.90      0.85     12322

    accuracy                           0.97    182821
   macro avg       0.73      0.77      0.75    182821
weighted avg       0.97      0.97      0.97    182821



## As you can see, the precision, recall and f1 scores for output classes 1 and 2 are good, but the scores for output class 0 are much weaker. This could possibly be improved by getting more samples of this minority class in the training and testing data, or by engineering more features that carry greater predictive signal.