# Petfinder.my competition: modelling, part one

## Introduction

## Libraries

In [46]:
import os
import itertools
from collections import Counter

import feather

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, confusion_matrix

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

## Loading data

In [2]:
INPUT_PATH = os.path.join(os.pardir, 'data', 'interim')

In [3]:
all_train = feather.read_dataframe(os.path.join(INPUT_PATH, 'train.feather'))
all_test = feather.read_dataframe(os.path.join(INPUT_PATH, 'test.feather'))

## Preparing data

Splits the training data into training and validation data sets. The size of the validation data was calculated to match the size of the final unseen testing data relative to the original training data.

In [4]:
cat_colnames = ['RescuerID', 'PetID', 'PrimaryLabel', 'SecondaryLabel']

In [5]:
# Dealing with categorical data
all_data = [all_train, all_test]

for df in all_data:
    for col in cat_colnames:
        df[col] = pd.Categorical(df[col])
        df[col] = df[col].cat.codes
        df[col] = pd.Categorical(df[col])

In [6]:
drop_cols = ['Name', 'Description', 'AdoptionSpeed']

In [7]:
X_train = all_train.drop(drop_cols, axis=1)
y_train = all_train['AdoptionSpeed']

In [8]:
# X_train, X_valid, y_train, y_valid = train_test_split(training_data, training_labels,
#                                                      test_size=len(all_test) / len(all_train))

In [9]:
print(X_train.shape, y_train.shape)

(14993, 346) (14993,)


## LightGBM: initial model

Fits a LightGBM classification model.

### Scoring

The competition uses the quadratic weighted kappa for scoring.

In [10]:
def kappa_metric(predictions, actuals):
    """
    Competition scores are calculated with the quadratic weighted kappa.
    The cohen_kappa_score from sklearn.metrics is identical
    """
    return cohen_kappa_score(predictions, actuals, weights='quadratic')

In [22]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix'):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    From https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """

    print(cm)

    df_cm = pd.DataFrame(cm, index = [i for i in range(0, 5)],
                      columns = [i for i in range(0, 5)])
    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=True)

### Model parameters

In [12]:
params = {'objective': 'softmax',
          'num_class': 5,
          'boosting': 'gbdt',
          'nthread': 4,
          'num_iterations': 10000,
          'learning_rate': 0.01,
          'num_leaves': 80,
          'max_depth': -1,
          'min_data_in_leaf': 60,
          'min_sum_hessian_in_leaf': 0.01,
          'bagging_fraction': 0.75,
          'bagging_frequency': 2,
          'feature_fraction': 0.75,
          'lambda_l2': 0.05,
          'min_gain_to_split': 0.0,
          'max_bin': 255,
          'early_stopping_rounds': 100,
          'data_random_seed': 42,
          'verbosity': -1,
          'verbose_eval': 100
         }

### Model function

In [49]:
def lgb_model(train_data, train_labels, valid_data, valid_labels,
              model_params, scoring_metric, X_test=None):
    
    # Create training and validation lgb dataset objects
    lgb_X_train = lgb.Dataset(data=train_data, label=train_labels,
                              feature_name='auto', categorical_feature='auto',
                              free_raw_data=False)
    lgb_X_valid = lgb.Dataset(data=valid_data, label=valid_labels,
                              feature_name='auto', categorical_feature='auto',
                              free_raw_data=False)
    
    # Get parameters
    params2 = model_params.copy()
    num_iterations = params2.pop('num_iterations')
    early_stopping = params2.pop('early_stopping_rounds')
    verbose_eval = params2.pop('verbose_eval')
    
    # Train LightGBM model
    print("Training the LightGBM model...")
    model = lgb.train(params2,
                          lgb_X_train,
                          num_boost_round=num_iterations,
                          valid_sets=[lgb_X_train, lgb_X_valid],
                          early_stopping_rounds=early_stopping,
                          verbose_eval=verbose_eval)
    
    # Get model predictions on validation set
    print("Fitting the model to the validation data...")
    y_probs = model.predict(valid_data, num_iteration=model.best_iteration,
                           verbose_eval=verbose_eval) # Class probabilities from model
    
    
    y_preds = np.apply_along_axis(np.argmax, 1, y_probs) # Get label value according to highest probability
    qwk = scoring_metric(y_preds, valid_labels) # Compute kappa score
    conf_matrix = confusion_matrix(valid_labels, y_preds)
    
    print("-" * 40)
    print(f"Actual distribution of labels: {Counter(valid_labels)}")
    print(f"Predicted distribution of labels: {Counter(y_preds)}")
    print(f"CV split QWK score: {qwk}")
    print("Confusion matrix:\n", conf_matrix)
    print("-" * 40)
    
    return y_probs, y_preds, qwk, conf_matrix

### Cross Validation

In [53]:
def cv_model(X:pd.DataFrame, y:list, nsplits:int, params):
    fold_counter = 1 #The current fold number
    skf = StratifiedKFold(n_splits=nsplits, shuffle=True)
    
    lgb_preds = np.zeros((X.shape[0], nsplits))
    lgb_pred_probs = []
    lgb_confusion_matrices = []
    lgb_qwk_scores = []
    
    for train_index, valid_index in skf.split(X, y):
        print(f"Fold {fold_counter} / {nsplits}:")
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        
        # Fit model
        y_probs, y_preds, qwk, conf_matrix = lgb_model(X_train, y_train, X_valid, y_valid, params, kappa_metric)
        
        lgb_preds[valid_index] = y_preds.reshape(-1, 1)
        lgb_pred_probs.append(y_probs)
        lgb_qwk_scores.append(qwk)
        lgb_confusion_matrices.append(conf_matrix)
        
        fold_counter += 1
    print("=" * 40)
    print(f"QWK scores: {lgb_qwk_scores}")
    print(f"Mean QWK score: {np.mean(lgb_qwk_scores)}")
    print(f"QWK score sd: {np.std(lgb_qwk_scores)}")
    return lgb_pred_probs, lgb_preds, lgb_qwk_scores, lgb_confusion_matrices

In [52]:
lgb_pred_probs, lgb_preds, lgb_qwk_scores, lgb_confusion_matrices = cv_model(X_train, y_train, 5, params)

Fold 1 / 5:
Training the LightGBM model...




Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 1.16233	valid_1's multi_logloss: 1.3511
[200]	training's multi_logloss: 0.961456	valid_1's multi_logloss: 1.30181
[300]	training's multi_logloss: 0.81081	valid_1's multi_logloss: 1.27727
[400]	training's multi_logloss: 0.691224	valid_1's multi_logloss: 1.2635
[500]	training's multi_logloss: 0.593743	valid_1's multi_logloss: 1.25729
[600]	training's multi_logloss: 0.512941	valid_1's multi_logloss: 1.25426
[700]	training's multi_logloss: 0.445001	valid_1's multi_logloss: 1.25348
Early stopping, best iteration is:
[666]	training's multi_logloss: 0.466804	valid_1's multi_logloss: 1.25326
Fitting the model to the validation data...
----------------------------------------
Actual distribution of labels: Counter({4: 840, 2: 808, 3: 652, 1: 618, 0: 82})
Predicted distribution of labels: Counter({4: 1215, 2: 868, 1: 522, 3: 388, 0: 7})
CV split QWK score: 0.4274781890782652
Confusion matrix:
 [[  5  2

In [78]:
params = {'objective': 'softmax',
          'num_class': 5,
          'boosting': 'gbdt',
          'nthread': 4,
          'num_iterations': 10000,
          'learning_rate': 0.01,
          'num_leaves': 60,
          'max_depth': -1,
          'min_data_in_leaf': 120,
          'min_sum_hessian_in_leaf': 0.01,
          'bagging_fraction': 0.8,
          'bagging_frequency': 2,
          'feature_fraction': 0.8,
          'lambda_l2': 0.05,
          'min_gain_to_split': 0.0,
          'max_bin': 255,
          'early_stopping_rounds': 100,
          'data_random_seed': 42,
          'verbosity': -1,
          'verbose_eval': 100
         }

lgb_pred_probs, lgb_preds, lgb_qwk_scores, lgb_confusion_matrices = cv_model(X_train, y_train, 5, params)

Fold 1 / 5:
Training the LightGBM model...




Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 1.22375	valid_1's multi_logloss: 1.36018
[200]	training's multi_logloss: 1.06119	valid_1's multi_logloss: 1.31451
[300]	training's multi_logloss: 0.93395	valid_1's multi_logloss: 1.28991
[400]	training's multi_logloss: 0.827995	valid_1's multi_logloss: 1.27735
[500]	training's multi_logloss: 0.737889	valid_1's multi_logloss: 1.27035
[600]	training's multi_logloss: 0.660146	valid_1's multi_logloss: 1.26645
[700]	training's multi_logloss: 0.592492	valid_1's multi_logloss: 1.26467
[800]	training's multi_logloss: 0.53299	valid_1's multi_logloss: 1.26406
Early stopping, best iteration is:
[786]	training's multi_logloss: 0.54088	valid_1's multi_logloss: 1.26405
Fitting the model to the validation data...
----------------------------------------
Actual distribution of labels: Counter({4: 840, 2: 808, 3: 652, 1: 618, 0: 82})
Predicted distribution of labels: Counter({4: 1245, 2: 753, 1: 587, 3: 408, 