In [10]:
#scipy
import scipy.sparse as sp

#pandas
import pandas as pd

#numpy
import numpy as np

#ploting
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

#sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
#Tree
from sklearn.tree import DecisionTreeClassifier
#Ensemble
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
#Regression
from sklearn.linear_model import LogisticRegression
#SVM
from sklearn.svm import LinearSVC

from sklearn.model_selection import learning_curve


# score
from sklearn.metrics import classification_report, precision_score, make_scorer, recall_score, f1_score, confusion_matrix, accuracy_score, roc_auc_score, cohen_kappa_score

#h5py dataset processing
import h5py

#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
target_names = ['No Rain', 'Rain']

In [3]:
# Open the file as readonly
h5f = h5py.File('task1data/Final_Final_dataset.h5', 'r')

# Load the training, test and validation set
X_train = h5f['X_train'][:]
X_test = h5f['X_test'][:]
X_train_Over = h5f['X_train_Over'][:]
X_train_Under = h5f['X_train_Under'][:]
yTrain = h5f['yTrain'][:]
yTest = h5f['yTest'][:]
yTrain_Over = h5f['yTrain_Over'][:]
yTrain_Under = h5f['yTrain_Under'][:]

# Close this file
h5f.close()

In [5]:
print('Training Normal ', X_train.shape,  yTrain.shape)
print('Training Oversampled', X_train_Over.shape,  yTrain_Over.shape)
print('Training Undersampled', X_train_Under.shape, yTrain_Under.shape)
print('Test', X_test.shape, yTest.shape)

Training Normal  (124907, 20) (124907,)
Training Oversampled (194072, 20) (194072,)
Training Undersampled (55742, 20) (55742,)
Test (17286, 20) (17286,)


# Models

In [6]:
# basic function that takes classifier and datasets to display the metrics
def algo(model, x_train, y_train, x_test, y_test):
    
    model.fit(x_train, y_train)
    print('\t\tConfusion Matrix & Performance Metrics')
    print('  ')
    predictions = model.predict(x_test)
    conf_mat = pd.DataFrame(confusion_matrix(y_test, predictions),columns=['Pred No Rain', 'Pred Rain'], index=['No Rain', 'Rain'])
    print(conf_mat)
    print('  ')
    print('\t\tClassification Report')
    print(classification_report(y_test, predictions, target_names=target_names))
    metrics = {
              'ROC AUC': [roc_auc_score(y_test, predictions)],
              'Accuracy': [accuracy_score(y_test, predictions)],
              }
    
    print('  ')
    print(pd.DataFrame(data=metrics, index=['']))
    print('  ')
   
   
    return

# Algorithms based on the imbalanced training data

For each version of the training set (imbalaned, undersampled, oversampled) 4 classifiers were selected and trained to observe their performance.

Tree based classifiers are among the best performing classifiers out of the box. This will allow me to get good results without ignorantly fingering the hyperparameters.

Tree Ensemble classifiers almost always outperform single decision trees, so I jumped to that first, Selecting the Random Forest and Gradient Boosting classifiers. Random Forest classifiers are robust as it trains a number of trees using different samples of the data which helps it outperform single trees, while Gradient Boosting builds trees one at a time but each new tree can build on the errors from previous trees. Adaboost was used because it boosts weak learners and with scikit the default learner is a 1 depth decision tree. Finally, I attempted to train what I believe is a cost sensitive SVM classifier which penalizes mistakes on the minority class.

## Random Forest

In [15]:
RF_clf = RandomForestClassifier(random_state=66)
algo(RF_clf, X_train, yTrain, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12758        522
Rain             2714       1292
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.82      0.96      0.89     13280
        Rain       0.71      0.32      0.44      4006

   micro avg       0.81      0.81      0.81     17286
   macro avg       0.77      0.64      0.67     17286
weighted avg       0.80      0.81      0.78     17286

  
   ROC AUC  Accuracy
  0.641604  0.812796
  


## Gradient Boost

In [16]:
GB_clf = GradientBoostingClassifier(random_state=66)
algo(GB_clf, X_train, yTrain, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12914        366
Rain             2600       1406
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.83      0.97      0.90     13280
        Rain       0.79      0.35      0.49      4006

   micro avg       0.83      0.83      0.83     17286
   macro avg       0.81      0.66      0.69     17286
weighted avg       0.82      0.83      0.80     17286

  
   ROC AUC  Accuracy
  0.661707  0.828416
  


## ADA Boost

In [19]:
%%time
ADB_clf = AdaBoostClassifier(random_state=66)
algo(ADB_clf, X_train, yTrain, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         13121        159
Rain             3133        873
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.81      0.99      0.89     13280
        Rain       0.85      0.22      0.35      4006

   micro avg       0.81      0.81      0.81     17286
   macro avg       0.83      0.60      0.62     17286
weighted avg       0.82      0.81      0.76     17286

  
   ROC AUC  Accuracy
  0.602975  0.809557
  
Wall time: 16.7 s


## SVM

In [25]:
%%time
SVM_clf = LinearSVC(class_weight='balanced',random_state=66)
algo(SVM_clf, X_train, yTrain, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12992        288
Rain             3758        248
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.78      0.98      0.87     13280
        Rain       0.46      0.06      0.11      4006

   micro avg       0.77      0.77      0.77     17286
   macro avg       0.62      0.52      0.49     17286
weighted avg       0.70      0.77      0.69     17286

  
  ROC AUC  Accuracy
  0.52011  0.765938
  
Wall time: 33.3 s


# Algorithms based on the Undersampled training data

## Random Forest

In [35]:
RF_clf = RandomForestClassifier(random_state=66)
algo(RF_clf, X_train_Under, yTrain_Under, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         11597       1683
Rain             1872       2134
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.86      0.87      0.87     13280
        Rain       0.56      0.53      0.55      4006

   micro avg       0.79      0.79      0.79     17286
   macro avg       0.71      0.70      0.71     17286
weighted avg       0.79      0.79      0.79     17286

  
   ROC AUC  Accuracy
  0.702985  0.794342
  


## Gradient Boost

In [36]:
GB_clf = GradientBoostingClassifier(random_state=66)
algo(GB_clf, X_train_Under, yTrain_Under, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         11758       1522
Rain             1676       2330
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.88      0.89      0.88     13280
        Rain       0.60      0.58      0.59      4006

   micro avg       0.81      0.81      0.81     17286
   macro avg       0.74      0.73      0.74     17286
weighted avg       0.81      0.81      0.81     17286

  
  ROC AUC  Accuracy
  0.73351  0.814995
  


## ADA Boost

In [37]:
%%time
ADB_clf = AdaBoostClassifier(random_state=66)
algo(ADB_clf, X_train_Under, yTrain_Under, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12235       1045
Rain             2072       1934
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.86      0.92      0.89     13280
        Rain       0.65      0.48      0.55      4006

   micro avg       0.82      0.82      0.82     17286
   macro avg       0.75      0.70      0.72     17286
weighted avg       0.81      0.82      0.81     17286

  
   ROC AUC  Accuracy
  0.702043  0.819681
  
Wall time: 6.2 s


## SVM

In [38]:
%%time
SVM_clf = LinearSVC(class_weight='balanced',random_state=66)
algo(SVM_clf, X_train_Under, yTrain_Under, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12992        288
Rain             3757        249
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.78      0.98      0.87     13280
        Rain       0.46      0.06      0.11      4006

   micro avg       0.77      0.77      0.77     17286
   macro avg       0.62      0.52      0.49     17286
weighted avg       0.70      0.77      0.69     17286

  
   ROC AUC  Accuracy
  0.520235  0.765996
  
Wall time: 8.49 s


# Algorithms based on the Oversampled training data

## Random Forest

In [39]:
RF_clf = RandomForestClassifier(random_state=66)
algo(RF_clf, X_train_Over, yTrain_Over, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12783        497
Rain             2840       1166
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.82      0.96      0.88     13280
        Rain       0.70      0.29      0.41      4006

   micro avg       0.81      0.81      0.81     17286
   macro avg       0.76      0.63      0.65     17286
weighted avg       0.79      0.81      0.77     17286

  
   ROC AUC  Accuracy
  0.626819  0.806954
  


## Gradient Boost

In [40]:
GB_clf = GradientBoostingClassifier(random_state=66)
algo(GB_clf, X_train_Over, yTrain_Over, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         11838       1442
Rain             1701       2305
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.87      0.89      0.88     13280
        Rain       0.62      0.58      0.59      4006

   micro avg       0.82      0.82      0.82     17286
   macro avg       0.74      0.73      0.74     17286
weighted avg       0.81      0.82      0.82     17286

  
   ROC AUC  Accuracy
  0.733401  0.818177
  


## ADA Boost

In [41]:
%%time
ADB_clf = AdaBoostClassifier(random_state=66)
algo(ADB_clf, X_train_Over, yTrain_Over, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12557        723
Rain             2379       1627
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.84      0.95      0.89     13280
        Rain       0.69      0.41      0.51      4006

   micro avg       0.82      0.82      0.82     17286
   macro avg       0.77      0.68      0.70     17286
weighted avg       0.81      0.82      0.80     17286

  
   ROC AUC  Accuracy
  0.675849  0.820548
  
Wall time: 29.5 s


## SVM

In [44]:
%%time
SVM_clf = LinearSVC(class_weight='balanced',random_state=66)
algo(SVM_clf, X_train_Over, yTrain_Over, X_test, yTest)

		Confusion Matrix & Performance Metrics
  
         Pred No Rain  Pred Rain
No Rain         12991        289
Rain             3757        249
  
		Classification Report
              precision    recall  f1-score   support

     No Rain       0.78      0.98      0.87     13280
        Rain       0.46      0.06      0.11      4006

   micro avg       0.77      0.77      0.77     17286
   macro avg       0.62      0.52      0.49     17286
weighted avg       0.70      0.77      0.69     17286

  
   ROC AUC  Accuracy
  0.520197  0.765938
  
Wall time: 27.8 s
