# Lab 7.5: Handling data imbalance and classification

## Importing libraries

In [65]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.tree import export_text
from sklearn import tree

In [68]:
# Importing the cleaned dataset from Monday's lab:

donors = pd.read_csv('learningSet_clean.csv')
donors.shape

(95412, 336)

In [69]:
print(donors.shape)
donors.dtypes

(95412, 336)


ODATEDW       int64
TCODE         int64
DOB           int64
AGE         float64
INCOME      float64
             ...   
RFA_2R       object
RFA_2A       object
GEOCODE2     object
DOMAIN_A     object
DOMAIN_B      int64
Length: 336, dtype: object

In [70]:
nulls = donors.isnull().sum()
nulls.sort_values(ascending=False)

SOLIH       89212
VETERANS    84986
NEXTDATE     9973
EIC16           0
EC1             0
            ...  
ETHC6           0
ETHC5           0
ETHC4           0
ETHC3           0
DOMAIN_B        0
Length: 336, dtype: int64

In [71]:
# Dropping these columns
# Far too many nulls to make any sense in our 336 variable data set
# Manufacturing data may actually be worse than using the data

donors = donors.drop(['SOLIH', 'VETERANS', 'NEXTDATE'], axis=1)

## X, y split and dealing with categoricals / numericals

In [74]:
X = donors.drop('TARGET_B', axis=1)
y = donors['TARGET_B']

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [76]:
X_train_num = X_train.select_dtypes(include = np.number)
transformer = MinMaxScaler().fit(X_train_num) 

In [77]:
X_train_normalized = transformer.transform(X_train_num)
X_train_norm = pd.DataFrame(X_train_normalized, columns=X_train_num.columns)
X_train_norm = X_train_norm.reset_index(drop=True)

In [None]:
# X_train_norm.head()

In [78]:
X_test_num = X_test.select_dtypes(include = np.number)

In [79]:
X_test_normalized = transformer.transform(X_test_num)
X_test_norm = pd.DataFrame(X_test_normalized, columns=X_test_num.columns)
X_test_norm = X_test_norm.reset_index(drop=True)

In [None]:
# X_test_norm.head()

In [80]:
X_train_categorical = X_train.select_dtypes(include = object)
X_test_categorical = X_test.select_dtypes(include = object)

In [81]:
encoder = OneHotEncoder(drop='first').fit(X_train_categorical)
encoded = encoder.transform(X_train_categorical).toarray()

cols = encoder.get_feature_names_out(input_features=X_train_categorical.columns)

X_train_cat = onehot_encoded = pd.DataFrame(encoded, columns=cols)
# X_train_cat.head()

In [82]:
encoder = OneHotEncoder(drop='first').fit(X_test_categorical)
encoded = encoder.transform(X_test_categorical).toarray()

cols = encoder.get_feature_names_out(input_features=X_test_categorical.columns)

X_test_cat = onehot_encoded = pd.DataFrame(encoded, columns=cols)
# X_test_cat.head()

In [83]:
X_train_transformed = pd.concat([X_train_norm, X_train_cat], axis=1)
# X_train_transformed

In [84]:
X_test_transformed = pd.concat([X_test_norm, X_test_cat], axis=1)
# X_test_transformed 

In [85]:
y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 

## Training the model

In [88]:
def models_automation(models, X_tr, y_tr, X_te, y_te):
    for model in models:
        model.fit(X_tr, y_tr)
        print(f"{model.__class__.__name__}: Train -> {model.score(X_tr, y_tr)}, Test -> {model.score(X_te, y_te)}")

In [89]:
models = [LogisticRegression(random_state=69, solver='saga',
                  multi_class='multinomial')]
models_automation(models, X_train_transformed, y_train, X_test_transformed, y_test)

LogisticRegression: Train -> 0.9873180573569679, Test -> 0.9878949850652413


In [91]:
LR = LogisticRegression(random_state=69, solver='saga',
                  multi_class='multinomial').fit(X_train_transformed, y_train)

In [92]:
pred = LR.predict(X_test_transformed)
LR.score(X_test_transformed, y_test)

0.9878949850652413

In [93]:
confusion_matrix(y_test, pred)

array([[18157,     0],
       [  231,   695]])

In [94]:
# Scores for unbalanced target:

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  1.0
recall:  0.7505399568034558
f1:  0.8574953732264035


## Fixing imbalanced data in 'TARGET_B'

In [121]:
train_imbalance = pd.concat([X_train_norm, y_train],axis=1)

In [122]:
category_0 = train_imbalance[train_imbalance['TARGET_B'] == 0]
category_1 = train_imbalance[train_imbalance['TARGET_B'] == 1]

In [124]:
train_imbalance['TARGET_B'].value_counts()

0    72412
1     3917
Name: TARGET_B, dtype: int64

### Downsampling

In [125]:
category_0_undersampled = resample(category_0, 
                                   replace=False, 
                                   n_samples = len(category_1))

In [126]:
train_undersampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [127]:
y_train_under = train_undersampled['TARGET_B'].copy()
X_train_under = train_undersampled.drop('TARGET_B', axis=1).copy()

In [128]:
y_train_under.value_counts()

0    3917
1    3917
Name: TARGET_B, dtype: int64

In [129]:
LR_under = LogisticRegression(random_state=69, solver='saga', multi_class='multinomial')
LR_under.fit(X_train_under, y_train_under)
pred_under = LR_under.predict(X_test_norm)

print("precision: ",precision_score(y_test,pred_under))
print("recall: ",recall_score(y_test,pred_under))
print("f1: ",f1_score(y_test,pred_under))

precision:  0.8330188679245283
recall:  0.9535637149028078
f1:  0.8892245720040282


### Upsampling

In [130]:
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

In [131]:
train_oversampled = pd.concat([category_0, category_1_oversampled], axis=0)

In [132]:
y_train_over = train_oversampled['TARGET_B'].copy()
X_train_over = train_oversampled.drop('TARGET_B', axis=1).copy()

In [133]:
y_train_over.value_counts()

0    72412
1    72412
Name: TARGET_B, dtype: int64

In [134]:
LR_over = LogisticRegression(random_state=69, solver='saga', multi_class='multinomial')
LR_over.fit(X_train_over, y_train_over)
pred_over = LR_over.predict(X_test_norm)

print("precision: ",precision_score(y_test,pred_over))
print("recall: ",recall_score(y_test,pred_over))
print("f1: ",f1_score(y_test,pred_over))

precision:  0.9977973568281938
recall:  0.978401727861771
f1:  0.9880043620501635


### SMOTE

In [111]:
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_transformed, y_train)

In [112]:
LR.fit(X_train_SMOTE, y_train_SMOTE)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  1.0
recall:  0.7505399568034558
f1:  0.8574953732264035


In [None]:
# Conclusion:

# If I did the test / train split and scaling / encoding correctly...
# ...then we see that the scores after balancing the target are far superior
# In this case, I would use the oversampled data, which had a higher set of scores than all the others 
# This makes sense given the size of the imbalance in TARGET_B