![Image](https://drive.google.com/uc?export=view&id=10B8NecPfn9sXRescmijQ8Zc2CO08fQm7)

## Data Imputation - Balancing & Resampling
### ACC Tech Challenge Series, Winter 2020
### Harper Xiang

Training a machine learning model on an imbalanced dataset can introduce unique challenges to the learning problem. Imbalanced data typically refers to a classification problem where the number of observations per class is not equally distributed; often you'll have a large amount of data/observations for one class (referred to as the majority class), and much fewer observations for one or more other classes (referred to as the minority classes).

Source: https://www.jeremyjordan.me/imbalanced-data/

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Input location: directory (relative path, trailing slash included) and CSV file name.
# The two strings are joined by plain concatenation when the file is read.
datapath = "data/"
file_name = "inbalanced.csv"

In [3]:
# Load the full dataset from <datapath><file_name> and show its (rows, columns).
df_total = pd.read_csv(f"{datapath}{file_name}")
df_total.shape

(70428, 7)

In [4]:
# Preview the first five rows to check column names and value ranges.
df_total.head()

Unnamed: 0,fullVisitorId,avg_pageviews,subContinent_Northern America,avg_hits,bounces,transactionRevenue,trans_label
0,6577012298964649552,2.0,0.0,2.0,0.0,0.0,0
1,1492952808289580535,3.5,2.0,5.5,0.0,0.0,0
2,3056727511747081031,1.0,0.0,1.0,1.0,0.0,0
3,540875193565460002,6.0,0.0,8.0,0.0,0.0,0
4,5904797111043658556,1.0,0.0,1.0,1.0,0.0,0


In [6]:
# Split the table into predictors (dfx1) and the target "trans_label" (dfy1).
# The visitor identifier column is dropped from the predictors along with the target.
# dfy1 is kept as a single-column DataFrame rather than a Series.

dfx1 = df_total.drop(columns=["fullVisitorId", "trans_label"])
dfy1 = df_total[["trans_label"]].copy()
print(dfx1.shape, dfy1.shape)

(70428, 5) (70428, 1)


In [7]:
# Class distribution of the target: shows how rare the positive class (1) is.
dfy1["trans_label"].value_counts()

0    69438
1      990
Name: trans_label, dtype: int64

In [8]:
# Predictor columns kept for modeling (selected by their importance).
features_selected = [
    'avg_pageviews',
    'subContinent_Northern America',
    'avg_hits',
    'bounces',
]

In [9]:
# Keep only the selected predictor columns, in the order listed in features_selected.
dfx1 = dfx1.loc[:, features_selected]
dfx1.shape

(70428, 4)

## Predictor Variables Standardization

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
def standardize_dataframe(df):
    '''
    Standardize every column of a dataframe with StandardScaler.

    The scaler is fitted on `df` itself and then applied to that same data.
    ----------
    Parameters
    df: A dataframe
    ----------
    Returns
    scaled_array: A standardized numpy.array
    '''
    # fit_transform learns the scaling parameters and applies them in one call.
    return StandardScaler().fit_transform(df)

In [12]:
# Standardize the predictors and wrap the resulting array back into a DataFrame
# so the original column names are preserved.
# NOTE(review): the scaler is fitted on all rows here, before the train/test split
# further down, so test-set statistics leak into the scaled training features.
# Consider fitting the scaler on the training split only.

dfx2 = pd.DataFrame(standardize_dataframe(dfx1), columns=dfx1.columns)
dfx2.shape

(70428, 4)

## Train & Test Split

In [14]:
from sklearn.model_selection import train_test_split

In [16]:
# 70/30 hold-out split of the (still imbalanced) data, seeded for reproducibility.
# NOTE(review): the split is not stratified on the target; with so few positives,
# stratify=dfy1 may be worth considering — confirm first, as it changes every
# downstream result.
X_train_ib, X_test_ib, y_train_ib, y_test_ib = train_test_split(
    dfx2, dfy1, test_size=0.3, random_state=42
)

print(X_train_ib.shape, y_train_ib.shape, X_test_ib.shape, y_test_ib.shape)

(49299, 4) (49299, 1) (21129, 4) (21129, 1)


## Resample the Imbalanced Data

In [17]:
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.
  return f(*args, **kwds)


In [18]:
# Resample the dataset
# Only the training split is oversampled; the test split keeps its original
# class distribution so evaluation reflects real-world imbalance.
# sampling_strategy=0.2 asks for a minority:majority ratio of 0.2 after resampling.
# random_state pins the synthetic samples SMOTE generates, so the resampled
# training set (and every metric computed from it) is reproducible across runs.

smote = SMOTE(sampling_strategy=0.2, random_state=42)
X_train_bd, y_train_bd = smote.fit_resample(X_train_ib, y_train_ib)
print(X_train_bd.shape, y_train_bd.shape)

(58353, 4) (58353, 1)


In [19]:
# Class distribution of the training target after SMOTE resampling.
y_train_bd["trans_label"].value_counts()

0    48628
1     9725
Name: trans_label, dtype: int64

## Modeling over Imbalanced Data

In [22]:
from sklearn.linear_model import LogisticRegression
# from sklearn.tree         import DecisionTreeClassifier
# from sklearn.svm          import SVC
# from sklearn.ensemble     import RandomForestRegressor
from sklearn.metrics      import mean_squared_error, r2_score
from sklearn.metrics      import accuracy_score, classification_report, f1_score, precision_score, recall_score
# import pickle

In [23]:
# Run Logistic Regression
# Baseline model trained on the original, imbalanced training split.
# The target is passed as a 1-D Series: handing scikit-learn the single-column
# DataFrame makes it emit a DataConversionWarning (column-vector y).

s1_lr = LogisticRegression()
s1_lr.fit(X_train_ib, y_train_ib["trans_label"])

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# model evaluation
# Report accuracy and the per-class precision/recall/F1 of the baseline model
# on the training split and on the held-out test split.
labels = [0, 1]

# Predict both splits up front; y_pred_test is read again by the following cell.
y_pred_train = s1_lr.predict(X_train_ib)
y_pred_test = s1_lr.predict(X_test_ib)

accuracy = accuracy_score(y_train_ib, y_pred_train)
classrpt = classification_report(y_train_ib, y_pred_train, labels=labels)
print(f"Train Accuracy   : {accuracy}", classrpt, "==============================", sep="\n")

accuracy = accuracy_score(y_test_ib, y_pred_test)
classrpt = classification_report(y_test_ib, y_pred_test, labels=labels)
print(f"Test  Accuracy   : {accuracy}", classrpt, sep="\n")

Train Accuracy   : 0.985760360250715
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     48628
           1       0.43      0.15      0.23       671

    accuracy                           0.99     49299
   macro avg       0.71      0.57      0.61     49299
weighted avg       0.98      0.99      0.98     49299

Test  Accuracy   : 0.9858961616735292
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     20810
           1       0.60      0.20      0.30       319

    accuracy                           0.99     21129
   macro avg       0.79      0.60      0.65     21129
weighted avg       0.98      0.99      0.98     21129



In [25]:
# Labels are 0/1, so the sum is the number of test rows predicted as class 1.
print(y_pred_test.sum())

107


## Modeling over Balanced Data

In [26]:
# Run Logistic Regression
# Same model as the baseline, trained on the SMOTE-resampled training split.
# The target is passed as a 1-D Series: handing scikit-learn the single-column
# DataFrame makes it emit a DataConversionWarning (column-vector y).

s2_lr = LogisticRegression()
s2_lr.fit(X_train_bd, y_train_bd["trans_label"])

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
# model evaluation
# Report accuracy and the per-class precision/recall/F1 of the model trained on
# resampled data: first on the resampled training split, then on the original
# (not resampled) test split.
labels = [0, 1]

# Predict both splits up front; y_pred_test is read again by the following cell.
y_pred_train = s2_lr.predict(X_train_bd)
y_pred_test = s2_lr.predict(X_test_ib)

accuracy = accuracy_score(y_train_bd, y_pred_train)
classrpt = classification_report(y_train_bd, y_pred_train, labels=labels)
print(f"Train Accuracy   : {accuracy}", classrpt, "==============================", sep="\n")

accuracy = accuracy_score(y_test_ib, y_pred_test)
classrpt = classification_report(y_test_ib, y_pred_test, labels=labels)
print(f"Test  Accuracy   : {accuracy}", classrpt, sep="\n")

Train Accuracy   : 0.9416311072267064
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     48628
           1       0.85      0.79      0.82      9725

    accuracy                           0.94     58353
   macro avg       0.90      0.88      0.89     58353
weighted avg       0.94      0.94      0.94     58353

Test  Accuracy   : 0.970845757016423
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     20810
           1       0.32      0.82      0.46       319

    accuracy                           0.97     21129
   macro avg       0.66      0.90      0.72     21129
weighted avg       0.99      0.97      0.98     21129



In [28]:
# Labels are 0/1, so the sum is the number of test rows predicted as class 1.
print(y_pred_test.sum())

821
