# Under-sampling and Over-sampling


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import (
    RandomUnderSampler,  # randomly removes observations until the dataset is balanced (may discard informative samples)
    CondensedNearestNeighbour,  # 1-NN rule: keeps the observations a 1-NN classifier gets wrong and drops the rest; iterative, so it can be slow
    TomekLinks,  # finds closest minority/majority pairs (Tomek links); removes the majority member only, or both members
    OneSidedSelection,  # essentially CondensedNearestNeighbour and TomekLinks combined
    EditedNearestNeighbours,  # cleans noise with 3-NN: removes observations whose neighbours belong to another class
    RepeatedEditedNearestNeighbours,  # same as above, but repeats the procedure for as long as something gets removed
    AllKNN  # repeats the cleaning with a growing neighbourhood: 1-NN, then 2-NN, ... up to n_neighbors
)

from imblearn.over_sampling import (
    RandomOverSampler,  # draws observations at random from the minority class until a given balancing ratio (usually 1) is reached; prone to overfitting
    SMOTE  # creates new synthetic observations between a minority sample and its nearest neighbours (5-NN by default)
)

## Load data

In [None]:
# Mount Google Drive inside the Colab runtime, then read the dataset
# from the mounted "MyDrive" folder.
from google.colab import drive

drive.mount('/content/drive')

csv_path = '/content/drive/MyDrive/protein_homology_prediction.csv'
data = pd.read_csv(csv_path)

# preview the first rows
data.head()

In [None]:
# size of data and the names of its columns
# A notebook cell only echoes its final expression, so the two values are
# combined into one tuple; as separate statements the shape is never shown.
data.shape, data.columns

## Imbalanced target

In [None]:
# Fraction of rows belonging to each class.
# "target" is the name of the column containing the targets.
class_counts = data['target'].value_counts()
class_counts / len(data)

## Separate train and test

In [None]:
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
feature_frame = data.drop(columns=['target'])  # every column except the target
target_series = data['target']  # just the target

X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    random_state=0,
    test_size=0.3,
)

# shapes of the two feature sets
X_train.shape, X_test.shape

## Random Undersampling


In [None]:
# Random under-sampling:
#   replacement=True         -> resample with replacement
#   random_state=0           -> reproducible draws
#   sampling_strategy='auto' -> samples only from the majority class
rus = RandomUnderSampler(
    replacement=True,
    random_state=0,
    sampling_strategy='auto',
)

X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

## Condensed Nearest Neighbour

In [None]:
# Condensed Nearest Neighbour (expect a wait of roughly 20-25 seconds):
#   n_jobs=4                 -> number of cores used to compute
#   n_neighbors=1            -> the default
#   random_state=0           -> for reproducibility
#   sampling_strategy='auto' -> undersamples only the majority class
cnn = CondensedNearestNeighbour(
    n_jobs=4,
    n_neighbors=1,
    random_state=0,
    sampling_strategy='auto',
)

X_resampled, y_resampled = cnn.fit_resample(X_train, y_train)

## TomekLinks

In [None]:
# Tomek links:
#   n_jobs=4                 -> number of cores used to compute
#   sampling_strategy='auto' -> undersamples only the majority class
tl = TomekLinks(
    n_jobs=4,
    sampling_strategy='auto',
)

X_resampled, y_resampled = tl.fit_resample(X_train, y_train)

## Edited Nearest Neighbours

In [None]:
# Edited Nearest Neighbours:
#   kind_sel='all'           -> all neighbours must share the label of the
#                               observation examined
#   n_neighbors=3            -> the number of neighbours to examine
#   n_jobs=4                 -> number of cores used to compute
#   sampling_strategy='auto' -> undersamples only the majority class
enn = EditedNearestNeighbours(
    kind_sel='all',
    n_neighbors=3,
    n_jobs=4,
    sampling_strategy='auto',
)

X_resampled, y_resampled = enn.fit_resample(X_train, y_train)

## Repeated Edited Nearest Neighbours





In [None]:
# Repeated Edited Nearest Neighbours:
#   max_iter=100             -> maximum number of iterations
#   kind_sel='all'           -> all neighbouring observations should show
#                               the same class
#   n_neighbors=3            -> the number of neighbours to examine
#   n_jobs=4                 -> number of cores used to compute
#   sampling_strategy='auto' -> removes only from the majority class
renn = RepeatedEditedNearestNeighbours(
    max_iter=100,
    kind_sel='all',
    n_neighbors=3,
    n_jobs=4,
    sampling_strategy='auto',
)

X_resampled, y_resampled = renn.fit_resample(X_train, y_train)

## One-sided Selection

In [None]:
# One-sided selection:
#   n_jobs=4                 -> number of cores used to compute
#   n_neighbors=1            -> default; used to find the hardest instances
#   random_state=0           -> for reproducibility
#   sampling_strategy='auto' -> undersamples only the majority class
oss = OneSidedSelection(
    n_jobs=4,
    n_neighbors=1,
    random_state=0,
    sampling_strategy='auto',
)

X_resampled, y_resampled = oss.fit_resample(X_train, y_train)

## All K-NN

In [None]:
# All K-NN:
#   kind_sel='all'           -> all neighbours must share the label of the
#                               observation examined
#   n_neighbors=5            -> the maximum size of the neighbourhood to examine
#   n_jobs=4                 -> number of cores used to compute
#   sampling_strategy='auto' -> undersamples only the majority class
allknn = AllKNN(
    kind_sel='all',
    n_neighbors=5,
    n_jobs=4,
    sampling_strategy='auto',
)

X_resampled, y_resampled = allknn.fit_resample(X_train, y_train)

## Random Over-sampling

In [None]:
# Random over-sampling:
#   random_state=0           -> for reproducibility
#   sampling_strategy='auto' -> samples only the minority class
ros = RandomOverSampler(
    random_state=0,
    sampling_strategy='auto',
)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

## Random Over-sampling with smoothing

In [None]:
# Random over-sampling with smoothing: same set-up as the plain random
# over-sampler, plus a shrinkage value handed to the sampler.
shrink = 2

#   shrinkage=shrink         -> the smoothing value defined above
#   random_state=0           -> for reproducibility
#   sampling_strategy='auto' -> samples only the minority class
ros = RandomOverSampler(
    shrinkage=shrink,
    random_state=0,
    sampling_strategy='auto',
)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

## SMOTE

In [None]:
# SMOTE:
#   k_neighbors=5            -> size of the neighbourhood passed to SMOTE
#   random_state=0           -> for reproducibility
#   sampling_strategy='auto' -> samples only the minority class
sm = SMOTE(
    k_neighbors=5,
    random_state=0,
    sampling_strategy='auto',
)

X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

## New size of the resampled dataset


In [None]:
# Shapes of the resampled features and target, i.e. the output of
# whichever resampler cell assigned X_resampled / y_resampled last.
tuple(part.shape for part in (X_resampled, y_resampled))

In [None]:
# number of observations per class in the original training target
# (value_counts reports every class, not only the positive one)
y_train.value_counts()

In [None]:
# number of observations per class in the resampled target
# (value_counts reports every class, not only the positive one)
y_resampled.value_counts()

## Plot data

Let's compare how the data looks before and after resampling (under- or over-sampling, depending on which resampler cell was run last).

In [None]:
# Scatter features "0" and "1" for a fixed-seed random subset of 1784 rows
# of the original data, coloured by class.
data_sample = data.sample(n=1784, random_state=0)

sns.scatterplot(
    x="0",
    y="1",
    hue="target",
    data=data_sample,
)


In [None]:
# Rebuild one frame from the resampled features and target so it can be
# plotted the same way as the original data.
# The labels are "0", "1", ... (one per feature) followed by 'target'.
# The feature count is read from X_resampled instead of being hard-coded
# to 74, so the column assignment also works for other feature counts.
n_features = X_resampled.shape[1]
col_names = [str(i) for i in range(n_features)] + ['target']

data_resampled = pd.concat([X_resampled, y_resampled], axis=1)
data_resampled.columns = col_names

# features "0" and "1" of the resampled data, coloured by class
sns.scatterplot(data=data_resampled, x="0", y="1", hue="target")

In [None]:
# Scatter features "4" and "5" for a fixed-seed random subset of 1784 rows
# of the original data, coloured by class.
data_sample = data.sample(n=1784, random_state=0)

sns.scatterplot(
    x="4",
    y="5",
    hue="target",
    data=data_sample,
)

In [None]:
# features "4" and "5" of the resampled data, coloured by class
sns.scatterplot(
    x="4",
    y="5",
    hue="target",
    data=data_resampled,
)

## Machine learning performance comparison

Let's compare model performance with and without resampling (under- or over-sampling, depending on which resampler cell was run last).

In [None]:
# Train a random forest and report its ROC-AUC on the train and test sets.

def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training data and print ROC-AUC scores.

    The forest uses 200 trees of maximum depth 4 and a fixed random seed.
    It is fitted on ``X_train`` / ``y_train``. The ROC-AUC is then printed
    for the training set and for the test set, in that order, using the
    predicted probabilities in column 1 of ``predict_proba``.

    Nothing is returned; the results are only printed.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    forest.fit(X_train, y_train)

    evaluation_sets = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for header, features, labels in evaluation_sets:
        print(header)
        # column 1 of predict_proba is used as the score
        scores = forest.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(labels, scores)))

In [None]:
# Baseline: random forest trained on the original (imbalanced) training
# set and scored on the test set.
run_randomForests(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Same model trained on the resampled training set (the output of whichever
# resampler cell assigned X_resampled / y_resampled last) and scored on the
# same, untouched test set.
run_randomForests(X_train=X_resampled, X_test=X_test, y_train=y_resampled, y_test=y_test)