# Fixing Imbalance in Classification Dataset
- `Undersampling` -> Randomly remove some samples from the majority class
- `Oversampling` -> Synthetically generate additional samples for the minority class

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the diabetes CSV directly from its raw GitHub URL into a DataFrame.
url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/classfication/diabetes.csv'
df = pd.read_csv(filepath_or_buffer=url)
# Bare expression as the last line so the notebook renders the table.
df

0 - Non-Diabetes
1 - Diabetes

In [None]:
# Bar chart of the number of rows in each `Outcome` class (shows the imbalance).
sns.countplot(x='Outcome', data=df)

In [None]:
from imblearn.under_sampling import (
    RepeatedEditedNearestNeighbours, 
    RandomUnderSampler
)

In [None]:
# Exact per-class row counts of the target column.
df['Outcome'].value_counts()

In [None]:
# Separate the feature columns from the `Outcome` target.
X, y = df.drop('Outcome', axis=1), df['Outcome']
# Seed the sampler so the same majority-class rows are dropped on every run,
# matching the fixed random_state used for the train/test split later on.
us1 = RandomUnderSampler(random_state=0)
Xr, yr = us1.fit_resample(X, y)
# Compare the shapes before and after undersampling.
print(X.shape, y.shape)
print("Undersampled Data sizes")
print(Xr.shape, yr.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Split the ORIGINAL data first so the test set keeps the real-world class
# distribution; `stratify=y` preserves the class ratio in both partitions.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
# Resample only the training fold -- the held-out test rows must stay untouched,
# otherwise the reported metrics describe an artificially balanced population.
Xtrain, ytrain = us1.fit_resample(Xtrain, ytrain)
model = KNeighborsClassifier()
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)
fig, ax = plt.subplots(figsize=(3, 3))
# fmt='d' prints integer counts; the default format would render counts
# of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusion_matrix(ytest, ypred), annot=True, fmt='d', ax=ax, cbar=False)
print(classification_report(ytest, ypred))

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Seed SMOTE so the synthetic minority samples are identical on every run.
smote = SMOTE(random_state=0)
Xs, ys = smote.fit_resample(X, y)
# Compare the shapes before and after oversampling.
print(X.shape, y.shape)
print("Oversampled Data sizes")
print(Xs.shape, ys.shape)

In [None]:
# Split the ORIGINAL data before oversampling. Running SMOTE on the full
# dataset first leaks information: synthetic points interpolated from rows that
# later land in the test set end up in the training set, inflating the scores.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
# Oversample only the training fold; the test set contains real samples only.
Xtrain, ytrain = smote.fit_resample(Xtrain, ytrain)
model = KNeighborsClassifier()
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)
fig, ax = plt.subplots(figsize=(3, 3))
# fmt='d' prints integer counts; the default format would render counts
# of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusion_matrix(ytest, ypred), annot=True, fmt='d', ax=ax, cbar=False)
print(classification_report(ytest, ypred))