# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE




# Load the data

In [2]:
df0 = pd.read_csv('/content/ImbalancedDataset_v2.csv')
df0.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,SalaryBraket
0,25,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,18,,Some-college,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
3,34,Private,10th,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
4,29,,HS-grad,Never-married,,Unmarried,Black,Male,0,0,40,United-States,<=50K


# Dealing with Missing Values

In [3]:
df0.drop(index = df0[df0.isnull().any(axis=1)].index, inplace = True)
df0.reset_index(drop = True, inplace = True)
df0.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,SalaryBraket
0,25,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,34,Private,10th,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
3,24,Private,Some-college,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
4,55,Private,7th-8th,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


# Scaling Numerical Values

In [4]:
X = df0.iloc[:,:-1].values
y = df0.iloc[:,-1].values

sc  = StandardScaler()
X[:,[0,8,9,10]] = sc.fit_transform(X[:,[0,8,9,10]])

# Encoding Categorical Data

In [5]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,2,3,4,5,6,7,11])], remainder='passthrough')
X = ct.fit_transform(X)

# Encoding Dependent Variable

In [6]:
le = LabelEncoder()
y = le.fit_transform(y)

# Training Dummy Classifier 

In [7]:
clf_dummy = DummyClassifier(strategy="most_frequent")
scoring = ["accuracy", "balanced_accuracy"]
cv_result = cross_validate(clf_dummy, X, y, scoring = scoring)

print(f"Accuracy score of a dummy classifier: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a dummy classifier: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a dummy classifier: 0.9663
Balanced accuracy score of a dummy classifier: 0.500


# Train Logistic Regression

In [8]:
logestic_regression_cl = LogisticRegression(max_iter=1000)
cv_result = cross_validate(logestic_regression_cl, X, y, scoring = scoring)
print(f"Accuracy score of a Logestic Regression classifier: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Logestic Regression classifier: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a Logestic Regression classifier: 0.9699
Balanced accuracy score of a Logestic Regression classifier: 0.577


# Random Forest Classifier

In [9]:
RandomForest_cl = RandomForestClassifier()
cv_result = cross_validate(RandomForest_cl, X, y, scoring = scoring)

print(f"Accuracy score of a Random Forest classifier: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Random Forest classifier: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a Random Forest classifier: 0.9678
Balanced accuracy score of a Random Forest classifier: 0.626


# Let's see the case of Logestic Regression with class_weight

In [10]:
logestic_regression_cl = LogisticRegression(max_iter=1000, class_weight= 'balanced')
cv_result = cross_validate(logestic_regression_cl, X, y, scoring = scoring)
print(f"Accuracy score of a Logestic Regression classifier with class_weight: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Logestic Regression classifier with class_weight: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a Logestic Regression classifier with class_weight: 0.7943
Balanced accuracy score of a Logestic Regression classifier with class_weight: 0.812


# Random Forest Classifier with class_weight

In [11]:
RandomForest_cl = RandomForestClassifier(class_weight= 'balanced')
cv_result = cross_validate(RandomForest_cl, X, y, scoring = scoring)

print(f"Accuracy score of a Random Forest classifier with class_weight: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Random Forest classifier with class_weight: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a Random Forest classifier with class_weight: 0.9591
Balanced accuracy score of a Random Forest classifier with class_weight: 0.623


# Using Undersampling - Logistic Regression


In [12]:
random_sampler = RandomUnderSampler()
X_res, y_res = random_sampler.fit_resample(X,y)

logestic_regression_cl = LogisticRegression(max_iter=1000, class_weight= 'balanced')
cv_result = cross_validate(logestic_regression_cl, X_res, y_res, scoring = scoring)
print(f"Accuracy score of a Logestic Regression classifier with under sampling and class_weight: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Logestic Regression classifier with under sampling and class_weight: {cv_result['test_balanced_accuracy'].mean():0.3f}")



Accuracy score of a Logestic Regression classifier with under sampling and class_weight: 0.8113
Balanced accuracy score of a Logestic Regression classifier with under sampling and class_weight: 0.811


# Using Undersampling - Random Forest

In [13]:
RandomForest_cl = RandomForestClassifier(class_weight= 'balanced')
cv_result = cross_validate(RandomForest_cl, X_res, y_res, scoring = scoring)

print(f"Accuracy score of a Random Forest classifier with under-sampling and class_weight: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Logestic Random Foreset with under-sampling and class_weight: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a Random Forest classifier with under-sampling and class_weight: 0.8045
Balanced accuracy score of a Logestic Random Foreset with under-sampling and class_weight: 0.805


# Using SMOTE - Logistic Regression

In [14]:
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X,y)



In [15]:
logestic_regression_cl = LogisticRegression(max_iter=1000, class_weight= 'balanced')
cv_result = cross_validate(logestic_regression_cl, X_smote, y_smote, scoring = scoring)
print(f"Accuracy score of a Logestic Regression classifier with SMOTE and class_weight: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Logestic Regression classifier  with SMOTE and class_weight: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a Logestic Regression classifier with SMOTE and class_weight: 0.8337
Balanced accuracy score of a Logestic Regression classifier  with SMOTE and class_weight: 0.834


In [16]:
RandomForest_cl = RandomForestClassifier(class_weight= 'balanced')
cv_result = cross_validate(RandomForest_cl, X_smote, y_smote, scoring = scoring)

print(f"Accuracy score of a Random Forest classifier with SMOTE over-sampler and class_weight: {cv_result['test_accuracy'].mean():0.4f}")
print(f"Balanced accuracy score of a Random Forest classifier with SMOTE over-sampler and class_weight: {cv_result['test_balanced_accuracy'].mean():0.3f}")

Accuracy score of a Random Forest classifier with SMOTE over-sampler and class_weight: 0.9714
Balanced accuracy score of a Random Forest classifier with SMOTE over-sampler and class_weight: 0.971
