**Fixing Imbalances**



In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/content/Creditcard_data.csv')

class_counts = df['Class'].value_counts()

print(class_counts)

0    763
1      9
Name: Class, dtype: int64


In [3]:
from imblearn.over_sampling import RandomOverSampler
target_var = 'Class'
oversampling_rate = 1.0
ros = RandomOverSampler(sampling_strategy=oversampling_rate, random_state=42)
X = df.drop(target_var, axis=1)
y = df[target_var]
X_resampled, y_resampled = ros.fit_resample(X, y)
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)


In [4]:
num_rows, num_cols = df.shape
print("Number of rows in original df: ", num_rows)
print("Number of columns in original df: ", num_cols)

Number of rows in original df:  772
Number of columns in original df:  31


In [5]:
num_rows, num_cols = df_resampled.shape
print("Number of rows in balanced df: ", num_rows)
print("Number of columns in balanced df: ", num_cols)

Number of rows in balanced df:  1526
Number of columns in balanced df:  31


In [6]:
class_counts = df_resampled['Class'].value_counts()

print(class_counts)

0    763
1    763
Name: Class, dtype: int64


In [7]:
shuffled_df = df_resampled.sample(frac=1, random_state=1).reset_index(drop=True)
shuffled_df.to_csv('balanced_df.csv')

Performing Simple Random Sampling

In [8]:
import random
n = int(1526/2)

df_srs = pd.read_csv('balanced_df.csv')
df_srs = df_srs.sample(n)
num_rows, num_cols = df_srs.shape
print("Number of rows in Simple Random Sampling df: ", num_rows)
print("Number of columns in Simple Random Sampling df: ", num_cols)
df_srs.to_csv('df_srs.csv')

Number of rows in Simple Random Sampling df:  763
Number of columns in Simple Random Sampling df:  32


Systematic Sampling

In [9]:
n = int(1526/2)
df = pd.read_csv('balanced_df.csv')
k = int(len(df) / n)
start_idx = k // 2  
idx = range(start_idx, len(df), k)
df_sys_s = df.iloc[idx]
df_sys_s.to_csv('df_sys_s.csv')

Clustering Sampling

In [10]:
import pandas as pd
import random

df = pd.read_csv('/content/balanced_df.csv')

sample_size = 2
selected_clusters = random.sample(list(df['Class'].unique()), sample_size)

proportion = 0.5

sampled_df = df[df['Class'].isin(selected_clusters)].groupby('Class').apply(lambda x: x.sample(frac=proportion))


sampled_df.reset_index(drop=True, inplace=True)


sampled_df.to_csv('df_cluster_s.csv')


Stratified Sampling

In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
df = pd.read_csv('/content/balanced_df.csv')
strat_var = 'Class'
test_size = 0.5
splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
for train_idx, test_idx in splitter.split(df, df[strat_var]):
    train_set = df.loc[train_idx]
    test_set = df.loc[test_idx]
train_set.to_csv('df_stratified_s.csv')

Training 

In [15]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:

!pip uninstall scikit-learn -y

!pip install scikit-learn==0.23.2


Found existing installation: scikit-learn 0.23.2
Uninstalling scikit-learn-0.23.2:
  Successfully uninstalled scikit-learn-0.23.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp38-cp38-manylinux1_x86_64.whl (6.8 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.23.2


Running all models and comparing the results

1.On Random Sampled Data

In [1]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_srs.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9981,1.0,1.0,0.9963,0.9981,0.9962,0.9963,0.471
rf,Random Forest Classifier,0.9962,1.0,1.0,0.9927,0.9963,0.9925,0.9926,0.221
gbc,Gradient Boosting Classifier,0.9943,1.0,1.0,0.9892,0.9945,0.9887,0.9889,0.412
lightgbm,Light Gradient Boosting Machine,0.9943,1.0,1.0,0.9892,0.9945,0.9887,0.9889,0.296
dt,Decision Tree Classifier,0.9925,0.9925,1.0,0.9857,0.9927,0.9849,0.9853,0.02
ada,Ada Boost Classifier,0.9925,1.0,1.0,0.9856,0.9927,0.985,0.9852,0.12
knn,K Neighbors Classifier,0.942,0.9852,1.0,0.8977,0.9455,0.8841,0.8911,0.022
lr,Logistic Regression,0.9383,0.9625,1.0,0.892,0.9425,0.8766,0.8842,0.658
lda,Linear Discriminant Analysis,0.8803,0.9421,0.9179,0.8556,0.885,0.7606,0.764,0.038
ridge,Ridge Classifier,0.8747,0.0,0.9179,0.8485,0.8808,0.7493,0.754,0.012


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=900, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=900, verbose=0,
                     warm_start=False)


2.On Systematic Sampled Data

In [2]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_sys_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9981,1.0,1.0,0.9964,0.9982,0.9963,0.9964,0.389
ada,Ada Boost Classifier,0.9981,1.0,1.0,0.9963,0.9981,0.9962,0.9963,0.149
et,Extra Trees Classifier,0.9981,1.0,1.0,0.9964,0.9982,0.9963,0.9964,0.176
gbc,Gradient Boosting Classifier,0.9944,1.0,1.0,0.9892,0.9945,0.9887,0.9889,0.246
dt,Decision Tree Classifier,0.9906,0.9907,1.0,0.9819,0.9907,0.9812,0.9817,0.023
lightgbm,Light Gradient Boosting Machine,0.9888,1.0,1.0,0.9786,0.989,0.9776,0.9781,0.091
knn,K Neighbors Classifier,0.972,0.985,1.0,0.9495,0.9735,0.9439,0.9467,0.039
lr,Logistic Regression,0.9345,0.96,1.0,0.8862,0.9389,0.869,0.8779,0.286
lda,Linear Discriminant Analysis,0.8819,0.9425,0.9056,0.8683,0.8848,0.7638,0.7677,0.015
ridge,Ridge Classifier,0.88,0.0,0.9056,0.8629,0.8825,0.7602,0.7635,0.019


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2391, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2391, verbose=0,
                       warm_start=False)


3.On Clustering Sampled Data

In [3]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_cluster_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.22
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.098
rf,Random Forest Classifier,0.9981,1.0,1.0,0.9963,0.9981,0.9962,0.9963,0.491
gbc,Gradient Boosting Classifier,0.9981,1.0,1.0,0.9963,0.9981,0.9962,0.9963,0.429
ada,Ada Boost Classifier,0.9962,1.0,1.0,0.9926,0.9962,0.9925,0.9926,0.286
dt,Decision Tree Classifier,0.9888,0.989,1.0,0.9783,0.9889,0.9777,0.9782,0.015
knn,K Neighbors Classifier,0.9607,0.9798,1.0,0.9277,0.962,0.9214,0.9252,0.025
lr,Logistic Regression,0.9401,0.9634,1.0,0.892,0.9427,0.8805,0.8872,0.185
lda,Linear Discriminant Analysis,0.9214,0.9423,1.0,0.8636,0.9263,0.8433,0.8547,0.024
ridge,Ridge Classifier,0.8878,0.0,0.9473,0.8468,0.8915,0.7763,0.7878,0.014


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=8039, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=8039, verbose=0,
                     warm_start=False)


4.On Stratified Sampled Data

In [4]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_stratified_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.221
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.318
lightgbm,Light Gradient Boosting Machine,0.9963,1.0,1.0,0.9929,0.9964,0.9925,0.9927,0.314
gbc,Gradient Boosting Classifier,0.9944,1.0,1.0,0.9893,0.9945,0.9887,0.9889,0.249
ada,Ada Boost Classifier,0.9925,1.0,1.0,0.9857,0.9927,0.985,0.9853,0.125
dt,Decision Tree Classifier,0.9776,0.9774,1.0,0.9583,0.9785,0.9552,0.9566,0.016
knn,K Neighbors Classifier,0.9606,0.9866,1.0,0.9302,0.9632,0.921,0.9251,0.036
lr,Logistic Regression,0.9216,0.9405,1.0,0.8694,0.9291,0.8429,0.8554,0.971
lda,Linear Discriminant Analysis,0.8747,0.9261,0.9444,0.835,0.8837,0.7493,0.7622,0.018
ridge,Ridge Classifier,0.8465,0.0,0.8889,0.8297,0.8535,0.6929,0.7032,0.012


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=6415, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=6415, verbose=0,
                       warm_start=False)
