##Uploading Dataset

In [3]:
from google.colab import files
uploaded = files.upload()


Saving Creditcard_data.csv to Creditcard_data (1).csv


In [4]:
import pandas as pd
df = pd.read_csv('Creditcard_data.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


##Balancing the data

In [5]:
df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


1. Undersampling

In [10]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler
X = df.drop(columns=['Class'],axis=1)
y = df['Class']
#balancing the data using random undersampling
rus = RandomUnderSampler(random_state=42, replacement=True)
x_rus, y_rus = rus.fit_resample(X, y)

print('original dataset shape:', y.value_counts())
print('Resample dataset shape', y_rus.value_counts())

original dataset shape: Class
0    763
1      9
Name: count, dtype: int64
Resample dataset shape Class
0    9
1    9
Name: count, dtype: int64


2. Oversampling

In [12]:
#Balancing the data using oversampling
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)


x_ros, y_ros = ros.fit_resample(X, y)
print('original dataset shape:', y.value_counts())
print('Resample dataset shape', y_ros.value_counts())

original dataset shape: Class
0    763
1      9
Name: count, dtype: int64
Resample dataset shape Class
0    763
1    763
Name: count, dtype: int64


In [30]:
os_data = pd.concat([x_ros, y_ros], axis=1)

##Sampling Methods

In [27]:
#calculating sample size
import numpy as np
Z = 1.96
p = 0.5
E = 0.05
C = 50
S = 30

def simple_random_sample_size(Z, p, E):
    return int((Z**2 * p * (1 - p)) / (E**2))
def cluster_sample_size(Z, p, E, C):
    return int((Z**2 * p * (1 - p)) / ((E / C)**2))
def stratified_sample_size(Z, p, E, S):
    return int((Z**2 * p * (1 - p)) / ((E / np.sqrt(S))**2))
srs_size = simple_random_sample_size(Z, p, E)
cs_size = cluster_sample_size(Z, p, E, C)
ss_size = stratified_sample_size(Z, p, E, S)


In [33]:
def simple_random_sampling(data, sample_size):
    return data.sample(n=sample_size, random_state=42)

def bootstrap_sampling(data, n_iterations):
    return [data.sample(frac=1, replace=True, random_state=i) for i in range(n_iterations)]

def cluster_sampling(data, cluster_size):
    clusters = data.groupby(data.index // cluster_size)
    sampled_clusters = clusters.apply(lambda x: x.sample(n=min(len(x), 5), random_state=42))
    return sampled_clusters.reset_index(drop=True)

def stratified_sampling(data, target_column, sample_size):
    min_group_size = data[target_column].value_counts().min()
    sample_size_per_group = min(int(sample_size / len(data[target_column].unique())), min_group_size)
    return data.groupby(target_column, group_keys=False).apply(lambda x: x.sample(sample_size_per_group, random_state=42))
def systematic_sampling(data, step):
    return data.iloc[::step]

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

sampling_methods = {
    "simple_random": simple_random_sampling(os_data, srs_size),
    "bootstrap": bootstrap_sampling(os_data, n_iterations=10)[0],
    "cluster": cluster_sampling(os_data, cs_size),
    "stratified": stratified_sampling(os_data, 'Class', ss_size),
    "systematic": systematic_sampling(os_data, step=10)
}

models = {
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(),
    "LogisticRegression": LogisticRegression(),
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier()
}



  return data.groupby(target_column, group_keys=False).apply(lambda x: x.sample(sample_size_per_group, random_state=42))


In [35]:
results = []

for method_name, sampled_data in sampling_methods.items():

    X_train, X_test, y_train, y_test = train_test_split(x_ros, y_ros, test_size=0.3, random_state=42, stratify=y_ros)

    for model_name, model in models.items():
        pipeline = Pipeline([
            ('model', model)
        ])

        if len(np.unique(y_train)) > 1:
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            results.append([method_name, model_name, accuracy])
        else:
            print(f"Skipping {method_name}_{model_name} due to insufficient classes in training data.")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [40]:
results_df = pd.DataFrame(results, columns=['Sampling Method', 'Model', 'Accuracy'])
results_matrix = results_df.pivot(index='Sampling Method', columns='Model', values='Accuracy')
results_matrix

Model,DecisionTree,LogisticRegression,NaiveBayes,RandomForest,SVM
Sampling Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bootstrap,0.989083,0.914847,0.783843,1.0,0.751092
cluster,0.991266,0.914847,0.783843,1.0,0.751092
simple_random,0.984716,0.914847,0.783843,1.0,0.751092
stratified,0.984716,0.914847,0.783843,0.997817,0.751092
systematic,0.989083,0.914847,0.783843,1.0,0.751092
