In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.utils import resample


In [3]:
df = pd.read_csv("/content/Creditcard_data.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df['Class'].value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


In [5]:
X = df.drop('Class', axis=1)
y = df['Class']


In [6]:
df_majority = df[df.Class == 0]
df_minority = df[df.Class == 1]

df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42
)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

df_balanced['Class'].value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,9
1,9


In [7]:
X_bal = df_balanced.drop('Class', axis=1)
y_bal = df_balanced['Class']


In [8]:
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": KNeighborsClassifier(),
    "M5": GaussianNB()
}


In [9]:
def sampling1_random(X, y):
    return train_test_split(X, y, test_size=0.3, random_state=1)

def sampling2_systematic(X, y, step=2):
    idx = np.arange(0, len(X), step)
    X_s, y_s = X.iloc[idx], y.iloc[idx]
    return train_test_split(X_s, y_s, test_size=0.3, random_state=1)

def sampling3_stratified(X, y):
    return train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)

def sampling4_bootstrap(X, y):
    X_bs, y_bs = resample(X, y, replace=True, random_state=1)
    return train_test_split(X_bs, y_bs, test_size=0.3, random_state=1)

def sampling5_stratified_balanced(X, y):
    return train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)


In [10]:
sampling_methods = {
    "Sampling1": sampling1_random,
    "Sampling2": sampling2_systematic,
    "Sampling3": sampling3_stratified,
    "Sampling4": sampling4_bootstrap,
    "Sampling5": sampling5_stratified_balanced
}


In [11]:
results = pd.DataFrame(index=models.keys(), columns=sampling_methods.keys())


In [12]:
for samp_name, samp_func in sampling_methods.items():
    X_train, X_test, y_train, y_test = samp_func(X_bal, y_bal)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred) * 100
        results.loc[model_name, samp_name] = round(acc, 2)


In [13]:
results


Unnamed: 0,Sampling1,Sampling2,Sampling3,Sampling4,Sampling5
M1,83.33,66.67,50.0,83.33,50.0
M2,83.33,66.67,66.67,66.67,66.67
M3,50.0,66.67,50.0,66.67,50.0
M4,50.0,33.33,50.0,33.33,50.0
M5,66.67,33.33,16.67,66.67,16.67


In [15]:
for model in results.index:
    max_val = results.loc[model].astype(float).max()
    best = results.loc[model][results.loc[model].astype(float) == max_val].index.tolist()
    print(f"{model}: Best Sampling -> {best} (Accuracy = {max_val})")



M1: Best Sampling -> ['Sampling1', 'Sampling4'] (Accuracy = 83.33)
M2: Best Sampling -> ['Sampling1'] (Accuracy = 83.33)
M3: Best Sampling -> ['Sampling2', 'Sampling4'] (Accuracy = 66.67)
M4: Best Sampling -> ['Sampling1', 'Sampling3', 'Sampling5'] (Accuracy = 50.0)
M5: Best Sampling -> ['Sampling1', 'Sampling4'] (Accuracy = 66.67)
