In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

# ---------------------------
# Step 1: Load Dataset
# ---------------------------
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)

print("Original Class Distribution:")
print(df['Class'].value_counts())

# ---------------------------
# Step 2: Prepare Data
# ---------------------------
X = df.drop('Class', axis=1)
y = df['Class']

scaler = StandardScaler()
X = scaler.fit_transform(X)

# ---------------------------
# Step 3: Define Sampling Methods
# ---------------------------
sampling_methods = {
    "Sampling1_RandomUnder": RandomUnderSampler(),
    "Sampling2_RandomOver": RandomOverSampler(),
    "Sampling3_SMOTE": SMOTE(),
    "Sampling4_Tomek": TomekLinks(),
    "Sampling5_SMOTETomek": SMOTETomek()
}

# ---------------------------
# Step 4: Define Models
# ---------------------------
models = {
    "M1_Logistic": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_KNN": KNeighborsClassifier(),
    "M5_SVM": SVC()
}

results = pd.DataFrame(index=models.keys(), columns=sampling_methods.keys())

# ---------------------------
# Step 5: Apply Sampling + Train Models
# ---------------------------
for samp_name, sampler in sampling_methods.items():
    X_res, y_res = sampler.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=0.2, random_state=42
    )

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds) * 100
        results.loc[model_name, samp_name] = round(acc, 2)

# ---------------------------
# Step 6: Show Results
# ---------------------------
print("\nAccuracy Comparison Table:\n")
print(results)

# ---------------------------
# Step 7: Find Best Combination
# ---------------------------
best_model = results.stack().idxmax()
best_accuracy = results.stack().max()

print("\nBest Combination:")
print("Model:", best_model[0])
print("Sampling Technique:", best_model[1])
print("Accuracy:", best_accuracy)


Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64

Accuracy Comparison Table:

                Sampling1_RandomUnder Sampling2_RandomOver Sampling3_SMOTE  \
M1_Logistic                      25.0                93.14           93.46   
M2_DecisionTree                   0.0                99.02           97.06   
M3_RandomForest                   0.0                100.0           99.35   
M4_KNN                            0.0                96.41           94.44   
M5_SVM                            0.0                96.08           96.73   

                Sampling4_Tomek Sampling5_SMOTETomek  
M1_Logistic               97.42                93.14  
M2_DecisionTree           97.42                97.71  
M3_RandomForest           98.71                99.35  
M4_KNN                    98.71                94.44  
M5_SVM                    98.71                96.73  

Best Combination:
Model: M3_RandomForest
Sampling Technique: Sampling2_RandomOver
Accuracy: 