In [1]:
!pip install scikit-learn pandas numpy




In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.cluster import KMeans
from sklearn.utils import resample


In [4]:
df = pd.read_csv("Creditcard_data.csv")
df.head()



Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
print(df['Class'].value_counts())

Class
0    763
1      9
Name: count, dtype: int64


In [6]:
X = df.drop("Class", axis=1)
y = df["Class"]


In [7]:
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns
)


In [8]:
def random_sampling(X, y, frac=0.7):
    data = pd.concat([X, y], axis=1)
    return data.sample(frac=frac, random_state=42)


In [9]:
def cluster_sampling(X, y, n_clusters=5):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)

    data = X.copy()
    data["Class"] = y
    data["Cluster"] = clusters

    sampled = data.groupby("Cluster").sample(frac=0.5, random_state=42)
    return sampled.drop("Cluster", axis=1)


In [10]:
def bootstrap_sampling(X, y):
    data = pd.concat([X, y], axis=1)
    return resample(
        data,
        replace=True,
        n_samples=len(data),
        random_state=42
    )


In [11]:
def stratified_sampling(X, y):
    data = pd.concat([X, y], axis=1)
    return data.groupby("Class").sample(frac=0.8, random_state=42)


In [12]:
def strategic_sampling(X, y):
    centroid = X.mean(axis=0)
    distances = np.linalg.norm(X - centroid, axis=1)

    data = X.copy()
    data["Class"] = y
    data["distance"] = distances

    return data.nsmallest(
        int(0.7 * len(data)),
        "distance"
    ).drop("distance", axis=1)


In [13]:
models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(n_estimators=100),
    "M4_KNN": KNeighborsClassifier(),
    "M5_SVM": SVC()
}


In [14]:
sampling_methods = {
    "Sampling1_Random": random_sampling,
    "Sampling2_Cluster": cluster_sampling,
    "Sampling3_Bootstrap": bootstrap_sampling,
    "Sampling4_Stratified": stratified_sampling,
    "Sampling5_Strategic": strategic_sampling
}

results = pd.DataFrame(
    index=models.keys(),
    columns=sampling_methods.keys()
)


In [15]:
for s_name, sampler in sampling_methods.items():
    sampled_data = sampler(X_scaled, y)

    X_s = sampled_data.drop("Class", axis=1)
    y_s = sampled_data["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s,
        test_size=0.3,
        random_state=42,
        stratify=y_s
    )

    for m_name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)

        results.loc[m_name, s_name] = round(acc * 100, 2)


In [16]:
results


Unnamed: 0,Sampling1_Random,Sampling2_Cluster,Sampling3_Bootstrap,Sampling4_Stratified,Sampling5_Strategic
M1_LogisticRegression,98.77,99.15,97.84,98.92,99.38
M2_DecisionTree,96.3,99.15,98.71,95.7,99.38
M3_RandomForest,98.77,99.15,100.0,98.92,99.38
M4_KNN,98.77,99.15,98.28,98.92,99.38
M5_SVM,98.77,99.15,98.28,98.92,99.38


In [18]:
best_sampling = results.idxmax(axis=1)
best_sampling


Unnamed: 0,0
M1_LogisticRegression,Sampling5_Strategic
M2_DecisionTree,Sampling5_Strategic
M3_RandomForest,Sampling3_Bootstrap
M4_KNN,Sampling5_Strategic
M5_SVM,Sampling5_Strategic
