In [None]:
import pandas as pd

import os

# Location of the credit-card dataset. The default is the original local
# file; set the CREDITCARD_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "CREDITCARD_DATA_PATH",
    r"C:\Users\Yashika\Desktop\Sampling\Creditcard_data.csv",
)

df = pd.read_csv(DATA_PATH)
df.head()  # preview the first rows as the cell output


In [None]:
# Check class imbalance: number of rows for each value of the target column.
class_counts = df["Class"].value_counts()
class_counts

In [None]:
from sklearn.utils import resample

# Split the frame by target value: Class == 0 is treated as the majority
# class and Class == 1 as the minority class.
is_majority = df["Class"] == 0
is_minority = df["Class"] == 1
df_majority = df[is_majority]
df_minority = df[is_minority]

# Undersample: draw (without replacement) as many majority rows as there are
# minority rows; the fixed seed keeps the draw repeatable.
df_majority_downsampled = resample(
    df_majority,
    n_samples=len(df_minority),
    replace=False,
    random_state=42,
)

# Stack the two groups (downsampled majority rows first) into one frame.
balanced_parts = [df_majority_downsampled, df_minority]
df_balanced = pd.concat(balanced_parts)


In [None]:
# Verify the undersampling: rows per class in the balanced frame.
balanced_counts = df_balanced["Class"].value_counts()
balanced_counts

In [None]:
# Separate the target column from the feature columns of the balanced frame.
target_column = "Class"
y = df_balanced[target_column]
X = df_balanced.drop(columns=target_column)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Candidate classifiers, keyed by the short labels used in the results table.
# The tree-based models draw random numbers while fitting, so they get a fixed
# seed (the same random_state=42 used elsewhere in this notebook); without it
# their accuracies would change from one run to the next.
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(random_state=42),
    "M3": RandomForestClassifier(random_state=42),
    "M4": GaussianNB(),
    "M5": SVC(),
}


In [None]:
#Random Sampling
from sklearn.model_selection import train_test_split

# Registry of train/test splits, keyed by sampling-method name. Each value is
# what train_test_split returns: X_train, X_test, y_train, y_test.
samples = {}

# Simple random sampling: a plain 70/30 split with a fixed seed.
random_split = train_test_split(X, y, random_state=42, test_size=0.3)
samples["Random Sampling"] = random_split



In [None]:
# Stratified sampling: 70/30 split that passes y as the stratification labels.
stratified_split = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)
samples["Stratified Sampling"] = stratified_split


In [None]:
#Cluster Sampling
from sklearn.cluster import KMeans
import numpy as np

# Group the rows into 5 clusters with k-means (seeded for repeatability).
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)

# Attach each row's cluster label and target value; work on a copy so X
# itself does not gain a "cluster" column.
X_clustered = X.copy()
X_clustered["cluster"] = clusters

df_clustered = pd.concat([X_clustered, y], axis=1)

# Randomly select 2 whole clusters. A seeded generator keeps the chosen
# clusters (and therefore every Cluster Sampling accuracy) the same on every
# run; sorting the labels makes the draw independent of the row order.
cluster_rng = np.random.default_rng(42)
cluster_labels = np.sort(df_clustered["cluster"].unique())
selected_clusters = cluster_rng.choice(cluster_labels, size=2, replace=False)
df_sampled = df_clustered[df_clustered["cluster"].isin(selected_clusters)]

# NOTE(review): nothing here guarantees that both classes are present in the
# selected clusters -- confirm on the real data.
X_c = df_sampled.drop(["Class", "cluster"], axis=1)
y_c = df_sampled["Class"]

samples["Cluster Sampling"] = train_test_split(
    X_c, y_c, test_size=0.3, random_state=42
)



In [None]:
#K - Fold Sampling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def kfold_accuracy(model, X, y):
    """Return the mean accuracy of ``model`` over 5 stratified folds.

    The data is split with a shuffled, seeded ``StratifiedKFold``; for each
    fold the model is fitted on the training rows and scored with
    ``accuracy_score`` on the held-out rows.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. ``fit``
            is called on this same object once per fold.
        X: Feature table; rows are selected with ``.iloc``.
        y: Target values; rows are selected with ``.iloc``.

    Returns:
        The arithmetic mean of the per-fold accuracy scores.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    def score_fold(fit_rows, eval_rows):
        # Fit on this fold's training rows, then score on its held-out rows.
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predicted = model.predict(X.iloc[eval_rows])
        return accuracy_score(y.iloc[eval_rows], predicted)

    fold_scores = [
        score_fold(fit_rows, eval_rows)
        for fit_rows, eval_rows in splitter.split(X, y)
    ]
    return sum(fold_scores) / len(fold_scores)



In [None]:
#Bootstrap Sampling
from sklearn.utils import resample

# Hold out the test rows BEFORE bootstrapping. Resampling with replacement
# duplicates rows, so splitting afterwards would put copies of the same row
# in both the training and the test set and inflate the measured accuracy.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Bootstrap sample (drawn with replacement from the training partition only)
# used for fitting; the held-out rows are left untouched.
X_boot, y_boot = resample(
    X_fit, y_fit, replace=True, random_state=42
)

# Same layout as a train_test_split result: X_train, X_test, y_train, y_test.
samples["Bootstrap Sampling"] = [X_boot, X_holdout, y_boot, y_holdout]


In [None]:
from sklearn.metrics import accuracy_score

# One record per (sampling method, model) pair; tabulated in a later cell.
results = []

for sample_name, sample_data in samples.items():
    # K-Fold is handled separately, so only the other entries are processed
    # here.
    if sample_name != "K-Fold Sampling":
        X_train, X_test, y_train, y_test = sample_data

        for model_name, model in models.items():
            # Fit the estimator on this split's training part, then score its
            # predictions on the split's test part.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)

            record = dict(
                zip(
                    ("Sampling Method", "Model", "Accuracy"),
                    (sample_name, model_name, round(acc, 4)),
                )
            )
            results.append(record)


In [None]:
# Add the K-Fold rows: each model's mean cross-validated accuracy on X and y,
# recorded with the same keys as the other result entries.
for model_name, model in models.items():
    acc = kfold_accuracy(model, X, y)
    kfold_record = dict(
        [
            ("Sampling Method", "K-Fold Sampling"),
            ("Model", model_name),
            ("Accuracy", round(acc, 4)),
        ]
    )
    results.append(kfold_record)


In [None]:
# Tabulate the collected records: one row per sampling-method/model pair.
results_df = pd.DataFrame(data=results)
results_df  # display the table as the cell output


In [None]:
# Persist the accuracy table to disk; index=False leaves out the row index.
results_df.to_csv(
    path_or_buf="sampling_model_accuracy_results.csv",
    index=False,
)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the saved accuracy table for plotting.
# NOTE: this rebinds `df`, which earlier in the notebook held the raw
# credit-card data.
results_csv = "sampling_model_accuracy_results.csv"
df = pd.read_csv(results_csv)


In [None]:
# Draw one bar chart per sampling method, comparing the accuracy of each model.
sampling_methods = df["Sampling Method"].unique()

for method in sampling_methods:
    # Rows of the results table that belong to the current sampling method.
    subset = df.loc[df["Sampling Method"] == method]

    fig, ax = plt.subplots()
    ax.bar(subset["Model"], subset["Accuracy"])
    ax.set_xlabel("Models")
    ax.set_ylabel("Accuracy")
    ax.set_title(f"Model Comparison for {method}")
    ax.tick_params(axis="x", labelrotation=45)
    fig.tight_layout()
    plt.show()
