# In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Step 1: Load the dataset from the CSV path and discard rows with missing values
url = "/content/Creditcard_data.csv"
data = pd.read_csv(url).dropna()

# Separate the 'Class' target column from the remaining feature columns
y = data['Class']
X = data.drop(columns='Class')

# Standardize the features, then resample with SMOTE to get X_balanced / y_balanced.
# NOTE(review): both steps are fit on the full dataset, before the train/test
# split performed further down in this script - confirm this ordering is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_scaled, y)

# Wrap the resampled arrays once so every draw below works on the same objects
balanced_features = pd.DataFrame(X_balanced)
balanced_target = pd.Series(y_balanced)

# Sample sizes as percentages of the resampled data (adjust percentages as per formula)
sample_sizes = [int(len(X_balanced) * pct / 100) for pct in (15, 25, 35, 45, 55)]

# One random draw per size (fixed seed), plus the labels at the same row positions
samples = [balanced_features.sample(n=count, random_state=42) for count in sample_sizes]
sample_labels = [balanced_target.iloc[subset.index] for subset in samples]

def _fraction_sampler(fraction):
    """Return a callable that draws `fraction` of a frame's rows with seed 42."""
    def draw(x):
        return x.sample(frac=fraction, random_state=42)
    return draw


# Five "techniques" that differ only in the fraction of rows they keep
sampling_techniques = [
    _fraction_sampler(fraction) for fraction in (0.75, 0.65, 0.5, 0.3, 0.9)
]

# ML models, declared as (estimator class, constructor keyword arguments).
# All are seeded with random_state=42 except KNeighborsClassifier, which is
# only given n_neighbors=5.
_model_specs = [
    (LogisticRegression, {'max_iter': 2000, 'random_state': 42}),
    (RandomForestClassifier, {'random_state': 42}),
    (SVC, {'random_state': 42}),
    (KNeighborsClassifier, {'n_neighbors': 5}),
    (GradientBoostingClassifier, {'random_state': 42}),
]
models = [estimator_cls(**params) for estimator_cls, params in _model_specs]

def _holdout_accuracy(estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit `estimator` on the given rows and return its accuracy on the eval rows."""
    estimator.fit(X_fit, y_fit)
    return accuracy_score(y_eval, estimator.predict(X_eval))


# One record per (sample, sampling technique, model) combination
results = []

for i, (sample, labels) in enumerate(zip(samples, sample_labels)):
    # Hold out 20% of each sample for evaluation (fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(
        sample, labels, test_size=0.2, random_state=42
    )

    for j, technique in enumerate(sampling_techniques):
        # Sub-sample the training rows and pick the labels with matching index
        X_sub = technique(X_train)
        y_sub = y_train.loc[X_sub.index]

        for k, model in enumerate(models):
            score = _holdout_accuracy(model, X_sub, y_sub, X_test, y_test)

            # NOTE(review): chr(77+k) yields letters starting at 'M'
            # (ModelM, ModelN, ...) - confirm these labels are intended.
            record = {
                'Sample': f"Sample{i+1}",
                'Sampling': f"Sampling{j+1}",
                'Model': f"Model{chr(77+k)}",
                'Accuracy': score,
            }
            results.append(record)

# Collect every recorded run into one table
results_df = pd.DataFrame(results)

# Mean accuracy for each (model, sampling technique) pair across all samples
mean_accuracy = results_df.groupby(['Model', 'Sampling'])['Accuracy'].mean().reset_index()

# For each model, keep the row whose mean accuracy is highest
top_row_per_model = mean_accuracy.groupby('Model')['Accuracy'].idxmax()
best_results = mean_accuracy.loc[top_row_per_model]

# Write the per-model winners first, then the full result table
best_results.to_csv("best_sampling_per_model.csv", index=False)
results_df.to_csv("sampling_results.csv", index=False)

