In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load the raw dataset; the 'Class' column is used as the target below.
df = pd.read_csv("data/Creditcard_data.csv")

# Split the frame into the target series and the remaining feature columns.
y = df['Class']
X = df.drop(columns='Class')

# Balance the class counts with seeded random over-sampling, then glue the
# resampled features and target back together into a single frame.
# NOTE(review): resampling happens before any train/test split, so rows
# duplicated by the over-sampler may later appear on both sides of a split.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)
balanced_df = pd.concat([X_res, y_res], axis='columns')

In [None]:
def get_sampling_techniques(data):
    """Draw five differently-sampled subsets of ``data``.

    The input frame is never modified; every returned sample is a new
    DataFrame with the same columns as ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sample from. Must contain a ``'Class'`` column, which is
        used as the stratification key.

    Returns
    -------
    list of pandas.DataFrame
        Five samples, in this order:

        1. Simple random sample of ``Z^2 * p * (1 - p) / e^2`` rows with
           Z=1.96, p=0.5, e=0.05 (384 rows), capped at ``len(data)``.
        2. Systematic sample: every 5th row, starting at the first.
        3. Stratified sample: 40% of the rows of each ``'Class'`` value.
        4. Cluster sample: rows are cut into 10 contiguous position-based
           clusters and clusters 1, 5 and 8 are kept.
        5. Bootstrap sample: ``len(data)`` rows drawn with replacement.
    """
    n = len(data)

    # S1: Simple Random Sampling (Z=1.96, p=0.5, e=0.05)
    # Cap at n: sampling without replacement cannot return more rows than
    # exist, and pandas raises ValueError if asked to.
    s1_n = min(int((1.96**2 * 0.5 * 0.5) / (0.05**2)), n)
    simple_random = data.sample(n=s1_n, random_state=42)

    # S2: Systematic Sampling
    interval = 5
    systematic = data.iloc[::interval]

    # S3: Stratified Sampling
    # Sample each class group explicitly rather than via groupby.apply:
    # pandas 2.2 deprecates handing the grouping column to the applied
    # function, and every returned sample must keep its 'Class' column.
    strata = [
        group.sample(frac=0.4, random_state=42)
        for _, group in data.groupby('Class')
    ]
    stratified = pd.concat(strata) if strata else data.iloc[0:0]

    # S4: Cluster Sampling
    # Cluster labels live in a local array instead of a temporary column,
    # so the caller's frame is left untouched (including any column that
    # happens to be named 'Cluster').
    cluster_ids = np.repeat(np.arange(10), n // 10 + 1)[:n]
    selected_clusters = [1, 5, 8]
    cluster = data[np.isin(cluster_ids, selected_clusters)]

    # S5: Bootstrap Sampling
    bootstrap = data.sample(frac=1.0, replace=True, random_state=42)

    return [simple_random, systematic, stratified, cluster, bootstrap]

# 4. Initialize Models
# One estimator per label; insertion order here is the row order of the
# results table built at the end.
models = dict(
    M1=LogisticRegression(max_iter=2000),
    M2=RandomForestClassifier(random_state=42),
    M3=SVC(random_state=42),
    M4=DecisionTreeClassifier(random_state=42),
    M5=KNeighborsClassifier(),
)

samples = get_sampling_techniques(balanced_df)
results = {}

# 5. Evaluate
# Every sample gets its own seeded 80/20 split; each model is then refit on
# that split and scored by test accuracy, stored as a percentage.
for i, sample in enumerate(samples, start=1):
    s_name = f"Sampling{i}"
    scores = []
    results[s_name] = scores

    y_s = sample['Class']
    X_s = sample.drop(columns='Class')
    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.2, random_state=42
    )

    for m_name, model in models.items():
        y_pred = model.fit(X_train, y_train).predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        scores.append(acc * 100)

# Create final table
# Rows are models, columns are sampling techniques.
final_table = pd.DataFrame(results, index=models.keys())
print(final_table)
final_table.to_csv("submission_results.csv")