In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

# Load dataset
file_path = 'Creditcard_data.csv'  # Update with your dataset path
data = pd.read_csv(file_path)

# Check class distribution
print("Class distribution before balancing:", data['Class'].value_counts())

# Balancing the dataset using SMOTE
X = data.drop(columns=['Class'])
y = data['Class']
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

print("Class distribution after balancing:", pd.Series(y_balanced).value_counts())

# Define sampling techniques
def systematic_sampling(data, step):
    return data.iloc[::step, :]

samples = []
sample_size = int(len(X_balanced) * 0.1)  # 10% sample size for simplicity
random_states = [42, 7, 21, 35, 99]

for i, seed in enumerate(random_states):
    if i == 2:  # Systematic sampling
        samples.append(systematic_sampling(pd.concat([X_balanced, y_balanced], axis=1), step=10))
    else:
        samples.append(
            pd.concat([X_balanced, y_balanced], axis=1).sample(n=sample_size, random_state=seed)
        )

# Machine learning models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Evaluate models
accuracy_matrix = pd.DataFrame(columns=["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"],
                               index=models.keys())

for model_name, model in models.items():
    accuracies = []
    for i, sample in enumerate(samples):
        X_sample = sample.iloc[:, :-1]
        y_sample = sample.iloc[:, -1]
        
        X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
    
    accuracy_matrix.loc[model_name] = accuracies

# Display results
print("Accuracy Matrix:\n", accuracy_matrix)

# Save results
accuracy_matrix.to_csv("sampling_results.csv")
print("Results saved to 'sampling_results.csv'")

Class distribution before balancing: Class
0    763
1      9
Name: count, dtype: int64
Class distribution after balancing: Class
0    763
1    763
Name: count, dtype: int64
Accuracy Matrix:
                     Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
Logistic Regression  0.913043  0.826087  0.804348  0.913043  0.934783
Decision Tree        0.869565   0.76087  0.891304  0.956522  0.826087
Random Forest             1.0  0.913043  0.956522       1.0  0.978261
SVM                  0.891304  0.782609  0.826087  0.847826  0.891304
KNN                  0.630435  0.804348  0.586957  0.782609   0.73913
Results saved to 'sampling_results.csv'
