In [3]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


data = pd.read_csv('Creditcard_data.csv')

X = data.drop('Class', axis=1)
y = data['Class']

# 2. Balancing the dataset using SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)
balanced_df = pd.concat([pd.DataFrame(X_balanced), pd.Series(y_balanced, name='Class')], axis=1)

print(f"Original class distribution:\n{y.value_counts()}")
print(f"Balanced class distribution:\n{y_balanced.value_counts()}")


n = 300

# 3. Five Different Samples
samples = {}

# Sampling 1: Simple Random Sampling
samples['Sampling1'] = balanced_df.sample(n=n, random_state=1)

# Sampling 2: Systematic Sampling
k = len(balanced_df) // n
samples['Sampling2'] = balanced_df.iloc[::k][:n]

# Sampling 3: Stratified Sampling

samples['Sampling3'] = balanced_df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n=n//2, random_state=3))

# Sampling 4: Cluster Sampling
# We'll create clusters based on 'Time' blocks for demonstration
balanced_df['Cluster'] = pd.qcut(balanced_df['Time'], q=10, labels=False)
selected_clusters = np.random.choice(balanced_df['Cluster'].unique(), size=5, replace=False)
samples['Sampling4'] = balanced_df[balanced_df['Cluster'].isin(selected_clusters)].sample(n=n, random_state=4)

# Sampling 5: Bootstrap Sampling (Random Sampling with Replacement)
samples['Sampling5'] = balanced_df.sample(n=n, replace=True, random_state=5)

# 4. ML Models
models = {
    'M1': LogisticRegression(max_iter=1000),
    'M2': RandomForestClassifier(random_state=42),
    'M3': SVC(),
    'M4': DecisionTreeClassifier(random_state=42),
    'M5': KNeighborsClassifier()
}

# 5.  Models evaluation
results = pd.DataFrame(index=models.keys(), columns=samples.keys())

for s_name, sample_data in samples.items():
    #  train/test split
    X_s = sample_data.drop(['Class', 'Cluster'], axis=1, errors='ignore')
    y_s = sample_data['Class']
    X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2, random_state=42)

    for m_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results.loc[m_name, s_name] = accuracy_score(y_test, y_pred)

# 6. Determine which sampling technique gives higher accuracy for each model
print("\n--- Accuracy Matrix ---")
print(results)

best_per_model = results.idxmax(axis=1)
print("\n--- Best Sampling Technique per Model ---")
for model, best_sample in best_per_model.items():
    print(f"Model {model}: {best_sample} with Accuracy {results.loc[model, best_sample]:.4f}")

Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64
Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64


  samples['Sampling3'] = balanced_df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n=n//2, random_state=3))



--- Accuracy Matrix ---
   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1  0.983333  0.866667  0.933333  0.916667  0.916667
M2       1.0  0.966667  0.966667       1.0       1.0
M3  0.683333  0.683333  0.766667  0.616667  0.666667
M4       0.9  0.933333  0.933333       0.9      0.95
M5  0.716667  0.683333       0.8  0.766667  0.783333

--- Best Sampling Technique per Model ---
Model M1: Sampling1 with Accuracy 0.9833
Model M2: Sampling1 with Accuracy 1.0000
Model M3: Sampling3 with Accuracy 0.7667
Model M4: Sampling5 with Accuracy 0.9500
Model M5: Sampling3 with Accuracy 0.8000


In [5]:
# ================================
# Import required libraries
# ================================

import pandas as pd
import numpy as np

# For handling class imbalance
from imblearn.over_sampling import SMOTE

# For splitting data and evaluating models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Machine Learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier




data = pd.read_csv('Creditcard_data.csv')


X = data.drop('Class', axis=1)   # all columns except target
y = data['Class']


#using smote


smote = SMOTE(random_state=42)

#applying SMOTE
X_balanced, y_balanced = smote.fit_resample(X, y)


balanced_df = pd.concat(
    [pd.DataFrame(X_balanced), pd.Series(y_balanced, name='Class')],
    axis=1
)

# class distributions before and after balancing
print(f"Original class distribution:\n{y.value_counts()}")
print(f"Balanced class distribution:\n{y_balanced.value_counts()}")




# number of records to be taken in each sample
n = 300


samples = {}


#  sampling 1: Simple Random Sampling

samples['Sampling1'] = balanced_df.sample(n=n, random_state=1)


#  sampling 2: Systematic Sampling

k = len(balanced_df) // n
samples['Sampling2'] = balanced_df.iloc[::k][:n]


# sampling 3: Stratified Sampling

samples['Sampling3'] = (
    balanced_df
    .groupby('Class', group_keys=False)
    .apply(lambda x: x.sample(n=n // 2, random_state=3))
)


# sampling 4: Cluster Sampling
# clusters based on the 'Time' column
balanced_df['Cluster'] = pd.qcut(balanced_df['Time'], q=10, labels=False)

# randomly select some clusters
selected_clusters = np.random.choice(
    balanced_df['Cluster'].unique(),
    size=5,
    replace=False
)

# take samples only from the selected clusters
samples['Sampling4'] = (
    balanced_df[balanced_df['Cluster'].isin(selected_clusters)]
    .sample(n=n, random_state=4)
)


#sampling 5: Bootstrap Sampling

samples['Sampling5'] = balanced_df.sample(
    n=n,
    replace=True,
    random_state=5
)




models = {
    'M1': LogisticRegression(max_iter=1000),
    'M2': RandomForestClassifier(random_state=42),
    'M3': SVC(),
    'M4': DecisionTreeClassifier(random_state=42),
    'M5': KNeighborsClassifier()
}





results = pd.DataFrame(index=models.keys(), columns=samples.keys())


for sample_name, sample_data in samples.items():


    X_s = sample_data.drop(['Class', 'Cluster'], axis=1, errors='ignore')
    y_s = sample_data['Class']

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.2, random_state=42
    )

    # train and test each model
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        #  accuracy score storing
        results.loc[model_name, sample_name] = accuracy_score(y_test, y_pred)




print("\n--- Accuracy Matrix ---")
print(results)

# best sampling technique for each model
best_per_model = results.idxmax(axis=1)

print("\n--- Best Sampling Technique per Model ---")
for model, best_sample in best_per_model.items():
   print(
        f"Model {model}: {best_sample} "
        f"with Accuracy {results.loc[model, best_sample]:.4f}"
    )

Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64
Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=n // 2, random_state=3))



--- Accuracy Matrix ---
   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1  0.983333  0.866667  0.933333  0.966667  0.916667
M2       1.0  0.966667  0.966667  0.983333       1.0
M3  0.683333  0.683333  0.766667  0.683333  0.666667
M4       0.9  0.933333  0.933333  0.916667      0.95
M5  0.716667  0.683333       0.8  0.816667  0.783333

--- Best Sampling Technique per Model ---
Model M1: Sampling1 with Accuracy 0.9833
Model M2: Sampling1 with Accuracy 1.0000
Model M3: Sampling3 with Accuracy 0.7667
Model M4: Sampling5 with Accuracy 0.9500
Model M5: Sampling4 with Accuracy 0.8167
