In [6]:
!pip install imbalanced-learn



In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

In [8]:
data = pd.read_csv('/content/Creditcard_data.csv')

In [9]:
print(data.head())
print(data.info())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [10]:
X = data.drop("Class", axis=1)
y = data["Class"]

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Original class distribution:")
print(y.value_counts())
print("Resampled class distribution:")
print(y_resampled.value_counts())

Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64
Resampled class distribution:
Class
0    763
1    763
Name: count, dtype: int64


In [12]:
sample_size = 1000

samples = {
    "Sampling1": X_resampled.sample(n=sample_size, random_state=42),
    "Sampling2": X_resampled.sample(n=sample_size, random_state=21),
    "Sampling3": X_resampled.iloc[::len(X_resampled)//sample_size, :],
    "Sampling4": X_resampled.sample(n=sample_size, random_state=56),
    "Sampling5": X_resampled.sample(n=sample_size, random_state=99),
}

sample_datasets = {
    name: (sample, y_resampled.loc[sample.index])
    for name, sample in samples.items()
}


In [13]:
models = {
    "M1": LogisticRegression(random_state=42),
    "M2": DecisionTreeClassifier(random_state=42),
    "M3": GradientBoostingClassifier(random_state=42),
    "M4": SVC(random_state=42),
    "M5": KNeighborsClassifier(),
}


In [14]:
results = {}

for sample_name, (X_sample, y_sample) in sample_datasets.items():
    print(f"Evaluating for {sample_name}...")
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        if sample_name not in results:
            results[sample_name] = {}
        results[sample_name][model_name] = accuracy


Evaluating for Sampling1...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating for Sampling2...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating for Sampling3...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating for Sampling4...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating for Sampling5...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
matrix_data = []
for sample_name, accuracies in results.items():
    row = [accuracies.get(model, None) for model in models.keys()]
    matrix_data.append(row)

results_matrix = pd.DataFrame(
    matrix_data,
    index=results.keys(),
    columns=models.keys()
)

print("Accuracy Matrix:")
print(results_matrix)

results_matrix.to_csv("results_matrix_colab.csv")

Accuracy Matrix:
                 M1        M2        M3        M4        M5
Sampling1  0.906667  0.983333  0.983333  0.676667  0.860000
Sampling2  0.906667  0.963333  0.996667  0.696667  0.840000
Sampling3  0.906114  0.978166  0.986900  0.676856  0.847162
Sampling4  0.913333  0.960000  0.980000  0.663333  0.836667
Sampling5  0.916667  0.980000  0.983333  0.666667  0.820000


In [16]:
best_combinations = results_matrix.idxmax()
print("Best Sampling Technique for Each Model:")
print(best_combinations)

Best Sampling Technique for Each Model:
M1    Sampling5
M2    Sampling1
M3    Sampling2
M4    Sampling2
M5    Sampling1
dtype: object
