In [10]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [11]:
# Load dataset
credit_data = pd.read_csv('Creditcard_data.csv')
class_distribution = credit_data['Class'].value_counts()


In [12]:
# Separate majority and minority classes
majority_class = credit_data[credit_data['Class'] == 0]
minority_class = credit_data[credit_data['Class'] == 1]
print(f'Majority class (0): {majority_class.shape}')
print(f'Minority class (1): {minority_class.shape}')

# Split features and target
features = credit_data.drop(columns='Class')
target = credit_data['Class']


Majority class (0): (763, 31)
Minority class (1): (9, 31)


In [13]:
# Define resampling methods
# Modified Sampling Techniques
def apply_smote(X, y):
    smote = SMOTE(random_state=30, k_neighbors=5)  # Adjusted SMOTE parameters
    return smote.fit_resample(X, y)

def apply_random_undersampling(X, y):
    undersampler = RandomUnderSampler(random_state=50, replacement=False)  # Changed replacement to False
    return undersampler.fit_resample(X, y)

def apply_random_oversampling(X, y):
    oversampler = RandomOverSampler(random_state=60, sampling_strategy=0.8)  # Adjusted sampling strategy
    return oversampler.fit_resample(X, y)

def apply_tomek_links(X, y):
    tomek = TomekLinks()
    return tomek.fit_resample(X, y)

def apply_nearmiss(X, y):
    nearmiss = NearMiss()
    return nearmiss.fit_resample(X, y)


In [14]:
# Updated Train-Test Split Ratio and Random States
def evaluate_logistic_regression(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)
    classifier = LogisticRegression(max_iter=200)  # Increased max iterations
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

def evaluate_decision_tree(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20)
    classifier = DecisionTreeClassifier(max_depth=5, random_state=10)  # Limited tree depth
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

def evaluate_random_forest(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)
    classifier = RandomForestClassifier(n_estimators=50, random_state=15)  # Reduced estimators
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

def evaluate_svm(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)
    classifier = SVC(kernel='poly', degree=3, random_state=20)  # Changed kernel to polynomial
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

def evaluate_knn(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50)
    classifier = KNeighborsClassifier(n_neighbors=5, metric='manhattan')  # Changed distance metric
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)


In [15]:
# Generate results matrix
sampling_methods = [
    apply_random_undersampling,
    apply_random_oversampling,
    apply_tomek_links,
    apply_smote,
    apply_nearmiss,
]

classifiers = [
    evaluate_logistic_regression,
    evaluate_decision_tree,
    evaluate_random_forest,
    evaluate_svm,
    evaluate_knn,
]

results = []
row_labels = ["Random Undersampling", "Random Oversampling", "Tomek Links", "SMOTE", "NearMiss"]
column_labels = ["LogReg", "DecisionTree", "RandomForest", "SVM", "KNN"]

for resampling in sampling_methods:
    row_results = []
    for classifier in classifiers:
        X_resampled, y_resampled = resampling(features, target)
        accuracy = classifier(X_resampled, y_resampled)
        row_results.append(accuracy)
    results.append(row_results)

# Create updated DataFrame
updated_performance_df = pd.DataFrame(results, columns=column_labels, index=row_labels)
print(updated_performance_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                        LogReg  DecisionTree  RandomForest       SVM       KNN
Random Undersampling  0.600000      0.800000      0.200000  0.600000  0.400000
Random Oversampling   0.904070      0.968023      1.000000  0.688953  0.982558
Tomek Links           0.984375      0.963542      0.979167  1.000000  0.994792
SMOTE                 0.895288      0.950262      1.000000  0.696335  0.848168
NearMiss              0.400000      0.600000      0.200000  0.600000  0.400000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
