## Importing Libraries

In [316]:
import pandas as pd
import numpy as np
import random

## Importing Dataset

In [317]:
# Load the raw dataset. The frame must be bound to `df`, because every later
# cell (over-sampling, class counts, test-set extraction) reads `df`; binding it
# to any other name makes the notebook fail with NameError on a fresh kernel.
df = pd.read_csv("Creditcard_data.csv")

## Over-Sampling to Balance the Data

In [318]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Split features from the target so the over-sampler can balance the classes.
X = df.drop(columns='Class')
y = df['Class']

# Seed the sampler so the duplicated minority rows are the same on every run.
ros = RandomOverSampler(random_state=42)

X_resampled, y_resampled = ros.fit_resample(X, y)

# Rebuild a single frame with the features first and 'Class' as the LAST column.
# Later cells split features/label positionally (iloc[:, :-1] / iloc[:, -1]),
# and labelling the columns from X itself stays correct even when 'Class' is not
# the last column of the original file.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['Class'] = np.asarray(y_resampled)


In [319]:
# Class distribution before (df) and after (df_resampled) over-sampling.
for frame in (df, df_resampled):
    print(frame['Class'].value_counts())

0    763
1      9
Name: Class, dtype: int64
0    763
1    763
Name: Class, dtype: int64


## Simple Random Sampling

In [320]:
def simple_random_sample(df, z_score, margin_error, p):
    """Draw a simple random sample of rows, without replacement.

    The sample size is ``ceil(z_score**2 * p * (1 - p) / margin_error**2)``,
    capped at the number of rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from.
    z_score : float
        Z-score used in the sample-size formula.
    margin_error : float
        Margin of error used in the sample-size formula (must be non-zero).
    p : float
        Proportion used in the sample-size formula.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in the order they were drawn.
    """
    n = (z_score ** 2 * p * (1 - p)) / margin_error ** 2

    # random.sample raises ValueError when asked for more items than exist, so
    # never request more rows than the frame actually holds.
    n = min(int(np.ceil(n)), len(df))

    sample_indices = random.sample(range(len(df)), n)

    return df.iloc[sample_indices, :]


In [321]:
# z = 1.96, margin of error = 0.05, p = 0.5 (p * (1 - p) is largest at 0.5).
sample_1 = simple_random_sample(
    df_resampled,
    z_score=1.96,
    margin_error=0.05,
    p=0.5,
)
print(sample_1.shape)

(385, 31)


## Systematic Sampling

In [322]:
def systematic_sample(df, k, start=0):
    """Select every ``k``-th row of ``df``, beginning at position ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from.
    k : int
        Sampling interval; must be at least 1.
    start : int, optional
        Zero-based position of the first selected row. Defaults to 0, which
        reproduces the original behaviour of always starting at the first row.

    Returns
    -------
    pandas.DataFrame
        Rows at positions ``start, start + k, start + 2k, ...``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``start`` is negative.
    """
    # A zero step makes np.arange fail with an unhelpful ZeroDivisionError and a
    # negative step silently yields an empty sample, so reject both up front.
    if k < 1:
        raise ValueError("k must be a positive integer")
    if start < 0:
        raise ValueError("start must be non-negative")

    positions = np.arange(start, len(df), k)

    return df.iloc[positions, :]


In [323]:
# Keep every 5th row of the balanced frame.
sample_2 = systematic_sample(df_resampled, k=5)
print(sample_2.shape)


(306, 31)


## Stratified Sampling

In [324]:
def stratified_sample(df, col, z, e, p):
    """Draw the same fraction of rows from every stratum of ``col``.

    The target sample size is ``z**2 * p * (1 - p) // (e / s)**2`` (floor
    division), where ``s`` is the number of distinct values in ``col``. That
    size is divided by the number of rows with a non-null ``col`` value to get
    a sampling fraction, capped at 1.0, which is applied to each stratum.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from.
    col : str
        Column whose distinct values define the strata.
    z : float
        Z-score used in the sample-size formula.
    e : float
        Margin of error used in the sample-size formula (must be non-zero).
    p : float
        Proportion used in the sample-size formula.

    Returns
    -------
    pandas.DataFrame
        Sampled rows grouped stratum by stratum, with ``col`` still present.
    """
    t = df[col].value_counts()

    s = len(t)
    if s == 0:
        # No strata (empty frame or all-null column): nothing to sample, and
        # e / s below would divide by zero.
        return df.iloc[0:0]

    n = (z ** 2) * (p * (1 - p)) // ((e / s) ** 2)

    # Count rows across ALL strata. Indexing t[0] + t[1] only works for exactly
    # two classes labelled 0 and 1.
    n_rows = int(t.sum())

    # Sampling without replacement cannot take more than 100% of a stratum.
    frac = min(float(n) / n_rows, 1.0)

    # GroupBy.sample keeps the grouping column in the result, whereas
    # groupby(...).apply(...) excludes it in newer pandas versions.
    return df.groupby(col, group_keys=False).sample(frac=frac)

In [325]:
# NOTE(review): z=0.95 looks like a confidence level rather than a z-score (the
# simple random sample above passes 1.96) — confirm the intended value.
sample_3 = stratified_sample(
    df_resampled,
    col='Class',
    z=0.95,
    e=0.05,
    p=0.5,
)
print(sample_3.shape)

(360, 31)


## Cluster Sampling

In [326]:
def cluster_sampling(df, z, e, p, c):
    """Draw a random fraction of rows from ``df``.

    The fraction is ``(z**2 * p * (1 - p) // e**2) / (len(df) - c)``, capped at
    1.0. Rows are drawn individually with ``DataFrame.sample``; no grouping of
    rows into clusters is performed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from.
    z : float
        Z-score used in the sample-size formula.
    e : float
        Margin of error used in the sample-size formula (must be non-zero).
    p : float
        Proportion used in the sample-size formula.
    c : int
        Value subtracted from the row count in the denominator.
        NOTE(review): presumably a cluster-related adjustment — confirm intent.

    Returns
    -------
    pandas.DataFrame
        The sampled rows.

    Raises
    ------
    ValueError
        If ``c`` is not smaller than the number of rows in ``df``.
    """
    denominator = df.shape[0] - c
    # A zero denominator divides by zero and a negative one produces a negative
    # fraction; fail early with a message that names the actual problem.
    if denominator <= 0:
        raise ValueError("c must be smaller than the number of rows in df")

    n = ((z ** 2) * (p * (1 - p)) // (e ** 2)) / denominator

    # Sampling without replacement cannot take more than 100% of the rows.
    frac = min(n, 1.0)

    cluster_sample_df = df.sample(frac=frac)

    return cluster_sample_df


In [327]:
# NOTE(review): z=0.95 looks like a confidence level rather than a z-score (the
# simple random sample above passes 1.96) — confirm the intended value.
sample_4 = cluster_sampling(
    df_resampled,
    z=0.95,
    e=0.05,
    p=0.5,
    c=300,
)
print(sample_4.shape)

(112, 31)


## Quota Sampling

In [328]:
def quota_sampling(df, strata, quotas, random_state=1):
    """Sample a fixed number of rows from each requested stratum.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from.
    strata : str
        Column whose values identify the strata.
    quotas : dict
        Maps a stratum value to the number of rows to draw from it. A quota
        larger than the stratum is reduced to the stratum's size.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for every stratum. Defaults to 1,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The per-stratum samples stacked in the iteration order of ``quotas``,
        with a fresh 0..n-1 index.
    """
    parts = []

    for stratum, quota in quotas.items():

        stratum_df = df[df[strata] == stratum]

        # DataFrame.sample raises when n exceeds the number of available rows,
        # so take at most what the stratum can supply.
        take = min(quota, len(stratum_df))

        parts.append(stratum_df.sample(n=take, random_state=random_state))

    if not parts:
        # No quotas requested: an empty frame with the same columns and dtypes.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end. Growing the result from an empty, untyped
    # frame inside the loop is quadratic and can degrade column dtypes.
    return pd.concat(parts, ignore_index=True)


In [329]:
# Draw 50 rows from each class.
quotas = {
    0: 50,
    1: 50,
}
sample_5 = quota_sampling(df_resampled, strata='Class', quotas=quotas)
print(sample_5.shape)


(100, 31)


## Merging all samples

In [330]:
# Order matters: the comparison table labels its columns in this same order.
all_samples = [
    sample_1,  # simple random sampling
    sample_2,  # systematic sampling
    sample_3,  # stratified sampling
    sample_4,  # cluster sampling
    sample_5,  # quota sampling
]

## Extracting Testing Data

In [331]:
# NOTE(review): the evaluation set is the full original frame `df`, and every
# training sample is drawn from `df_resampled`, which was built from `df` —
# so these accuracies are not measured on held-out data.
X_test = df.drop('Class', axis=1).to_numpy()
y_test = df['Class'].to_numpy().reshape(-1).astype('int')

## Model - 1 : LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train one logistic-regression model per sample and record its accuracy on the
# shared evaluation arrays (X_test / y_test).
M1 = []
for sample in all_samples:

    # Features are every column but the last; the label is the last column.
    X_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')

    clf = LogisticRegression()
    clf = clf.fit(X_train, y_train)  # fit() returns the estimator itself

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    M1 += [accuracy]


In [333]:
# Logistic-regression accuracies, one per entry of all_samples (same order).
print(M1)

[0.8354922279792746, 0.8095854922279793, 0.8432642487046632, 0.7681347150259067, 0.822538860103627]


## Model - 2 : RandomForestClassifier

In [334]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train one random forest per sample and record its accuracy on the shared
# evaluation arrays (X_test / y_test).
M2 = []
for sample in all_samples:

    # Features are every column but the last; the label is the last column.
    X_train = np.array(sample.iloc[:, :-1])
    y_train = np.array(sample.iloc[:, -1:].values).reshape(-1,).astype('int')

    # Random forests bootstrap rows and subsample features at random; fixing
    # the seed makes the reported accuracies reproducible across runs.
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    M2.append(accuracy)
    


In [335]:
# Random-forest accuracies, one per entry of all_samples (same order).
print(M2)

[0.9961139896373057, 0.9961139896373057, 0.9974093264248705, 0.9766839378238342, 0.9740932642487047]


## Model - 3 : SVM

In [336]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train one support-vector classifier per sample and record its accuracy on the
# shared evaluation arrays (X_test / y_test).
M3 = []
for sample in all_samples:

    # Features are every column but the last; the label is the last column.
    X_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')

    clf = SVC()
    clf = clf.fit(X_train, y_train)  # fit() returns the estimator itself

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    M3 += [accuracy]


In [337]:
# SVM accuracies, one per entry of all_samples (same order).
print(M3)

[0.6230569948186528, 0.7020725388601037, 0.694300518134715, 0.7046632124352331, 0.7551813471502591]


## Model - 4 : GradientBoostingClassifier

In [338]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train one gradient-boosting model per sample and record its accuracy on the
# shared evaluation arrays (X_test / y_test).
M4 = []
for sample in all_samples:

    # Features are every column but the last; the label is the last column.
    X_train = np.array(sample.iloc[:, :-1])
    y_train = np.array(sample.iloc[:, -1:].values).reshape(-1,).astype('int')

    # Fix the estimator's seed so repeated runs report the same accuracies.
    clf = GradientBoostingClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    M4.append(accuracy)


In [339]:
# Gradient-boosting accuracies, one per entry of all_samples (same order).
print(M4)

[0.9702072538860104, 0.9922279792746114, 0.9909326424870466, 0.9546632124352331, 0.9183937823834197]


## Model - 5 : NaiveBayes

In [340]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train one Gaussian naive-Bayes model per sample and record its accuracy on the
# shared evaluation arrays (X_test / y_test).
M5 = []
for sample in all_samples:

    # Features are every column but the last; the label is the last column.
    X_train = sample.iloc[:, :-1].to_numpy()
    y_train = sample.iloc[:, -1].to_numpy().astype('int')

    clf = GaussianNB()
    clf = clf.fit(X_train, y_train)  # fit() returns the estimator itself

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    M5 += [accuracy]


In [341]:
# Naive-Bayes accuracies, one per entry of all_samples (same order).
print(M5)

[0.8898963730569949, 0.7914507772020726, 0.8937823834196891, 0.716321243523316, 0.9261658031088082]


## Comparison Table

In [342]:
# One row per model, one column per sampling technique. The column labels
# follow the order in which the samples were placed in all_samples.
models = [M1, M2, M3, M4, M5]

Comparison = pd.DataFrame(
    models,
    index=[
        'Logistic Regression',
        'Random Forest',
        'SVM',
        'Gradient Boosting',
        'Naive Bayes',
    ],
    columns=[
        'Simple Random Sampling',
        'Systematic Sampling',
        'Stratified Sampling',
        'Cluster Sampling',
        'Quota Sampling',
    ],
)

Comparison

Unnamed: 0,Simple Random Sampling,Systematic Sampling,Stratified Sampling,Cluster Sampling,Quota Sampling
Logistic Regression,0.835492,0.809585,0.843264,0.768135,0.822539
Random Forest,0.996114,0.996114,0.997409,0.976684,0.974093
SVM,0.623057,0.702073,0.694301,0.704663,0.755181
Gradient Boosting,0.970207,0.992228,0.990933,0.954663,0.918394
Naive Bayes,0.889896,0.791451,0.893782,0.716321,0.926166


In [351]:
# Flatten the table to a (model, technique) -> accuracy Series so the best score
# and the labels that produced it come from a single object.
stacked_scores = Comparison.stack()
row, col = stacked_scores.idxmax()
max_value = stacked_scores.max()
print(f"The Sample created from '{col}' Technique gives the highest accuracy on model '{row}' of {max_value}.")

The Sample created from 'Stratified Sampling' Technique gives the highest accuracy on model 'Random Forest' of 0.9974093264248705
