In [178]:
# Samples
# s1 - Simple Random Sampling
# s2 - Systematic Sampling
# s3 - Cluster Sampling
# s4 - Stratified Sampling
# s5 - Bootstrap Sampling

In [179]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [180]:
cdf = pd.read_csv("/content/drive/MyDrive/Creditcard_data.csv")
cdf.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [181]:
cdf['Class'].value_counts()

0    763
1      9
Name: Class, dtype: int64

**Balancing Dataset**

In [182]:
X = cdf.drop(['Class'],axis=1)
Y = cdf['Class']

In [183]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy='minority')
X_sm,Y_sm=smote.fit_resample(X,Y)


In [184]:
balanced_cdf =  pd.concat([X_sm, Y_sm], axis=1)
balanced_cdf.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [185]:
balanced_cdf['Class'].value_counts()

0    763
1    763
Name: Class, dtype: int64

**Sampling Techniques**

In [186]:
# Random Sampling

import math

def random_sample_size(c, e, p):

    sample_size = (1.96 ** 2 *p* (1 - p)) / e ** 2
    return math.ceil(sample_size)

confidence_level = 0.95
error_margin = 0.05
estimated_proportion = 0.5

n_random = random_sample_size(confidence_level, error_margin, estimated_proportion)
print(f"Required Random Sample Size: {n_random}")


Required Random Sample Size: 385


In [187]:
np.random.seed(0)
s1 = balanced_cdf.sample(n= n_random, random_state=0)
s1.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1361,495,-1.092381,0.328168,1.628816,0.012536,0.95711,-0.777511,0.755378,-0.096341,-0.108553,...,0.016508,0.185413,-0.258759,0.259126,0.24392,-0.313139,-0.153236,-0.178023,1.0,1
511,377,1.166919,0.027049,0.513875,0.860965,-0.519452,-0.681147,0.074992,-0.187776,0.345399,...,-0.20275,-0.441391,-0.025782,0.452607,0.467223,0.262577,-0.023834,0.020521,40.83,0
9,9,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0
393,284,-0.810756,0.654499,2.217257,0.104341,-0.286801,0.117833,0.287552,-0.736461,0.699092,...,0.938194,0.571651,-0.101609,0.363928,-0.170947,-0.471524,0.058958,-0.079157,30.3,0
471,346,1.077079,0.28498,0.007731,1.657073,0.05202,0.446389,-0.407036,0.355704,0.626039,...,-0.174337,-0.174161,-0.153375,-0.466331,0.611001,-0.252871,0.090375,0.05482,10.99,0


**Shapiro Test for Checking Goodness of Sample**

In [188]:
import scipy.stats as stats
stat, p = stats.shapiro(s1)
print('Statistics=%.3f, p=%.3f' % (stat, p))

Statistics=0.174, p=0.000




In [189]:
# Systematic Sampling
import math as m

def systematic_sampling(df, step):

    indexes = np.arange(0, len(df), step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample

s2 = systematic_sampling(balanced_cdf, int(m.sqrt(len(balanced_cdf))) )
s2.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
39,29,1.11088,0.168717,0.517144,1.325407,-0.191573,0.019504,-0.031849,0.11762,0.017665,...,-0.037709,0.095701,-0.048198,0.232115,0.606201,-0.342097,0.03677,0.00748,6.54,0
78,50,-0.571521,1.0716,1.28011,0.54278,0.574439,-0.259359,1.061148,-0.410972,-0.17913,...,0.003559,0.56124,-0.199287,0.001387,-0.17953,-0.374116,0.071641,-0.17551,9.79,0
117,76,-1.024576,0.522289,1.787699,0.202672,-1.140803,-0.137831,-0.336555,0.670704,0.07167,...,0.315868,0.847565,0.148877,0.549791,-0.585131,0.325841,-0.068871,0.059713,50.0,0
156,98,-0.646513,1.004199,1.616224,-0.099628,-0.122477,-0.671327,0.656183,0.009755,-0.635963,...,-0.147934,-0.420046,0.061424,0.520997,-0.238845,0.030135,0.140481,0.101163,14.98,0


In [190]:
stat, p = stats.shapiro(s2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

Statistics=0.172, p=0.000


In [191]:
# Cluster Sampling
def Cluster_sample_size(c, e, p,nc):

    sample_size = (1.96 ** 2 * p * (1 - p)) / (e/nc) ** 2
    return m.ceil(sample_size)

confidence_level = 0.95
error_margin = 0.05
estimated_proportion = 0.5
num_cluster = 1.5

n_cluster = Cluster_sample_size(confidence_level, error_margin, estimated_proportion,num_cluster)
print(f"Required Cluster Sample Size: {n_cluster}")


Required Cluster Sample Size: 865


In [192]:
reshaped_df = balanced_cdf.groupby(np.arange(len(balanced_cdf)) // 2, sort=False).apply(lambda x: x.reset_index(drop=True))
s3 = reshaped_df.sample(n=int(n_cluster))
s3.head()

Unnamed: 0,Unnamed: 1,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
680,1,495,-1.092381,0.328168,1.628816,0.012536,0.95711,-0.777511,0.755378,-0.096341,-0.108553,...,0.016508,0.185413,-0.258759,0.259126,0.24392,-0.313139,-0.153236,-0.178023,1.0,1
255,1,377,1.166919,0.027049,0.513875,0.860965,-0.519452,-0.681147,0.074992,-0.187776,0.345399,...,-0.20275,-0.441391,-0.025782,0.452607,0.467223,0.262577,-0.023834,0.020521,40.83,0
4,1,9,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0
196,1,284,-0.810756,0.654499,2.217257,0.104341,-0.286801,0.117833,0.287552,-0.736461,0.699092,...,0.938194,0.571651,-0.101609,0.363928,-0.170947,-0.471524,0.058958,-0.079157,30.3,0
235,1,346,1.077079,0.28498,0.007731,1.657073,0.05202,0.446389,-0.407036,0.355704,0.626039,...,-0.174337,-0.174161,-0.153375,-0.466331,0.611001,-0.252871,0.090375,0.05482,10.99,0


In [193]:
# Stratified Sampling
def Stratified_sample_size(c, e, p,ns):

    sample_size = (1.96 ** 2 * p * (1 - p)) / (e/ns) ** 2
    return m.ceil(sample_size)

confidence_level = 0.95
error_margin = 0.05
estimated_proportion = 0.5
num_strata = 0.5

n_strata = Stratified_sample_size(confidence_level, error_margin, estimated_proportion,num_strata)
print(f"Required Stratified Sample Size: {n_strata}")


Required Stratified Sample Size: 97


In [194]:
dfs = balanced_cdf.groupby('Class', group_keys=False).apply(lambda x: x.sample(n_strata))
s4 = dfs.sample(n=int(n_strata))
s4.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
990,113,1.252346,0.346862,0.296951,0.683142,-0.353904,-1.030038,0.08004,-0.191114,0.023325,...,-0.285076,-0.824784,0.126992,0.311773,0.213958,0.095973,-0.022769,0.030234,2.69,1
683,514,-1.045042,0.957421,1.388023,0.195389,-0.327385,-0.64371,0.521262,0.174238,0.063058,...,-0.182403,-0.311953,-0.140268,0.400592,-0.050207,0.20124,-0.092265,-0.134806,30.49,0
834,533,-1.880922,-1.331742,1.882547,0.31952,1.758078,0.189614,-1.023601,0.492543,0.531649,...,0.147527,0.561772,0.042221,-0.574128,-0.272169,0.577555,-0.225325,-0.268108,1.271656,1
1170,511,-1.573413,-1.342971,2.177986,0.758742,1.866324,-0.001282,-0.919625,0.411991,0.520325,...,0.280623,0.831354,0.076468,-0.362238,0.023808,0.337323,-0.116587,-0.15302,1.300857,1
793,538,-1.745067,-0.010337,1.218743,-0.612178,0.958777,-0.269856,0.210189,0.156664,0.046617,...,-0.164644,-0.157693,-0.248025,-0.252814,-0.332176,0.255687,-0.339671,-0.378303,1.012377,1


In [195]:
# Boostrap Sampling
np.random.seed(0)
s5 = balanced_cdf.sample(n= n_random, replace = True)
s5.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
684,517,1.314713,-0.328688,0.002645,-0.805044,-0.46726,-0.522747,-0.18085,-0.093472,1.795353,...,-0.12254,-0.029521,-0.250848,-0.427629,0.91779,-0.53437,0.062355,0.012093,10.0,0
559,417,-2.680348,1.872052,1.144712,-0.693664,0.155172,0.601325,0.904201,-0.520079,3.013065,...,-0.459592,0.485421,-0.365437,-0.744118,0.328655,0.457695,0.566152,0.168241,29.99,0
1216,139,0.691809,0.44597,0.373698,0.417547,0.197831,-0.44027,0.186654,-0.039668,-0.040252,...,-0.211886,-0.617839,0.139012,-0.472471,-0.549067,0.085509,0.098237,0.125875,1.87972,1
835,419,-2.069722,1.679763,-1.022751,3.329468,-0.261386,-1.334469,-1.938826,1.120401,-2.309663,...,0.438237,0.020389,-0.428901,0.331619,0.105244,0.067166,0.197092,-0.140208,0.175202,1
763,574,-0.402057,0.5843,2.474227,0.929684,0.014314,0.29749,0.715195,-0.257153,0.593868,...,-0.072812,0.445733,-0.245103,0.421234,0.04928,-0.388323,-0.329333,-0.386747,12.0,0


In [196]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tabulate import tabulate

**Applying ML Models**

In [197]:
def accuracy(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    models = {
        'Logistic_Regression' : LogisticRegression(random_state = 0 , max_iter=2000),
        'NaiveBayes_Classifier'  : GaussianNB(),
        'SVM' : SVC(kernel='rbf', random_state=0),
        'DecisionTree_Classifier'  : DecisionTreeClassifier(random_state=0),
        'RandomForest_Classifier'  : RandomForestClassifier(n_estimators=100,random_state=0)
      }
    accuracies = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append([model_name, accuracy])


    table = tabulate(accuracies, headers=['Model', 'Accuracy'], tablefmt='pretty')

    return table



**Random Sampling**

In [198]:
X_s1 = s1.drop(['Class'],axis=1)
y_s1 = s1['Class']
print(accuracy(X_s1,y_s1))

+-------------------------+--------------------+
|          Model          |      Accuracy      |
+-------------------------+--------------------+
|   Logistic_Regression   | 0.8831168831168831 |
|  NaiveBayes_Classifier  | 0.7272727272727273 |
|           SVM           | 0.6493506493506493 |
| DecisionTree_Classifier | 0.974025974025974  |
| RandomForest_Classifier | 0.987012987012987  |
+-------------------------+--------------------+


**Systematic Sampling**

In [199]:
X_s2 = s2.drop(['Class'],axis=1)
y_s2 = s2['Class']
print(accuracy(X_s2,y_s2))

+-------------------------+----------+
|          Model          | Accuracy |
+-------------------------+----------+
|   Logistic_Regression   |   0.75   |
|  NaiveBayes_Classifier  |   0.75   |
|           SVM           |   0.5    |
| DecisionTree_Classifier |   0.75   |
| RandomForest_Classifier |   0.75   |
+-------------------------+----------+


**Cluster Sampling**

In [200]:
X_s3 = s3.drop(['Class'],axis=1)
y_s3 = s3['Class']
print(accuracy(X_s3,y_s3))

+-------------------------+--------------------+
|          Model          |      Accuracy      |
+-------------------------+--------------------+
|   Logistic_Regression   | 0.930635838150289  |
|  NaiveBayes_Classifier  | 0.8497109826589595 |
|           SVM           | 0.6820809248554913 |
| DecisionTree_Classifier | 0.9653179190751445 |
| RandomForest_Classifier |        1.0         |
+-------------------------+--------------------+


**Stratified Sampling**

In [201]:
X_s4 = s4.drop(['Class'],axis=1)
y_s4 = s4['Class']
print(accuracy(X_s4,y_s4))

+-------------------------+----------+
|          Model          | Accuracy |
+-------------------------+----------+
|   Logistic_Regression   |   0.65   |
|  NaiveBayes_Classifier  |   0.6    |
|           SVM           |   0.5    |
| DecisionTree_Classifier |   0.9    |
| RandomForest_Classifier |   0.85   |
+-------------------------+----------+


**Bootstap Sampling**

In [202]:
X_s5 = s5.drop(['Class'],axis=1)
y_s5 = s5['Class']
print(accuracy(X_s5,y_s5))

+-------------------------+--------------------+
|          Model          |      Accuracy      |
+-------------------------+--------------------+
|   Logistic_Regression   | 0.961038961038961  |
|  NaiveBayes_Classifier  | 0.922077922077922  |
|           SVM           | 0.6883116883116883 |
| DecisionTree_Classifier | 0.961038961038961  |
| RandomForest_Classifier | 0.987012987012987  |
+-------------------------+--------------------+
