In [70]:
import pandas as pd
import numpy as np

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [71]:
data = pd.read_csv('C:/Users/Svea/Downloads/Creditcard_data.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [72]:
data.shape

(772, 31)

In [73]:
data['Class'].value_counts()

Class
0    763
1      9
Name: count, dtype: int64

In [74]:
time_count = Counter(data['Time'])
data = data.drop('Time', axis=1)

In [75]:
X, y = data.drop('Class', axis=1), data['Class']

In [76]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [77]:
y.value_counts()

Class
0    763
1    763
Name: count, dtype: int64

In [78]:
data_new = pd.concat([X, y], axis=1)

In [79]:
from sklearn.model_selection import StratifiedShuffleSplit
def get_samples():
    sample_names = []
    samples_x_train = []
    samples_y_train = []
    samples_x_test = []
    samples_y_test = []

    n = (1.92**2)*0.5*(1-0.5)/0.05**2

    # 1. Simple random sampling
    random_sample = data_new.sample(n=int(n), random_state=42)
    X_sample, y_sample = random_sample.drop('Class', axis=1), random_sample['Class']
    X_test, X_train, y_test, y_train = train_test_split(X_sample, y_sample, test_size=0.5, random_state=42)
    samples_x_train.append(X_train)
    samples_y_train.append(y_train)
    samples_x_test.append(X_test)
    samples_y_test.append(y_test)
    sample_names.append('Random')

    # 2. Stratified random sampling
    stratified_sample = data_new.groupby('Class').apply(lambda x: x.sample(frac=0.5, random_state=42))
    X_sample, y_sample = stratified_sample.drop('Class', axis=1), stratified_sample['Class']
    X_test, X_train, y_test, y_train = train_test_split(X_sample, y_sample, test_size=0.5, random_state=42)
    samples_x_train.append(X_train)
    samples_y_train.append(y_train)
    samples_x_test.append(X_test)
    samples_y_test.append(y_test)
    sample_names.append('Stratified')

    # 3. Cluster sampling
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        samples_x_train.append(X_train)
        samples_y_train.append(y_train)
        samples_x_test.append(X_test)
        samples_y_test.append(y_test)
        sample_names.append('Cluster')

    # 4. Systematic sampling
    k = int(len(data_new)/n)
    start = np.random.randint(0, k)
    systematic_sample = data_new.iloc[start::k]
    X_sample, y_sample = systematic_sample.drop('Class', axis=1), systematic_sample['Class']
    X_test, X_train, y_test, y_train = train_test_split(X_sample, y_sample, test_size=0.5, random_state=42)
    samples_x_train.append(X_train)
    samples_y_train.append(y_train)
    samples_x_test.append(X_test)
    samples_y_test.append(y_test)
    sample_names.append('Systematic')

    return samples_x_train, samples_y_train, samples_x_test, samples_y_test, sample_names


In [80]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  
from sklearn.naive_bayes import GaussianNB  

def get_models():
    models, names = list(), list()
    steps = [('s',StandardScaler()),('m',KNeighborsClassifier())]
    models.append(Pipeline(steps=steps))
    names.append('KNN')
    # Bagging
    models.append(BaggingClassifier(n_estimators=100))
    names.append('BAG')
    # RF
    models.append(RandomForestClassifier(n_estimators=50))
    names.append('RF')
     # SVM
    steps = [('s', StandardScaler()), ('m', SVC())]
    models.append(Pipeline(steps=steps))
    names.append('SVM')
    # NB
    models.append(GaussianNB())
    names.append('NB')
    return models, names
    return models, names

In [81]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def evaluate_model(X, y, model):
    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores

In [82]:
from numpy import mean
from numpy import std
from sklearn.metrics import accuracy_score

models, names = get_models()
samples_x_train, samples_y_train, samples_x_test, samples_y_test, sample_names = get_samples()
results = list()

for i in range(len(samples_x_train)):
    print('Sampling: %s' % sample_names[i])
    for j in range(len(models)):
        scores = evaluate_model(samples_x_train[i], samples_y_train[i], models[j])
        results.append(scores)
        print('>%s %.3f (%.3f)' % (names[j], mean(scores), std(scores)))


Sampling: Random
>KNN 0.777 (0.098)


>BAG 0.902 (0.092)
>RF 0.961 (0.043)
>SVM 0.928 (0.083)
>NB 0.711 (0.089)
Sampling: Stratified
>KNN 0.874 (0.048)
>BAG 0.976 (0.018)
>RF 0.989 (0.017)
>SVM 0.979 (0.023)
>NB 0.676 (0.051)
Sampling: Cluster
>KNN 0.962 (0.016)
>BAG 0.991 (0.008)
>RF 0.997 (0.005)
>SVM 0.983 (0.010)
>NB 0.728 (0.031)
Sampling: Systematic
>KNN 0.822 (0.057)
>BAG 0.948 (0.062)
>RF 0.969 (0.035)
>SVM 0.942 (0.028)
>NB 0.748 (0.062)


### Performance Evaluation

In [83]:
scores = []
for i in range(len(samples_x_test)):
    for j in range(len(models)):
        model = models[j]
        model.fit(samples_x_train[i], samples_y_train[i])
        y_pred = model.predict(samples_x_test[i])
        scores.append(accuracy_score(samples_y_test[i], y_pred))
        print('Sampling: %s, Model: %s, Accuracy: %.3f' % (sample_names[i], names[j], scores[-1]))

Sampling: Random, Model: KNN, Accuracy: 0.864
Sampling: Random, Model: BAG, Accuracy: 0.940
Sampling: Random, Model: RF, Accuracy: 0.973
Sampling: Random, Model: SVM, Accuracy: 0.962
Sampling: Random, Model: NB, Accuracy: 0.777
Sampling: Stratified, Model: KNN, Accuracy: 0.882
Sampling: Stratified, Model: BAG, Accuracy: 0.958
Sampling: Stratified, Model: RF, Accuracy: 0.987
Sampling: Stratified, Model: SVM, Accuracy: 0.974
Sampling: Stratified, Model: NB, Accuracy: 0.725
Sampling: Cluster, Model: KNN, Accuracy: 0.980
Sampling: Cluster, Model: BAG, Accuracy: 0.993
Sampling: Cluster, Model: RF, Accuracy: 0.993
Sampling: Cluster, Model: SVM, Accuracy: 0.980
Sampling: Cluster, Model: NB, Accuracy: 0.745
Sampling: Systematic, Model: KNN, Accuracy: 0.805
Sampling: Systematic, Model: BAG, Accuracy: 0.921
Sampling: Systematic, Model: RF, Accuracy: 0.963
Sampling: Systematic, Model: SVM, Accuracy: 0.974
Sampling: Systematic, Model: NB, Accuracy: 0.663
