In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names. Prefer the new name, fall back to the
# legacy one on older installs, and use the default style if neither exists.
_WHITEGRID_STYLES = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next((s for s in _WHITEGRID_STYLES if s in plt.style.available), 'default'))

In [43]:
import time

from joblib import dump, load
from sklearn.decomposition import IncrementalPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc,precision_recall_curve,average_precision_score
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the dataset from data/creditcard.csv and preview the first rows.
df = pd.read_csv('data/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [19]:
# Target is the 'Class' column; features are everything except 'Class' and 'Time'.
y = df['Class']
X = df.drop(columns=['Class', 'Time'])

In [20]:
# Argument order is (clf, X, y_true) -- presumably so it can also be passed as a
# scikit-learn ``scoring`` callable; confirm before relying on that.
def auc_score(model,X_test,y_test):
    """Return the area under the precision-recall curve of ``model`` on a test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_test : features to score.
    y_test : true labels for ``X_test``.

    Returns
    -------
    The value of ``auc(recall, precision)``, where the curve is computed from
    the second column of ``predict_proba``.
    """
    positive_proba = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, positive_proba)
    return auc(recall, precision)

# Best models from feature selection and dimensionality reduction

## Neighbors

In [21]:
# Candidate feature subsets; the loop below fits one scaled k-NN model per list.
columns = [['V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V2', 'V3', 'V4', 'V7', 'V9'],
        ['V3','V4', 'V7', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18'],
        ['V1', 'V4', 'V9', 'V10', 'V14', 'V16', 'V17', 'V20', 'V21', 'V27','V28']]

In [26]:
# Collects the k-NN precision-recall AUC scores appended by the following cells.
scores_nei = []

In [27]:
# Evaluate a scaled k-NN classifier on each candidate feature subset.
# The full feature frame does not depend on the subset, so it is built once
# here instead of re-dropping the columns on every iteration.
X = df.drop(['Class','Time'],axis=1)

for cols in columns:
    X_= X[cols]
    # Same test_size / random_state for every subset so the scores are comparable.
    X_train, X_test, y_train, y_test=train_test_split(X_,y,test_size=0.33,random_state=42)

    scaler = StandardScaler()
    model = KNeighborsClassifier(n_jobs=3)
    pipe = Pipeline([('scaler',scaler),
                     ('model',model)])

    pipe.fit(X_train,y_train)
    score = auc_score(pipe,X_test,y_test)
    scores_nei.append(score)
    print('auc = ',score)

auc =  0.8860948299422249
auc =  0.88403696131403
auc =  0.8860491758718232


In [32]:
# k-NN on all features (minus 'Class' and 'Time'), standardised and reduced
# to 15 components with IncrementalPCA.
X = df.drop(columns=['Class', 'Time'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

batch_size = 14000
scaler = StandardScaler()
pca = IncrementalPCA(n_components=15, batch_size=batch_size)
model = KNeighborsClassifier(n_jobs=3)

pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', pca),
    ('model', model),
])
pipe.fit(X_train, y_train)

score = auc_score(pipe, X_test, y_test)
scores_nei.append(score)
print('auc = ',score)

auc =  0.8845923814460563


In [37]:
# k-NN scores so far: one per feature subset, followed by the PCA pipeline.
scores_nei

[0.8860948299422249, 0.88403696131403, 0.8860491758718232, 0.8845923814460563]

### NCA gave a good score when reducing to 2 features, so I will run it again in order to make a really nice 2-D plot :)

### If you don't want to wait for it, just run the cell below that loads the saved pipeline

In [None]:
# Fit a 2-component NCA projection chunk by chunk (warm-started), score a k-NN
# classifier on the projected data, and save the fitted scaler+NCA pipeline.
#
# `df` is the frame loaded at the top of the notebook; it is reused here rather
# than re-read from a machine-specific absolute path, and it is never rebound
# or mutated so that later cells can keep using it.
X = df.drop(columns=['Time','Class'])
y = df['Class']

chunksize = 18000
t0 = time.time()

nca = NeighborhoodComponentsAnalysis(random_state=42,n_components=2)

X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33,random_state=42)
# Positional 0..n-1 indices so features and labels can be sliced in lockstep.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

for start in range(0,X_train.shape[0],chunksize):
    X_chunk = X_train.iloc[start:start+chunksize]
    y_chunk = y_train.iloc[start:start+chunksize]

    # NOTE(review): a fresh scaler is fitted per chunk, so the pipeline that is
    # finally kept (and saved) carries the scaler of the LAST chunk only.
    scaler = StandardScaler()
    nca_pipe = Pipeline([('scaler',scaler),
                         ('nca',nca)])

    nca_pipe.fit(X_chunk,y_chunk)
    # After the first chunk, continue from the previous solution instead of
    # re-initialising the transformation.
    nca_pipe.named_steps['nca'].set_params(warm_start=True)

X_train_transformed=nca_pipe.transform(X_train)
X_test_transformed=nca_pipe.transform(X_test)

model = KNeighborsClassifier(n_jobs=3)
model.fit(X_train_transformed,y_train)
score = auc_score(model,X_test_transformed,y_test)
print('auc =',score)


print('total time taken:',time.time()-t0)
dump(nca_pipe, 'nca_pipe.pkl', compress = 1)

In [None]:
# Load the previously saved scaler+NCA pipeline instead of refitting it.
# NOTE(review): `load` is presumably joblib.load (the file is written with
# dump(..., compress=1)); it unpickles the file, so only load 'nca_pipe.pkl'
# from a trusted source.
nca_pipe = load('nca_pipe.pkl')
nca_pipe

In [None]:
# Rebuild the feature matrix, the target and the train/test split
# (same test_size and random_state as the other cells).
y = df['Class']
X = df.drop(columns=['Time', 'Class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
def _project_with_nca(frame):
    """Scale ``frame`` and project it with the fitted NCA step, keeping its index."""
    scaled = nca_pipe.named_steps['scaler'].transform(frame)
    projected = nca_pipe.named_steps['nca'].transform(scaled)
    return pd.DataFrame(projected,index=frame.index)


# Project the full data set and both halves of the split.
X_transformed = _project_with_nca(X)
X_train_transformed = _project_with_nca(X_train)
X_test_transformed = _project_with_nca(X_test)

In [None]:
# TODO: copy the code that generates the image here

## Forest

In [36]:
# Candidate feature subsets; the loop below fits one random-forest model per list.
columns = [['V4', 'V8', 'V9', 'V10', 'V13', 'V14', 'V16', 'V20', 'V21', 'V22','V24', 'V27', 'V28'],
        ['V1', 'V4', 'V9', 'V10', 'V14', 'V16', 'V17', 'V20', 'V27', 'V28']]

In [38]:
# Collects the random-forest precision-recall AUC scores appended by the following cells.
scores_rf = []

In [39]:
# Evaluate a random-forest classifier on each candidate feature subset.
# The full feature frame does not depend on the subset, so it is built once
# here instead of re-dropping the columns on every iteration.
X = df.drop(['Class','Time'],axis=1)

for cols in columns:
    X_= X[cols]
    # Same test_size / random_state for every subset so the scores are comparable.
    X_train, X_test, y_train, y_test=train_test_split(X_,y,test_size=0.33,random_state=42)

    scaler = StandardScaler()
    model = RandomForestClassifier(random_state=42, n_jobs=3)
    pipe = Pipeline([('scaler',scaler),
                     ('model',model)])

    pipe.fit(X_train,y_train)
    score = auc_score(pipe,X_test,y_test)
    scores_rf.append(score)
    print('auc = ',score)

auc =  0.8696176874422396
auc =  0.8762468206034059


In [40]:
# Random forest on all features (minus 'Class' and 'Time'), standardised and
# reduced to 17 components with IncrementalPCA.
X = df.drop(columns=['Class', 'Time'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

batch_size = 14000
scaler = StandardScaler()
pca = IncrementalPCA(n_components=17, batch_size=batch_size)
model = RandomForestClassifier(random_state=42, n_jobs=3)

pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', pca),
    ('model', model),
])
pipe.fit(X_train, y_train)

score = auc_score(pipe, X_test, y_test)
scores_rf.append(score)
print('auc = ',score)

auc =  0.8612131763842392


In [41]:
# Random-forest scores so far: one per feature subset, followed by the PCA pipeline.
scores_rf

[0.8696176874422396, 0.8762468206034059, 0.8612131763842392]

# I also ran a grid search on both classifiers, but it did not improve the scores.