# Analysis

## Data Preparation & Exploration

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import time

In [2]:
# Read in data
df = pd.read_csv('creditcard.csv')

In [3]:
# Preview data
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Evaluate for class imbalance
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [5]:
# Oversample to counteact class imbalance
from imblearn.over_sampling import SMOTE
smote = SMOTE(kind = "regular")

X = df.drop('Class',axis=1)
Y = df['Class']

X_sm, Y_sm = smote.fit_sample(X,Y)

sm_features = pd.DataFrame(data=X_sm,columns=X.columns)
sm_target = pd.DataFrame(data=Y_sm, columns = ['Class'])

frames = [sm_features, sm_target]
sm_data = pd.concat(frames,axis=1)

X = sm_features
Y = sm_target.values.ravel()

In [6]:
# Verify that classes are now balanaced
sm_data['Class'].value_counts()

1    284315
0    284315
Name: Class, dtype: int64

## Component Selection

### Mutual Information

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Instiantiate selector
selector = SelectKBest(mutual_info_classif, k=5)

# Fit selector to data
selector.fit(X,Y)

# Store indexes of selected parameters
idxs_selected = selector.get_support(indices=True)

# Store selected parameters
mutual_info = X[X.columns[[idxs_selected]]]

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selector = SelectKBest(f_classif, k=5)
selector.fit(X,Y)
idxs_selected = selector.get_support(indices=True)
f_class = X[X.columns[[idxs_selected]]]

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

svc = SVC(kernel="linear")
rfecv = RFECV(estimator=svc,step=1,cv=StratifiedKFold(2),scoring='accuracy')
rfecv.fit(X,Y)

print("Optimal number of features: %d" % rfecv.n_features_)

## Experimentation with Different Models

In [13]:
scores = []

### KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
knn = KNeighborsClassifier(n_neighbors=5,weights='distance')

start_time = time.time()
knn_scores = cross_val_score(knn, X, Y, cv=5)
print("--- %s seconds ---" % (time.time() - start_time))

knn_score = np.mean(knn_scores)
scores.append(knn_score)
print(knn_score)

--- 55.812994718551636 seconds ---
0.495932328579


### Random Forest

In [15]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()

start_time = time.time()
rfc_scores = cross_val_score(rfc,X,Y,cv=5)
print("--- %s seconds ---" % (time.time() - start_time))

rfc_score = np.mean(rfc_scores)
scores.append(rfc_score)
print(rfc_score)

--- 297.6033191680908 seconds ---
0.898763695197


### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X,Y)

start_time=time.time()
lr_scores = cross_val_score(lr,X,Y,cv=5)

print("--- %s seconds ---" % (time.time() - start_time))

lr_score = np.mean(lr_scores)
scores.append(lr_score)
print(lr_score)

--- 28.67907691001892 seconds ---
0.961053760793


### SVM

In [18]:
from sklearn.svm import LinearSVC
svc = LinearSVC()
svc.fit(X,Y)

start_time=time.time()
svc_scores = cross_val_score(svc,X,Y,cv=5)
print("--- %s seconds ---" % (time.time() - start_time))

svc_score = np.mean(svc_scores)
scores.append(svc_scores)
print(svc_scores)

--- 1131.5345861911774 seconds ---
[ 0.82689095  0.90215958  0.93891458  0.93190651  0.93539736]


In [19]:
print(svc_score)

0.907053795966


### Boosting

In [None]:
from sklearn import grid_search

parameters = {'n_estimators':list(range(1,501,50)),'max_depth':list(range(1,11,2)),'loss':('exponential','deviance')}
gradient_boost = ensemble.GradientBoostingClassifier()
gs = grid_search.GridSearchCV(gradient_boost, parameters,cv= 3,scoring="accuracy")
gs.fit(X,Y)

clf = ensemble.GradientBoostingClassifier(**gs.best_params_)
clf.fit(X, Y)

start_time = time.time()
clf_scores = cross_val_score(clf,X,Y,cv=5)
print("--- %s seconds ---" % (time.time() - start_time))

clf_score = np.mean(clf_scores)
scores.append(clf_scores)
print(clf_score)

## Evaluation

In [None]:
models = ['knn','random_forest','logistic_regression','svm','boosting']