dataset: https://www.kaggle.com/dalpozz/creditcardfraud

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display

FILE_NAME = 'creditcard.csv'

In [2]:
#load data
full_data = pd.read_csv(FILE_NAME)

#rename the 'Class' column
full_data.rename(columns = {'Class': 'Label'}, inplace = True)

#let's take a peek
print full_data.shape
full_data.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Label
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
#full_data.groupby('Label').hist(figsize = (20,20))
#pd.scatter_matrix(full_data)

In [3]:
from sklearn.utils import shuffle
full_data = shuffle(full_data)

labels = full_data['Label']
times = full_data['Time']
features = full_data.drop(['Time', 'Label'], axis=1)

print "Data consists of {} instances of data with {} total features with value counts of \n{}".format(
    features.shape[0], features.shape[1], labels.value_counts())
print "Where 0 indicates a legitimate transaction and 1 indicates fraud"

Data consists of 284807 instances of data with 29 total features with value counts of 
0    284315
1       492
Name: Label, dtype: int64
Where 0 indicates a legitimate transaction and 1 indicates fraud


In [4]:
from sklearn.preprocessing import StandardScaler

features['normAmount'] = StandardScaler().fit_transform(features['Amount'].reshape(-1, 1))


In [5]:
amounts = features['Amount']
features = features.drop(['Amount'], axis=1)
features.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,normAmount
88005,0.065203,0.560112,1.125474,0.540353,-0.536414,-0.320562,0.302465,0.057764,0.060539,0.017287,...,-0.108628,-0.051882,-0.043935,0.279408,0.550676,-1.400744,0.10911,0.047491,0.154411,-0.237525
167431,-0.873838,0.474974,-1.415775,-0.484232,1.016395,-0.760008,1.88926,0.044478,-1.530959,-0.754338,...,0.271137,0.537073,1.012363,-0.061086,0.792703,1.023667,0.670202,-0.31463,-0.170169,0.422119
59392,1.279947,-0.846119,0.426752,-0.84006,-0.996439,0.027175,-0.972928,0.20813,-0.651566,0.785956,...,0.039822,0.406552,0.999654,-0.170415,-0.30268,0.445933,0.002176,0.015554,0.002923,-0.192826
20131,1.225637,-0.649268,-0.119134,-0.513312,-0.797736,-0.561421,-0.522373,-0.006629,-0.759095,0.281007,...,0.213038,-0.014411,-0.255918,-0.120095,-0.074851,0.438338,-0.304013,-0.001022,0.035895,-0.009754
261110,-1.225787,-0.035555,1.673703,0.582572,-0.171355,0.438098,0.986569,0.154613,-0.249954,-0.805134,...,0.533896,0.372035,0.543561,0.149715,0.675198,0.74866,-0.393897,-0.004364,0.103384,0.606311


In [21]:
#?????????????????
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

In [None]:
# try resampling

#f1 scorer

#k-nn and random forest classifiers

#neural network

In [6]:
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, make_scorer, confusion_matrix

f1_scorer = make_scorer(f1_score, pos_label = 0)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = .3, random_state = 331)

print y_train.value_counts()
print y_test.value_counts()

0    199016
1       348
Name: Label, dtype: int64
0    85299
1      144
Name: Label, dtype: int64


In [8]:
rf = RandomForestClassifier()
knn = KNeighborsClassifier()

print "For Random Forest Classifier:"
rfscores = cross_val_score(rf, features, labels, scoring = f1_scorer)
print rfscores, rfscores.mean()

print "For K-Nearest Neighbors Classifier:"
knnscores = cross_val_score(knn, features, labels, scoring = f1_scorer)
print knnscores, knnscores.mean()


For Random Forest Classifier:
[ 0.99973097  0.9997679   0.99975208] 0.999750317137
For K-Nearest Neighbors Classifier:
[ 0.99971515  0.9997468   0.99973097] 0.999730973346


In [24]:
#majority vote benchmark
majority_vote_predictions = np.zeros(features.shape[0])
print "f1 score for simple majority vote is " , f1_score(labels, majority_vote_predictions, pos_label = 0)

f1 score for simple majority vote is  0.999135510488


In [None]:
# V this is stupid V

In [10]:
rf_params = {'n_estimators' : np.arange(10, 110, 15),
                'min_samples_split': np.arange(2, 8, 2),
                'max_features': np.arange(5, 29, 4),
                'criterion': ['gini', 'entropy']}

'''
knn_params = {'n_neighbors': np.arange(3, 10),
                'weights': ['uniform', 'distance'],
                'p': np.arange(1, 3)} 
''' 

rf_tune = RandomizedSearchCV(rf, rf_params, n_iter = 25, verbose = 3)
#knn_tune = RandomizedSearchCV(knn, knn_params, n_iter = 20, verbose = 3)

rf_tune = rf_tune.fit(features, labels)
#knn_tune = knn_tune.fit(features, labels)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] n_estimators=40, min_samples_split=4, criterion=gini, max_features=5 
[CV]  n_estimators=40, min_samples_split=4, criterion=gini, max_features=5, score=0.999547 -   0.5s
[CV] n_estimators=40, min_samples_split=4, criterion=gini, max_features=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


[CV]  n_estimators=40, min_samples_split=4, criterion=gini, max_features=5, score=0.999505 -   0.5s
[CV] n_estimators=40, min_samples_split=4, criterion=gini, max_features=5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.0min remaining:    0.0s


[CV]  n_estimators=40, min_samples_split=4, criterion=gini, max_features=5, score=0.999547 -   0.4s
[CV] n_estimators=85, min_samples_split=6, criterion=entropy, max_features=17 
[CV]  n_estimators=85, min_samples_split=6, criterion=entropy, max_features=17, score=0.999568 -   0.8s
[CV] n_estimators=85, min_samples_split=6, criterion=entropy, max_features=17 
[CV]  n_estimators=85, min_samples_split=6, criterion=entropy, max_features=17, score=0.999558 -   0.8s
[CV] n_estimators=85, min_samples_split=6, criterion=entropy, max_features=17 
[CV]  n_estimators=85, min_samples_split=6, criterion=entropy, max_features=17, score=0.999600 -   0.7s
[CV] n_estimators=10, min_samples_split=2, criterion=entropy, max_features=25 
[CV]  n_estimators=10, min_samples_split=2, criterion=entropy, max_features=25, score=0.999484 -   0.0s
[CV] n_estimators=10, min_samples_split=2, criterion=entropy, max_features=25 
[CV]  n_estimators=10, min_samples_split=2, criterion=entropy, max_features=25, score=0.9

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 430.2min finished


In [13]:
print rf_tune.best_estimator_ , '\nf1 score:' , rf_tune.best_score_
#print knn_tune.best_estimator_ + '\nf1 score:' + knn_tune.best_score_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=17, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            n_estimators=55, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False) 
f1 score: 0.999585684341


In [19]:
rft = rf_tune.best_estimator_
rft.fit_transform(X_train, y_train)

rfu = RandomForestClassifier()
rfu.fit_transform(X_train, y_train)



array([[-0.36807225, -0.80535119, -0.20908081, ..., -0.30532203,
         1.06654398,  0.06716665],
       [-0.27124035,  0.55946789, -0.54494174, ..., -0.28095942,
         1.33383085,  0.24786847],
       [ 0.80936887,  0.61689589,  0.74584106, ...,  0.5148158 ,
        -0.30145523,  0.41496762],
       ..., 
       [ 0.42375799,  0.36419524,  2.19072329, ...,  1.21963372,
         1.59622618,  0.7538579 ],
       [-0.07676136, -0.50647389,  0.07786699, ..., -1.50003717,
        -0.04110927,  0.41677922],
       [ 0.4690207 , -1.22739193, -0.56715424, ..., -2.73527709,
         0.98445114,  1.40371771]])

In [20]:
# Check performances of tuned and untuned models
print "f1 testing score for tuned random forest is ", f1_score(y_test, rft.predict(X_test), pos_label = 0)

print "f1 testing score for random forest is " , f1_score(y_test, rfu.predict(X_test), pos_label = 0)

f1 testing score for tuned random forest is  0.999759709776
f1 testing score for random forest is  0.99976556914


oops, looks like I could have chosen a better selection of hyper parameters for the randomized search cross validation optimization

In [22]:
confusion_matrix(y_test, rfu.predict(X_test))

array([[85293,     6],
       [   34,   110]])

In [None]:
#now let's try a neural network

In [36]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from keras.regularizers import l2
from keras.utils.np_utils import to_categorical

In [None]:
# one hot or nah? idk

In [41]:
# Let's build a model

#add regularization
model = Sequential()
model.add(Dense(40, input_dim = X_train.shape[1], activation = 'tanh', init = 'lecun_uniform'))
model.add(Dropout(0.3))
model.add(Dense(20, activation = 'tanh', init = 'lecun_uniform'))
model.add(Dropout(0.1))
model.add(Dense(8, activation = 'tanh', init = 'lecun_uniform'))
model.add(Dense(output_dim = 1, activation = 'sigmoid'))

sgd = SGD(lr = .1, momentum = .8, decay = .001)

model.compile(optimizer = sgd, loss = 'binary_crossentropy', metrics = ['fmeasure'])

In [42]:
history = model.fit(X_train.values, y_train.values, nb_epoch = 200, batch_size = 2000, verbose = 2, validation_split = .20,
                    shuffle = True)

Train on 159491 samples, validate on 39873 samples
Epoch 1/200
1s - loss: 0.0703 - fmeasure: 4.7590e-05 - val_loss: 0.0144 - val_fmeasure: 0.0000e+00
Epoch 2/200
1s - loss: 0.0120 - fmeasure: 0.0000e+00 - val_loss: 0.0109 - val_fmeasure: 0.0000e+00
Epoch 3/200
1s - loss: 0.0090 - fmeasure: 0.0000e+00 - val_loss: 0.0077 - val_fmeasure: 0.0000e+00
Epoch 4/200
1s - loss: 0.0065 - fmeasure: 0.1996 - val_loss: 0.0058 - val_fmeasure: 0.5606
Epoch 5/200
1s - loss: 0.0053 - fmeasure: 0.5088 - val_loss: 0.0052 - val_fmeasure: 0.6576
Epoch 6/200
1s - loss: 0.0047 - fmeasure: 0.6381 - val_loss: 0.0049 - val_fmeasure: 0.6917
Epoch 7/200
1s - loss: 0.0043 - fmeasure: 0.6551 - val_loss: 0.0048 - val_fmeasure: 0.6917
Epoch 8/200
1s - loss: 0.0041 - fmeasure: 0.6708 - val_loss: 0.0047 - val_fmeasure: 0.7123
Epoch 9/200
1s - loss: 0.0041 - fmeasure: 0.6819 - val_loss: 0.0046 - val_fmeasure: 0.7123
Epoch 10/200
1s - loss: 0.0040 - fmeasure: 0.7354 - val_loss: 0.0046 - val_fmeasure: 0.7123
Epoch 11/200
1