# using logistic regression and knn to predict credit card fraud

In [49]:
import sys
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 999)
pd.set_option('display.width', 500)
pd.set_option('display.notebook_repr_html', True)
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Note --  Requires sklearn version .18 or higher  
from sklearn import metrics, datasets
from collections import Counter
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
%matplotlib inline

assert(sys.version_info.major==3),print(sys.version)
# Python 3 or higher is required

# Read data from .csv

In [50]:
df = pd.read_csv("creditcard.csv")

# See the top N lines of the dataframe

In [51]:
# show top 5 lines by default
df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [52]:
# or you could also specify the N
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# 1. normalize the data


In [100]:
def normalize(x):
    num = x - np.min(x)
    denom = np.max(x) - np.min(x)
    return (num / denom)

df.iloc[:, 0:30] = normalize(df.iloc[:, 0:30])

df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,0.935192,0.76649,0.881365,0.313023,0.763439,0.267669,0.266815,0.786444,0.475312,...,0.561184,0.522992,0.663793,0.391253,0.585122,0.394557,0.418976,0.312697,0.005824,0
1,0.0,0.978542,0.770067,0.840298,0.271796,0.76612,0.262192,0.264875,0.786298,0.453981,...,0.55784,0.480237,0.666938,0.33644,0.58729,0.446013,0.416345,0.313423,0.000105,0
2,6e-06,0.935217,0.753118,0.868141,0.268766,0.762329,0.281122,0.270177,0.788042,0.410603,...,0.565477,0.54603,0.678939,0.289354,0.559515,0.402727,0.415489,0.311911,0.014739,0
3,6e-06,0.941878,0.765304,0.868484,0.213661,0.765647,0.275559,0.266803,0.789434,0.414999,...,0.559734,0.510277,0.662607,0.223826,0.614245,0.389197,0.417669,0.314371,0.004807,0
4,1.2e-05,0.938617,0.77652,0.864251,0.269796,0.762975,0.263984,0.268968,0.782484,0.49095,...,0.561327,0.547271,0.663392,0.40127,0.566343,0.507497,0.420561,0.31749,0.002724,0


# 2. play with group

In [54]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,0.548717,0.958294,0.767258,0.837414,0.25193,0.765716,0.26302,0.265356,0.785385,0.46278,...,0.56148,0.510031,0.665434,0.382234,0.577907,0.425448,0.416511,0.313124,0.003439,0.001727
std,0.274828,0.033276,0.017424,0.026275,0.062764,0.009292,0.013395,0.007537,0.012812,0.037846,...,0.011841,0.033854,0.009274,0.081611,0.029261,0.078771,0.00745,0.006698,0.009736,0.041527
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.313681,0.942658,0.760943,0.821985,0.214311,0.76106,0.255295,0.26198,0.783148,0.440626,...,0.557798,0.48473,0.66303,0.334454,0.560104,0.372036,0.415203,0.312049,0.000218,0.0
50%,0.490138,0.958601,0.767949,0.84053,0.25105,0.765351,0.260263,0.2656,0.785625,0.461008,...,0.561005,0.510347,0.665267,0.387756,0.578838,0.416932,0.416536,0.313352,0.000856,0.0
75%,0.80629,0.980645,0.775739,0.855213,0.284882,0.769836,0.267027,0.268831,0.788897,0.48335,...,0.564484,0.534688,0.667626,0.44146,0.597593,0.464807,0.418191,0.314712,0.003004,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [55]:
# creat a list of the indexes of fraud behaviour data
fraudlist = df[df['Class']==1].index.values
fraudlist

np.random.shuffle(fraudlist)

# creat a list of the indexes of nonfraud behaviour data
nonfraudlist = df[df['Class']==0].index.values

np.random.shuffle(nonfraudlist)

s = np.append(fraudlist, nonfraudlist, axis = 0)
len(s)

284807

# 3. split the data

In [101]:
def split_data(data, train_size=.7):
    # creat a list of the indexes of fraud&nonfraud behaviour data
    index1 = data[data['Class']==1].index.values
    index0 = data[data['Class']==0].index.values
    # Determine the number of observations we have in each data set:
    length1 = len(index1)
    length0 = len(index0)
    
    # Use numpy's random.shuffle() function to randomly shuffle over our index:
    np.random.shuffle(index1)
    np.random.shuffle(index0)
    
    # Create a list for the first 70% of the shuffled indices and set to training: 
    train_indices1 = index1[0:int(length1 * train_size)]
    train_indices0 = index0[0:int(length0 * train_size)]
    train_indices = np.append(train_indices1, train_indices0, axis = 0)
    
    # Create a list for the remaining 30% of the shuffled indices and set to testing:
    test_indices1 = index1[int(length1 * train_size):]
    test_indices0 = index0[int(length0 * train_size):]
    test_indices = np.append(test_indices1,test_indices0, axis = 0)
    
    # Use the list of training indices to find the corresponding data entries:
    train = data.iloc[train_indices]
    # Use the list of testing indices to find the corresponding data entries:
    test = data.iloc[test_indices]
    
    # Return two dataframes, one with the testing data and one with the training data:
    return train, test

We will now run the function and see if it returns actually what we want:

In [102]:
df_train,df_test  = split_data(df)
df_train.shape

(199364, 31)

## 4. Using Linear Regression:

We will use the training/testing dataset as before and create our linear regression objects.

In [103]:
features_cols = [col for col in df.columns if col != 'Class']
x_train, x_test = df_train[features_cols], df_test[features_cols]
y_train, y_test = df_train['Class'], df_test['Class']

len(y_test[y_test.values==1])

148

In [104]:
from statsmodels.api import OLS
# We must first create the linear regression object from sklearn:
regr = LinearRegression()
# Then, we will put in the training sets in for the .fit() function:
regr.fit(x_train, y_train)
# This prints the regression coefficients of our model:
print(regr.coef_)

[-5.48044535e-04 -1.06857468e-01  2.79539401e-01 -2.83738429e-01
  8.59107253e-02 -3.10501152e-01 -1.73518333e-01 -1.13018715e+00
  7.43727202e-02 -1.04825383e-01 -3.90937884e-01  1.07303819e-01
 -2.89043811e-01 -2.81008188e-03 -3.95979371e-01 -3.01018702e-03
 -2.94948096e-01 -5.49953055e-01 -8.14370914e-02  2.46442300e-02
  1.26580674e-02  1.23870980e-01  6.44777430e-03  1.67742905e-02
 -3.15256956e-03  5.21159256e-03  2.28961588e-03  9.01944443e-02
  6.12747132e-02  2.25494005e-01]


In [105]:
import statsmodels.api as sm
# We must first create the linear regression object from stats model:
model = sm.OLS(y_train.values, x_train)
regr = model.fit()
# This prints the regression coefficients of our model:
regr.params

Time      0.000131
V1        0.021298
V2        0.811385
V3       -0.123093
V4        0.064820
V5        0.582777
V6       -0.366069
V7       -1.773655
V8        0.339967
V9       -0.077239
V10      -0.292964
V11       0.110201
V12      -0.269658
V13      -0.001271
V14      -0.390522
V15       0.000208
V16      -0.275200
V17      -0.515761
V18      -0.084377
V19       0.041759
V20      -0.308197
V21       0.116787
V22       0.047849
V23       0.439953
V24      -0.002131
V25       0.051312
V26       0.005113
V27       0.261656
V28       0.247924
Amount    1.739513
dtype: float64

In [106]:
# To compute the mean squared error (notice that we are now using the TEST set):
np.mean((regr.predict(x_test)-y_test)**2)

0.000814738389190094

In [107]:
regr.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.472
Model:,OLS,Adj. R-squared:,0.472
Method:,Least Squares,F-statistic:,5935.0
Date:,"Fri, 09 Mar 2018",Prob (F-statistic):,0.0
Time:,19:44:24,Log-Likelihood:,414940.0
No. Observations:,199364,AIC:,-829800.0
Df Residuals:,199334,BIC:,-829500.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Time,0.0001,0.000,0.387,0.699,-0.001,0.001
V1,0.0213,0.002,9.109,0.000,0.017,0.026
V2,0.8114,0.007,124.347,0.000,0.799,0.824
V3,-0.1231,0.003,-38.312,0.000,-0.129,-0.117
V4,0.0648,0.001,57.161,0.000,0.063,0.067
V5,0.5828,0.009,62.747,0.000,0.565,0.601
V6,-0.3661,0.006,-59.996,0.000,-0.378,-0.354
V7,-1.7737,0.014,-124.406,0.000,-1.802,-1.746
V8,0.3400,0.005,67.827,0.000,0.330,0.350

0,1,2,3
Omnibus:,416269.667,Durbin-Watson:,0.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4213318032.96
Skew:,17.612,Prob(JB):,0.0
Kurtosis:,714.316,Cond. No.,1240.0


# Instead of focusing on $R^2$, let’s look at the classification stats..

In [111]:
predicted = np.round(regr.predict(x_test))
expected = y_test
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

       -1.0       0.00      0.00      0.00         0
        0.0       1.00      1.00      1.00     85295
        1.0       0.86      0.45      0.59       148

avg / total       1.00      1.00      1.00     85443



  'recall', 'true', average, warn_for)


In [110]:
predicted.describe()

count    85443.000000
mean         0.000889
std          0.030201
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
dtype: float64

In [112]:
pd.DataFrame(metrics.confusion_matrix(expected, predicted))

Unnamed: 0,0,1,2
0,0,0,0
1,1,85283,11
2,0,82,66


# 5.using knn to predict

In [96]:
# Set kNN parameter:
k = 10

# Now we can fit the model, predict our variable of interest, and then evaluate our fit:
# First, we create the classifier object:
neighbors = KNeighborsClassifier(n_neighbors=k)

# Then, we fit the model using x_train as training data and y_train as target values:
neighbors.fit(x_train, y_train)

# Retreieve our predictions:
prediction_knn = neighbors.predict(x_test)

# This returns the mean accuracy on the given test data and labels, or in other words, 
# the R squared value -- A constant model that always predicts the expected value of y, 
# disregarding the input features, would get a R^2 score of 1.
r = neighbors.score(x_test, y_test)
r

0.9994382219725431

In [97]:
expected_knn = y_test
predicted_knn = neighbors.predict(x_test)
print(metrics.classification_report(expected_knn, predicted_knn))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     85295
          1       0.90      0.76      0.82       148

avg / total       1.00      1.00      1.00     85443



In [98]:
pd.DataFrame(metrics.confusion_matrix(expected_knn, predicted_knn))

Unnamed: 0,0,1
0,85282,13
1,35,113
