In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as gbm
from sklearn.linear_model import LogisticRegression as logr
from sklearn.svm import SVC

from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, roc_auc_score

from IPython.display import display

In [2]:
df = pd.read_csv('fraud_data.csv')
df['signup_time'] = pd.to_datetime(df['signup_time'])
df['purchase_time'] = pd.to_datetime(df['purchase_time'])
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [3]:
print '%d fraud records out of %d total' % (df['class'].sum(), len(df))

1929 fraud records out of 20000 total


In [4]:
df.nunique()

user_id           20000
signup_time       20000
purchase_time     19995
purchase_value      110
device_id         19348
source                3
browser               5
sex                   2
age                  53
ip_address        19430
class                 2
dtype: int64

In [5]:
df['timedelta'] = df['purchase_time'] - df['signup_time']
df['timedelta'] = map(lambda x: x.total_seconds(), df['timedelta'])

In [6]:
df['duplicate_ip'] = map(lambda ip: 1 if df.loc[df['ip_address']==ip, 'ip_address'].count()>1 else 0, 
                        df['ip_address'])
df['duplicate_purchase_time'] = map(lambda t: 1 if df.loc[df['purchase_time']==t, 'purchase_time'].count()>1 else 0, 
                                    df['purchase_time'])
df['duplicate_device_id'] = map(lambda i: 1 if df.loc[df['device_id']==i, 'device_id'].count()>1 else 0, 
                                df['device_id'])
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,timedelta,duplicate_ip,duplicate_purchase_time,duplicate_device_id
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,4506682.0,0,0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,17944.0,0,0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,1.0,1,0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,492085.0,0,0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,4361461.0,0,0,0


In [7]:
print df['duplicate_ip'].sum()
print df['duplicate_purchase_time'].sum()
print df['duplicate_device_id'].sum()

919
10
1080


In [8]:
dummy_col = ['source', 'browser', 'sex']
df_dummy = pd.get_dummies(df[dummy_col])
dummy_cols = df_dummy.columns

get_one_col = lambda s, cols: [k for k in cols if k.startswith(s)][0]
rm_cols = map(lambda dummy: get_one_col(dummy, dummy_cols), dummy_col)

kp_cols = [col for col in dummy_cols if col not in rm_cols]
print rm_cols, kp_cols

df_dummy = df_dummy[kp_cols]

['source_Ads', 'browser_Chrome', 'sex_F'] ['source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_M']


In [9]:
df2 = df.join(df_dummy)
display(df2.head())
print df2.columns

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,...,duplicate_ip,duplicate_purchase_time,duplicate_device_id,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,...,0,0,0,0,1,0,0,0,0,1
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,...,0,0,0,0,0,0,0,0,0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,...,1,0,1,0,1,0,0,1,0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,...,0,0,0,0,1,0,0,0,1,1
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,...,0,0,0,0,0,0,0,0,1,1


Index([u'user_id', u'signup_time', u'purchase_time', u'purchase_value',
       u'device_id', u'source', u'browser', u'sex', u'age', u'ip_address',
       u'class', u'timedelta', u'duplicate_ip', u'duplicate_purchase_time',
       u'duplicate_device_id', u'source_Direct', u'source_SEO',
       u'browser_FireFox', u'browser_IE', u'browser_Opera', u'browser_Safari',
       u'sex_M'],
      dtype='object')


In [10]:
targ_col = ['class']
feat_col = ['purchase_value', 'age', 'timedelta', 'duplicate_ip', 'duplicate_purchase_time', 'duplicate_device_id']
feat_col += kp_cols
print feat_col

['purchase_value', 'age', 'timedelta', 'duplicate_ip', 'duplicate_purchase_time', 'duplicate_device_id', 'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_M']


In [11]:
X = df2[feat_col]
y = df2[targ_col]

In [12]:
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42, stratify = y)
print float(y_test.values.sum())/len(y_test)
print float(y.values.sum()) / len(y)
print y_test.values.sum()
print len(y_test)

0.0965
0.09645
386
4000


#### Random Forest

In [13]:
clf = rf(n_estimators=500, max_depth = 10, oob_score=True)
clf.fit(X_train, y_train)
print 'Out of bag error', clf.oob_score_

y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)
xt
print 'Accuracy', accuracy_score(y_true=y_test, y_pred=y_pred)
print 'Confusion Matrix\n', confusion_matrix(y_true=y_test, y_pred=y_pred)
print 'Log loss', log_loss(y_true=y_test, y_pred=y_pred_prob)
print 'ROC AUC', roc_auc_score(y_true=y_test, y_score=y_pred_prob[:, 1])

  


Out of bag error 0.956125
Accuracy 0.95475
Confusion Matrix
[[3613    1]
 [ 180  206]]
Log loss 0.182401244412
ROC AUC 0.775986305416


#### GBM

In [14]:
clf = gbm(n_estimators=500, max_depth = 10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

print 'Accuracy', accuracy_score(y_true=y_test, y_pred=y_pred)
print 'Confusion Matrix\n', confusion_matrix(y_true=y_test, y_pred=y_pred)
print 'Log loss', log_loss(y_true=y_test, y_pred=y_pred_prob)
print 'ROC AUC', roc_auc_score(y_true=y_test, y_score=y_pred_prob[:, 1])

  y = column_or_1d(y, warn=True)


Accuracy 0.952
Confusion Matrix
[[3602   12]
 [ 180  206]]
Log loss 0.346991730188
ROC AUC 0.751438705552


#### Logistic Regression

In [15]:
clf = logr(class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

print 'Accuracy', accuracy_score(y_true=y_test, y_pred=y_pred)
print 'Confusion Matrix\n', confusion_matrix(y_true=y_test, y_pred=y_pred)
print 'Log loss', log_loss(y_true=y_test, y_pred=y_pred_prob)
print 'ROC AUC', roc_auc_score(y_true=y_test, y_score=y_pred_prob[:, 1])

Accuracy 0.9035
Confusion Matrix
[[3614    0]
 [ 386    0]]
Log loss 0.489842148271
ROC AUC 0.762389211787


#### SVM

In [16]:
clf = SVC(kernel='rbf', probability=True)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

print 'Accuracy', accuracy_score(y_true=y_test, y_pred=y_pred)
print 'Confusion Matrix\n', confusion_matrix(y_true=y_test, y_pred=y_pred)
print 'Log loss', log_loss(y_true=y_test, y_pred=y_pred_prob)
print 'ROC AUC', roc_auc_score(y_true=y_test, y_score=y_pred_prob[:, 1])

Accuracy 0.95425
Confusion Matrix
[[3614    0]
 [ 183  203]]
Log loss 0.181783065572
ROC AUC 0.767095291483
