# Summary of Fraud Detection

* Data is highly imbalanced
* Features of interval_after_signup and geolocation aggregation are highly predictive of fraud
* Made actionable operation recommendations/proposal for business

# Data Exploration

In [132]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import pandas_profiling

pd.set_option('display.max_columns', 500)

# Import and store dataset
fraud_data = pd.read_csv('/Users/qiuyu/Desktop/TCR_Intern/QiuYu/data/imbalancedFraudDF.csv')
ipToCountry = pd.read_csv('/Users/qiuyu/Desktop/TCR_Intern/QiuYu/data/IpAddress_to_Country.csv')

In [133]:
import warnings
warnings.filterwarnings("ignore")

In [134]:
# Distribution of the label column
fraud_data['class'].value_counts()

0    136961
1      1415
Name: class, dtype: int64

In [135]:
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
3,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
4,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0


In [136]:
# Inline summary report without saving report as object
pandas_profiling.ProfileReport(fraud_data)

0,1
Number of variables,11
Number of observations,138376
Total Missing (%),0.0%
Total size in memory,11.6 MiB
Average record size in memory,88.0 B

0,1
Numeric,4
Categorical,5
Boolean,1
Date,0
Text (Unique),1
Rejected,0
Unsupported,0

0,1
Distinct count,58
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,33.126
Minimum,18
Maximum,76
Zeros (%),0.0%

0,1
Minimum,18
5-th percentile,20
Q1,27
Median,33
Q3,39
95-th percentile,48
Maximum,76
Range,58
Interquartile range,12

0,1
Standard deviation,8.6236
Coef of variation,0.26033
Kurtosis,-0.17773
Mean,33.126
MAD,7.0076
Skewness,0.42729
Sum,4583825
Variance,74.367
Memory size,1.1 MiB

Value,Count,Frequency (%),Unnamed: 3
31,6047,4.4%,
32,6022,4.4%,
33,5976,4.3%,
30,5868,4.2%,
34,5813,4.2%,
29,5812,4.2%,
28,5733,4.1%,
35,5713,4.1%,
36,5470,4.0%,
27,5403,3.9%,

Value,Count,Frequency (%),Unnamed: 3
18,2514,1.8%,
19,2692,1.9%,
20,3091,2.2%,
21,3475,2.5%,
22,3753,2.7%,

Value,Count,Frequency (%),Unnamed: 3
71,2,0.0%,
72,2,0.0%,
73,1,0.0%,
74,1,0.0%,
76,1,0.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Chrome,55993
IE,33836
Safari,22670
Other values (2),25877

Value,Count,Frequency (%),Unnamed: 3
Chrome,55993,40.5%,
IE,33836,24.5%,
Safari,22670,16.4%,
FireFox,22500,16.3%,
Opera,3377,2.4%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.010226

0,1
0,136961
1,1415

Value,Count,Frequency (%),Unnamed: 3
0,136961,99.0%,
1,1415,1.0%,

0,1
Distinct count,134121
Unique (%),96.9%
Missing (%),0.0%
Missing (n),0

0,1
QTXDJHIIXYVQN,7
BWSMVSLCJXMCM,6
WBBPGFKHVUYEU,5
Other values (134118),138358

Value,Count,Frequency (%),Unnamed: 3
QTXDJHIIXYVQN,7,0.0%,
BWSMVSLCJXMCM,6,0.0%,
WBBPGFKHVUYEU,5,0.0%,
XWNCGVXNZVIGQ,5,0.0%,
ULGTRBHXSTOEV,5,0.0%,
HOBZLEUMZUDEB,5,0.0%,
HCYSLYNRFLAXU,5,0.0%,
JCCAZQKFWLFBL,5,0.0%,
JVZMHFTYWUJBE,5,0.0%,
ZYZQZXBXADPST,5,0.0%,

0,1
Distinct count,137653
Unique (%),99.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2154400000
Minimum,52093
Maximum,4294900000
Zeros (%),0.0%

0,1
Minimum,52093
5-th percentile,209350000
Q1,1085100000
Median,2156500000
Q3,3249200000
95-th percentile,4088000000
Maximum,4294900000
Range,4294800000
Interquartile range,2164100000

0,1
Standard deviation,1250600000
Coef of variation,0.58047
Kurtosis,-1.2148
Mean,2154400000
MAD,1083700000
Skewness,-0.007369
Sum,298110000000000
Variance,1.5639e+18
Memory size,1.1 MiB

Value,Count,Frequency (%),Unnamed: 3
1954600796.05912,7,0.0%,
2937899119.50045,6,0.0%,
2647792501.42565,5,0.0%,
799584366.227038,5,0.0%,
1120496524.56385,5,0.0%,
4149333595.43826,5,0.0%,
1537339184.26314,5,0.0%,
4204700290.7123604,5,0.0%,
1977076202.65662,5,0.0%,
1797069085.53522,5,0.0%,

Value,Count,Frequency (%),Unnamed: 3
52093.4968949854,1,0.0%,
93447.13896136609,1,0.0%,
105818.501505225,1,0.0%,
117566.66486748,1,0.0%,
131423.789041531,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
4294673680.7675,1,0.0%,
4294714854.85489,1,0.0%,
4294719533.35461,1,0.0%,
4294822241.8758297,1,0.0%,
4294850499.67884,1,0.0%,

0,1
Distinct count,137985
Unique (%),99.7%
Missing (%),0.0%
Missing (n),0

0,1
2015-09-10 09:04:53,3
2015-06-08 09:42:04,3
2015-08-02 06:06:04,2
Other values (137982),138368

Value,Count,Frequency (%),Unnamed: 3
2015-09-10 09:04:53,3,0.0%,
2015-06-08 09:42:04,3,0.0%,
2015-08-02 06:06:04,2,0.0%,
2015-04-02 05:14:10,2,0.0%,
2015-04-07 05:25:28,2,0.0%,
2015-03-27 18:37:51,2,0.0%,
2015-07-24 02:08:45,2,0.0%,
2015-05-15 05:30:09,2,0.0%,
2015-09-27 11:17:08,2,0.0%,
2015-04-12 14:04:05,2,0.0%,

0,1
Distinct count,122
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,36.939
Minimum,9
Maximum,154
Zeros (%),0.0%

0,1
Minimum,9
5-th percentile,12
Q1,22
Median,35
Q3,49
95-th percentile,71
Maximum,154
Range,145
Interquartile range,27

0,1
Standard deviation,18.321
Coef of variation,0.49598
Kurtosis,0.16021
Mean,36.939
MAD,14.876
Skewness,0.67599
Sum,5111469
Variance,335.66
Memory size,1.1 MiB

Value,Count,Frequency (%),Unnamed: 3
28,3041,2.2%,
26,2931,2.1%,
27,2929,2.1%,
30,2920,2.1%,
24,2898,2.1%,
29,2850,2.1%,
34,2848,2.1%,
32,2832,2.0%,
25,2823,2.0%,
22,2795,2.0%,

Value,Count,Frequency (%),Unnamed: 3
9,2121,1.5%,
10,2060,1.5%,
11,2229,1.6%,
12,2269,1.6%,
13,2362,1.7%,

Value,Count,Frequency (%),Unnamed: 3
128,3,0.0%,
129,2,0.0%,
132,1,0.0%,
140,1,0.0%,
154,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
M,80693
F,57683

Value,Count,Frequency (%),Unnamed: 3
M,80693,58.3%,
F,57683,41.7%,

First 3 values

Last 3 values

Value,Count,Frequency (%),Unnamed: 3
2015-01-01 00:00:42,1,0.0%,
2015-01-01 00:00:46,1,0.0%,
2015-01-01 00:05:19,1,0.0%,
2015-01-01 00:07:11,1,0.0%,
2015-01-01 00:08:56,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2015-08-18 04:25:00,1,0.0%,
2015-08-18 04:29:35,1,0.0%,
2015-08-18 04:31:58,1,0.0%,
2015-08-18 04:37:34,1,0.0%,
2015-08-18 04:40:29,1,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
SEO,55766
Ads,54913
Direct,27697

Value,Count,Frequency (%),Unnamed: 3
SEO,55766,40.3%,
Ads,54913,39.7%,
Direct,27697,20.0%,

0,1
Distinct count,138376
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,200150
Minimum,2
Maximum,400000
Zeros (%),0.0%

0,1
Minimum,2
5-th percentile,20234
Q1,100890
Median,200000
Q3,299750
95-th percentile,379900
Maximum,400000
Range,399998
Interquartile range,198850

0,1
Standard deviation,115230
Coef of variation,0.5757
Kurtosis,-1.1945
Mean,200150
MAD,99727
Skewness,0.00041272
Sum,27695822486
Variance,13277000000
Memory size,1.1 MiB

Value,Count,Frequency (%),Unnamed: 3
264191,1,0.0%,
183539,1,0.0%,
212173,1,0.0%,
251088,1,0.0%,
253137,1,0.0%,
257239,1,0.0%,
327188,1,0.0%,
135060,1,0.0%,
105884,1,0.0%,
393974,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2,1,0.0%,
4,1,0.0%,
8,1,0.0%,
9,1,0.0%,
12,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
399992,1,0.0%,
399993,1,0.0%,
399995,1,0.0%,
399997,1,0.0%,
400000,1,0.0%,

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
3,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
4,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0


## Missing values

In [137]:
# Count of NaN in each column
fraud_data.isna().sum()

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

In [138]:
# Identify country info based on ip_address
ipToCountry.head()

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [139]:
# Match the country and ip address
start = time.time()

countries = []
for i in range(len(fraud_data)):
    ip_address = fraud_data.loc[i, 'ip_address'] #number
    #below [] is list of T/F, tmp is a df of size n * 3, n = len(tmp) is 1 match or 0 no match
    tmp = ipToCountry[(ipToCountry['lower_bound_ip_address'] <= ip_address) &
                    (ipToCountry['upper_bound_ip_address'] >= ip_address)]
    if len(tmp) == 1:#found match
        countries.append(tmp['country'].values[0])
    else:#no match
        countries.append('NA')
        
fraud_data['country'] = countries
print("Lookup took", time.time() - start, "seconds.")

Lookup took 382.95719599723816 seconds.


In [140]:
ip_address = fraud_data.loc[6, 'ip_address']
tmp = ipToCountry[(ipToCountry['lower_bound_ip_address'] <= ip_address) &
                    (ipToCountry['upper_bound_ip_address'] >= ip_address)]
print(tmp)

       lower_bound_ip_address  upper_bound_ip_address        country
28203            1.686110e+09              1694498815  United States


In [141]:
# Check the frequency of each country
print(fraud_data.country.value_counts())

United States                      53023
NA                                 20291
China                              11110
Japan                               6664
United Kingdom                      4054
Korea Republic of                   3817
Germany                             3412
France                              2900
Brazil                              2716
Canada                              2661
Italy                               1795
Australia                           1695
Netherlands                         1566
Russian Federation                  1490
India                               1179
Taiwan; Republic of China (ROC)     1143
Mexico                               996
Sweden                               974
Spain                                960
South Africa                         769
Switzerland                          720
Poland                               694
Argentina                            606
Indonesia                            597
Colombia        

# Feature Engineering

## Check if column user_id is unique(no dup) for time related aggregates


In [142]:
print (fraud_data.user_id.nunique())
print (len(fraud_data.index))
# All of the user_id has only the first 1 transaction, difficult to do time based aggregates, 
# e.g. amount/counts in past 1 month for a user

138376
138376


In [143]:
# Time related features: can be done before split, as they has no interaction between other rows, 
# solely based on other columns of the same row
fraud_data['interval_after_signup'] = (pd.to_datetime(fraud_data['purchase_time']) - pd.to_datetime(
        fraud_data['signup_time'])).dt.total_seconds()

fraud_data['signup_days_of_year'] = pd.DatetimeIndex(fraud_data['signup_time']).dayofyear

# bed time operation
fraud_data['signup_seconds_of_day'] = pd.DatetimeIndex(fraud_data['signup_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['signup_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['signup_time']).hour

fraud_data['purchase_days_of_year'] = pd.DatetimeIndex(fraud_data['purchase_time']).dayofyear
fraud_data['purchase_seconds_of_day'] = pd.DatetimeIndex(fraud_data['purchase_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['purchase_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['purchase_time']).hour

fraud_data = fraud_data.drop(['user_id','signup_time','purchase_time'], axis=1)

In [144]:
fraud_data.head()

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
0,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,4506682.0,55,82549,108,10031
1,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17944.0,158,74390,159,5934
2,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,,492085.0,118,76405,124,50090
3,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,4361461.0,202,25792,252,67253
4,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0,Canada,4240931.0,141,21783,190,29114


In [145]:
# Check the types of source
print(fraud_data.source.value_counts())

SEO       55766
Ads       54913
Direct    27697
Name: source, dtype: int64


# Train/Test Split

In [146]:
# If there is another new level in test but not in train, it will cause info leak if we do encoding before split

In [147]:
y = fraud_data['class']
X = fraud_data.drop(['class'], axis=1)

#split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)

X_train.shape: (110700, 13)
y_train.shape: (110700,)


In [148]:
X_train.head()

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
29343,12,OULPAZAFRFPXP,Ads,Chrome,M,42,3690922000.0,Korea Republic of,3499664.0,183,67384,224,24648
12190,10,AIIWMFEYQQIEB,Ads,Opera,M,29,1686759000.0,United States,6766039.0,5,78146,84,18585
19388,34,VUVETBUPCIWJE,Direct,Chrome,M,53,4138429000.0,,5870515.0,197,81354,265,76669
89104,48,QCFULAJOYKFUU,Ads,Chrome,M,29,96173370.0,France,2145618.0,160,30920,185,16538
82082,44,IHRWLMIJMEEEU,Ads,FireFox,M,24,1936025000.0,China,7079059.0,111,71897,193,66156


# Convert categorical features with high cardinality to numericals

In [149]:
# Converting needs to be done after split 
# One-hot encoding for sourse and browser since there are only a few types
X_train = pd.get_dummies(X_train, columns=['source', 'browser'])# source and browser are auto dropped by get_dummies 
X_train['sex'] = (X_train.sex == 'M').astype(int)

# The more a device is shared, the more suspicious
X_train_device_id_mapping = X_train.device_id.value_counts(dropna=False)
X_train['n_dev_shared'] = X_train.device_id.map(X_train_device_id_mapping)

# The more a ip is shared, the more suspicious
X_train_ip_address_mapping = X_train.ip_address.value_counts(dropna=False)
X_train['n_ip_shared'] = X_train.ip_address.map(X_train_ip_address_mapping)

# The less visit from a country, the more suspicious
X_train_country_mapping = X_train.country.value_counts(dropna=False)
X_train['n_country_shared'] = X_train.country.map(X_train_country_mapping)#lots of NAs in country column, #without dropna=False will produce nan in this col

X_train = X_train.drop(['device_id','ip_address','country'], axis=1)

In [150]:
# The same operations on testing data, one-hot encoding for source and browser
X_test = pd.get_dummies(X_test, columns=['source', 'browser'])
X_test['sex'] = (X_test.sex == 'M').astype(int)

# The more a device is shared, the more suspicious
X_test['n_dev_shared'] = X_test.device_id.map(X_test.device_id.value_counts(dropna=False))

# The more a ip is shared, the more suspicious
X_test['n_ip_shared'] = X_test.ip_address.map(X_test.ip_address.value_counts(dropna=False))

# The less visit from a country, the more suspicious
X_test['n_country_shared'] = X_test.country.map(X_test.country.value_counts(dropna=False))

X_test = X_test.drop(['device_id','ip_address','country'], axis=1)

# if the levels/values of the column in train and test data are pretty much the same, e.g. country, 
# then we should apply the above 3 X_train_mappings (generated from X_train) on X_test, 
# rather than using the new mapping generated from X_test
# the more a device is shared, the more suspicious
# X_test['n_dev_shared'] = X_test.device_id.map(X_train_device_id_mapping)

# the more a ip is shared, the more suspicious
# X_test['n_ip_shared'] = X_test.ip_address.map(X_train_ip_address_mapping)

# the less visit from a country, the more suspicious
# X_test['n_country_shared'] = X_test.country.map(X_train_country_mapping)

# but here device_id, ip_address has few overlap between train and test, if we apply the above 
# 3 X_train_mappings (generated from X_train) on X_test, then most of the X_test will be NaN

In [103]:
X_train.head(20)

Unnamed: 0,purchase_value,sex,age,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,n_dev_shared,n_ip_shared,n_country_shared
29343,12,1,42,3499664.0,183,67384,224,24648,1,0,0,1,0,0,0,0,1,1,3075
12190,10,1,29,6766039.0,5,78146,84,18585,1,0,0,0,0,0,1,0,1,1,42348
19388,34,1,53,5870515.0,197,81354,265,76669,0,1,0,1,0,0,0,0,1,1,16275
89104,48,1,29,2145618.0,160,30920,185,16538,1,0,0,1,0,0,0,0,1,1,2322
82082,44,1,24,7079059.0,111,71897,193,66156,1,0,0,0,1,0,0,0,1,1,8876
76812,56,1,25,7872819.0,102,78778,194,2797,1,0,0,1,0,0,0,0,1,1,42348
111006,67,1,43,7662881.0,143,68977,232,42258,1,0,0,0,1,0,0,0,1,1,16275
37929,29,0,25,1293152.0,69,70051,84,67203,0,0,1,0,0,0,0,1,1,1,42348
88089,20,1,18,7551233.0,225,22512,312,56945,0,0,1,1,0,0,0,0,1,1,42348
50851,14,1,28,6830027.0,188,26963,267,31390,1,0,0,0,1,0,0,0,1,1,42348


# Normalization or standardization


In [105]:
# Normalize (min-max) to [0,1], standardize(StandardScaler) to normal, mu=0,var = 1 can < 0, so we do normalize here
# Needs to be brought to the same scale for models like LR with regularization(that are not tree based)

# Compute the train minimum and maximum to be used for later scaling:
scaler = preprocessing.MinMaxScaler().fit(X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']]) 

# Transform the training data and use them for the model training
X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = scaler.transform(X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])

# Before the prediction of the test data, apply the same scaler obtained from above on X_test, not fitting a brandnew scaler on test
X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = scaler.transform(X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])

In [106]:
X_train.n_dev_shared.value_counts(dropna=False)

0.0    105427
0.2      4774
0.4       324
0.6       124
0.8        45
1.0         6
Name: n_dev_shared, dtype: int64

In [107]:
X_test.n_dev_shared.value_counts(dropna=False)

0.0    27330
0.2      334
0.4       12
Name: n_dev_shared, dtype: int64

# Simple LogisticRegression model

In [108]:
# Instantiate the model (using the default parameters)
logreg = LogisticRegression()

# Fit the model with data
logreg.fit(X_train,y_train)

# Predict on test
y_pred = logreg.predict(X_test)

In [109]:
# Get the confusion matrix on test data
cm = metrics.confusion_matrix(y_test, y_pred)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)

        pred_0  pred_1
true_0   27389       0
true_1     287       0


# Simple RF model

In [110]:
classifier_RF = RandomForestClassifier(random_state=0)

classifier_RF.fit(X_train, y_train)

# Predict class labels 0/1 for the test set
predicted = classifier_RF.predict(X_test)

# Generate class probabilities
probs = classifier_RF.predict_proba(X_test)

# Generate evaluation metrics
print("%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicted)))
print("%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
print("%s: %r" % ("f1_score is: ", f1_score(y_test, predicted )))#string to int

print("confusion_matrix is: ")
cm = confusion_matrix(y_test, predicted)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
print('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))

accuracy_score is: : 0.9948692007515537
roc_auc_score is: : 0.7548596215347777
f1_score is: : 0.6712962962962962
confusion_matrix is: 
        pred_0  pred_1
true_0   27389       0
true_1     142     145
recall = 0.5052264808362369
precision = 1.0


# SMOTE sampling

In [111]:
# Wrong way to sampling: smote before split train/test, then test data does not reflect true distribution in reality, 
# and “blend” information from the test set into the training of the model. overfit! think about the case of simple 
# oversampling (where I just duplicate observations). If I upsample a dataset before splitting it into a train and 
# validation set, I could end up with the same observation in both datasets

# Oversampling on only the training data, the right way! 
# Sampling_strategy: number of samples in the majority class over the number of samples in the minority class after resampling

smote = SMOTE(random_state=12)
x_train_sm, y_train_sm = smote.fit_sample(X_train, y_train)

unique, counts = np.unique(y_train_sm, return_counts=True)

print (np.asarray((unique, counts)).T)

[[     0 109572]
 [     1 109572]]


In [112]:
# RF on smoted training data
classifier_RF_sm = RandomForestClassifier(random_state=0)

classifier_RF_sm.fit(x_train_sm, y_train_sm)

# Predict class labels for the test set
predicted_sm = classifier_RF_sm.predict(X_test)

# Generate class probabilities
probs_sm = classifier_RF_sm.predict_proba(X_test)

# Generate evaluation metrics
print("%s: %r" % ("accuracy_score_sm is: ", accuracy_score(y_test, predicted_sm)))
print("%s: %r" % ("roc_auc_score_sm is: ", roc_auc_score(y_test, probs_sm[:, 1])))
print("%s: %r" % ("f1_score_sm is: ", f1_score(y_test, predicted_sm )))#string to int

print("confusion_matrix_sm is: ")
cm_sm = confusion_matrix(y_test, predicted_sm)
cmDF = pd.DataFrame(cm_sm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall or sens_sm =',float(cm_sm[1,1])/(cm_sm[1,0]+cm_sm[1,1]))
print('precision_sm =', float(cm_sm[1,1])/(cm_sm[1,1] + cm_sm[0,1]))

accuracy_score_sm is: : 0.9945440092498916
roc_auc_score_sm is: : 0.7533518568391924
f1_score_sm is: : 0.6544622425629291
confusion_matrix_sm is: 
        pred_0  pred_1
true_0   27382       7
true_1     144     143
recall or sens_sm = 0.49825783972125437
precision_sm = 0.9533333333333334


# Parameter tuning by GridSearchCV

## Eval metrics for GridSearchCV over all fits upon combination of parameters and cv

In [113]:
# Eval metrics to be calculated for each combination of parameters and cv
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score, pos_label=1)
}

In [114]:
def grid_search_wrapper(model, parameters, refit_score='f1_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization(refit on the best model according to refit_score)
    prints classifier performance metrics
    """

    grid_search = GridSearchCV(model, parameters, scoring=scorers, refit=refit_score,
                           cv=5, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Make the predictions
    y_pred = grid_search.predict(X_test)
    y_prob = grid_search.predict_proba(X_test)[:, 1]
    
    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # Confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    cm = confusion_matrix(y_test, y_pred)
    cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
    print(cmDF)
    
    print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, y_prob)))
    print("\t%s: %r" % ("f1_score is: ", f1_score(y_test, y_pred)))#string to int

    print('recall = ', float(cm[1,1]) / (cm[1,0] + cm[1,1]))
    print('precision = ', float(cm[1,1]) / (cm[1, 1] + cm[0,1]))

    return grid_search

## Optimizing on f1_score on LR

In [115]:
# C: inverse of regularization strength, smaller values specify stronger regularization
LRGrid = {"C" : np.logspace(-2,2,5), "penalty":["l1","l2"]} # l1 lasso l2 ridge
#param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
logRegModel = LogisticRegression(random_state=0)

grid_search_LR_f1 = grid_search_wrapper(logRegModel, LRGrid, refit_score='f1_score')

Best params for f1_score
{'C': 0.1, 'penalty': 'l1'}

Confusion matrix of Random Forest optimized for f1_score on the test data:
        pred_0  pred_1
true_0   27386       3
true_1     278       9
	roc_auc_score is: : 0.760104993955329
	f1_score is: : 0.06020066889632108
recall =  0.0313588850174216
precision =  0.75


## Optimizing on f1_score on RF

In [116]:
parameters = {        
'max_depth': [None, 5, 15],
'n_estimators' :  [10,150],
'class_weight' : [{0: 1, 1: w} for w in [0.2, 1, 100]]
}

clf = RandomForestClassifier(random_state=0)

In [117]:
grid_search_rf_f1 = grid_search_wrapper(clf, parameters, refit_score='f1_score')

Best params for f1_score
{'class_weight': {0: 1, 1: 0.2}, 'max_depth': None, 'n_estimators': 10}

Confusion matrix of Random Forest optimized for f1_score on the test data:
        pred_0  pred_1
true_0   27389       0
true_1     142     145
	roc_auc_score is: : 0.7537836281332202
	f1_score is: : 0.6712962962962962
recall =  0.5052264808362369
precision =  1.0


In [118]:
best_rf_model_f1 = grid_search_rf_f1.best_estimator_
best_rf_model_f1

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 0.2},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [120]:
results_f1 = pd.DataFrame(grid_search_rf_f1.cv_results_)
results_sortf1 = results_f1.sort_values(by='mean_test_f1_score', ascending=False)
results_sortf1[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_f1_score', 'mean_train_precision_score', 'mean_train_recall_score', 'mean_train_f1_score','param_max_depth', 'param_class_weight', 'param_n_estimators']].round(3).head()

Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_f1_score,mean_train_precision_score,mean_train_recall_score,mean_train_f1_score,param_max_depth,param_class_weight,param_n_estimators
0,1.0,0.527,0.689,1.0,0.856,0.922,,"{0: 1, 1: 0.2}",10
7,1.0,0.527,0.689,1.0,1.0,1.0,,"{0: 1, 1: 1}",150
13,1.0,0.527,0.689,1.0,1.0,1.0,,"{0: 1, 1: 100}",150
11,1.0,0.527,0.689,1.0,0.568,0.725,15.0,"{0: 1, 1: 1}",150
10,1.0,0.527,0.689,1.0,0.607,0.755,15.0,"{0: 1, 1: 1}",10


In [119]:
# Feature importance
pd.DataFrame(best_rf_model_f1.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

Unnamed: 0,importance
interval_after_signup,0.417489
purchase_days_of_year,0.125334
purchase_seconds_of_day,0.081799
signup_seconds_of_day,0.078478
n_dev_shared,0.072574
signup_days_of_year,0.054064
purchase_value,0.04365
age,0.039094
n_country_shared,0.024125
n_ip_shared,0.017022


## Optimizing  recall_score on RF

In [121]:
grid_search_rf_recall = grid_search_wrapper(clf, parameters, refit_score='recall_score')

Best params for recall_score
{'class_weight': {0: 1, 1: 100}, 'max_depth': 5, 'n_estimators': 150}

Confusion matrix of Random Forest optimized for recall_score on the test data:
        pred_0  pred_1
true_0   27146     243
true_1     132     155
	roc_auc_score is: : 0.7904661234456265
	f1_score is: : 0.4525547445255475
recall =  0.5400696864111498
precision =  0.38944723618090454


In [122]:
best_RF_model_recall = grid_search_rf_recall.best_estimator_
best_RF_model_recall

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [123]:
# Predict class labels for the test set
predictedBest_recall = best_RF_model_recall.predict(X_test)

# Generate class probabilities
probsBest_recall = best_RF_model_recall.predict_proba(X_test)

In [124]:
results_recall = pd.DataFrame(grid_search_rf_recall.cv_results_)
results_sortrecall = results_recall.sort_values(by='mean_test_recall_score', ascending=False)
results_sortrecall[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_f1_score', 'mean_train_precision_score', 'mean_train_recall_score', 'mean_train_f1_score','param_max_depth', 'param_class_weight', 'param_n_estimators']].round(3).head()
# Recall is improved but the precision decreases very much

Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_f1_score,mean_train_precision_score,mean_train_recall_score,mean_train_f1_score,param_max_depth,param_class_weight,param_n_estimators
15,0.152,0.645,0.246,0.156,0.656,0.252,5.0,"{0: 1, 1: 100}",150
14,0.168,0.616,0.264,0.178,0.644,0.279,5.0,"{0: 1, 1: 100}",10
16,0.694,0.528,0.595,0.793,0.782,0.786,15.0,"{0: 1, 1: 100}",10
17,0.876,0.527,0.654,0.93,0.783,0.85,15.0,"{0: 1, 1: 100}",150
1,1.0,0.527,0.689,1.0,1.0,1.0,,"{0: 1, 1: 0.2}",150


# Conclusion and recommendation

In [125]:
# Based on the above var importance
trainDF = pd.concat([X_train, y_train], axis=1)
pd.crosstab(trainDF["n_dev_shared"],trainDF["class"])
# The larger n_dev_shared, the higher rate of fraud

class,0,1
n_dev_shared,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,104966,461
0.2,4403,371
0.4,152,172
0.6,37,87
0.8,13,32
1.0,1,5


In [126]:
fraud_data.groupby("class")[['interval_after_signup']].mean() # action velocity(consecutive operations/actions of user)
# Interval_after_signup on frauds are significantly lower compared to legits

Unnamed: 0_level_0,interval_after_signup
class,Unnamed: 1_level_1
0,5191179.0
1,2570226.0


In [127]:
fraud_data.groupby("class")[['interval_after_signup']].median()
# More than half of fraud happened 1 second after signed up

Unnamed: 0_level_0,interval_after_signup
class,Unnamed: 1_level_1
0,5194911.0
1,1.0


In [128]:
fraud_data[fraud_data['class'] == 1].head(100)

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
136961,24,VLHGCDPFCICDA,SEO,Chrome,F,33,3.432126e+09,1,United States,3327952.0,218,80113,257,38465
136962,14,YLUQSRNYYIPXU,Ads,Chrome,M,40,3.905319e+09,1,,1.0,12,4207,12,4208
136963,63,ABUBCQDATQMQH,Ads,FireFox,F,46,5.505670e+08,1,United States,7640070.0,49,40723,137,77593
136964,34,QHEODGCAVJKIQ,SEO,Chrome,M,37,9.408096e+08,1,United States,1.0,12,77710,12,77711
136965,76,DAKVYHKIEYRBH,SEO,Chrome,F,48,6.361041e+08,1,Hungary,1.0,10,48421,10,48422
136966,32,ESANFBTIVMNHX,Ads,IE,M,30,3.875475e+09,1,,5721252.0,176,53824,242,72676
136967,95,HIAMXITLJWYCT,SEO,FireFox,M,42,3.786924e+09,1,,10160108.0,9,33511,126,84819
136968,13,BQTPLJBGYXQYX,Ads,IE,M,32,2.463262e+09,1,Austria,1.0,12,29576,12,29577
136969,15,BWSMVSLCJXMCM,Direct,IE,F,39,2.937899e+09,1,Japan,1.0,7,61065,7,61066
136970,26,HPPSDIRGUSSTB,Direct,Opera,M,31,6.471261e+08,1,United States,1.0,1,80617,1,80618


In [130]:
# how to use the prediction:
t = (10 * probsBest_recall[:, 1]).astype(int)
unique, counts = np.unique(t, return_counts=True)

print(np.asarray((unique, counts)).T)

# recommendation:
# green: 1 - 3 pass
# grey: 4 - 7 need manual investigation
# red: 8 - 9 decline

[[    1     1]
 [    2 24555]
 [    3  2623]
 [    4    99]
 [    5   177]
 [    6    76]
 [    7     1]
 [    8    20]
 [    9   124]]
