In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Load the modelling table into `data`.
# NOTE(review): the file name suggests the variables are already z-scaled — confirm against the upstream notebook.
data = pd.read_csv("vars_final_zscale.csv")

In [7]:
# Preview the first rows to confirm the expected columns were loaded.
data.head()

Unnamed: 0,record,fraud_label,Days_fulladdress,fulladdress_30_count,fulladdress_7_count,ssn_30_count,Days_firstname_ssn,Days_fulladdress_homephone,fulladdresshomephone_30_count,ssnnamedob_30_count,ssn_14_count,fulladdresshomephone_14_count,namedob_14_count,firstnamessn_14_count,ssnnamedob_14_count,lastnamessn_14_count,fulladdresshomephone_7_count,homephone_14_count,homephone_1_count
0,19277,1,-3.230865,4.607631,5.073931,-0.099762,0.39406,-3.464852,5.758508,-0.097878,-0.072852,6.095677,-0.073782,-0.071503,-0.071056,-0.071466,6.416826,1.265044,-0.321508
1,19278,0,0.407271,-0.102765,-0.067384,-0.099762,0.39406,0.379935,-0.095718,-0.097878,-0.072852,-0.070587,-0.073782,-0.071503,-0.071056,-0.071466,-0.057622,2.981941,1.269795
2,19279,0,0.407271,-0.102765,-0.067384,-0.099762,0.39406,0.379935,-0.095718,-0.097878,-0.072852,-0.070587,-0.073782,-0.071503,-0.071056,-0.071466,-0.057622,-1.024153,-0.321508
3,19280,0,0.407271,-0.102765,-0.067384,-0.099762,0.39406,0.379935,-0.095718,-0.097878,-0.072852,-0.070587,-0.073782,-0.071503,-0.071056,-0.071466,-0.057622,-0.451854,-0.321508
4,19281,0,0.407271,-0.102765,-0.067384,-0.099762,0.39406,0.379935,-0.095718,-0.097878,-0.072852,-0.070587,-0.073782,-0.071503,-0.071056,-0.071466,-0.057622,-1.024153,-0.321508


In [8]:
# (rows, columns) of the full table before any splitting.
data.shape

(980724, 19)

In [9]:
# Split by record number: records above the cut-off form the out-of-time (OOT)
# hold-out (dated after 16/11/1); everything else is the train/test pool.
oot_cutoff_record = 833508
record_no = data['record']
oot_df = data[record_no > oot_cutoff_record]
trte_df = data[record_no <= oot_cutoff_record]

In [10]:
# Size of the train/test pool (records up to the cut-off).
trte_df.shape

(814232, 19)

In [11]:
# Size of the out-of-time hold-out (records after the cut-off).
oot_df.shape

(166492, 19)

In [12]:
# The record number was only needed for the split; drop it so that column 0 is
# `fraud_label` and the remaining columns are the candidate variables (the
# modelling code below selects features by position).
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because 'record' has already been removed.
oot_df = oot_df.drop(columns=['record'], errors='ignore')
trte_df = trte_df.drop(columns=['record'], errors='ignore')

In [13]:
def _fdr_at_cutoff(labels, scores, cutoff):
    '''Share of all frauds in `labels` found among the top `cutoff` fraction of records ranked by `scores`.'''
    scored = pd.DataFrame({"fraud_label": labels.values, "score": scores})
    top_rows = int(round(len(scored) * cutoff))
    top = scored.sort_values(by="score", ascending=False).head(top_rows)
    total_fraud = scored["fraud_label"].sum()
    if not total_fraud:
        # No fraud in this set: the rate is undefined rather than 0 or an error.
        return float("nan")
    return top["fraud_label"].sum() / total_fraud


def multipltrun(a=5,v=6,md=5,mf=5,ne=25,cutoff=0.03):
    '''
    Fit a random forest on several random train/test splits and report the
    fraud detection rate (FDR) on the training, testing and out-of-time sets.

    Reads the notebook-level frames ``trte_df`` (train/test pool) and ``oot_df``
    (out-of-time hold-out). Labels are read from the ``fraud_label`` column and
    features are taken by position (columns 1..v), so both frames are expected
    to hold ``fraud_label`` in column 0 followed by the candidate variables.

    Parameters
    ----------
    a : int, default 5
        Number of random train/test splits; split ``i`` uses ``random_state=i``.
    v : int, default 6
        Number of variables to use (the default of 6 are the most important
        ones from backward selection).
    md : int, default 5
        ``max_depth`` passed to the forest.
    mf : int, default 5
        ``min_samples_leaf`` passed to the forest.
    ne : int, default 25
        ``n_estimators`` passed to the forest.
    cutoff : float, default 0.03
        Fraction (0 < cutoff <= 1) of the highest-scored records kept when
        computing FDR.

    Returns
    -------
    pandas.DataFrame
        One row per split with columns ``train``, ``test`` and ``oot`` holding
        the FDR at ``cutoff``, plus a final row labelled ``'mean'``.

    Notes
    -----
    FDR is computed by sorting records by predicted fraud probability in
    descending order, cutting off at ``cutoff`` and dividing the number of
    fraud records above the cut by the total number of fraud records in that
    set. To try another algorithm, change the estimator construction, the
    ``fit`` call and the ``predict_proba`` call.
    '''
    if not 0 < cutoff <= 1:
        raise ValueError("cutoff must be in the interval (0, 1]")

    # The out-of-time set does not depend on the split, so slice it only once.
    oot_lab = oot_df["fraud_label"]
    oot_fea = oot_df.iloc[:, 1:v+1]

    FDRdict = {"train": [], "test": [], "oot": []}

    for i in range(a):
        # A different seed per iteration gives a different 80/20 split each time.
        train, test = train_test_split(trte_df, test_size=0.2, random_state=i)

        train_lab = train["fraud_label"]
        train_fea = train.iloc[:, 1:v+1]

        # The estimator seed is fixed, so differences between rows of the
        # result come from the split only.
        model = RandomForestClassifier(n_estimators=ne, max_depth=md,
                                       min_samples_leaf=mf, random_state=42)
        model.fit(train_fea, train_lab)

        # Explicit mapping instead of looking names up through vars().
        eval_sets = {
            "train": (train_fea, train_lab),
            "test": (test.iloc[:, 1:v+1], test["fraud_label"]),
            "oot": (oot_fea, oot_lab),
        }
        for set_name, (fea, lab) in eval_sets.items():
            # predict_proba columns follow model.classes_ (sorted labels), so
            # with 0/1 labels column 1 is the fraud probability.
            fraud_prob = model.predict_proba(fea)[:, 1]
            FDRdict[set_name].append(_fdr_at_cutoff(lab, fraud_prob, cutoff))

    FDR_df = pd.DataFrame(FDRdict)

    # Append the column means as an extra row labelled 'mean'.
    FDR_df.loc['mean'] = FDR_df.mean()

    return FDR_df

In [14]:
# Baseline run with every default: 5 splits, 6 variables, max_depth=5, min_samples_leaf=5, 25 trees.
multipltrun()

Unnamed: 0,train,test,oot
0,0.519212,0.51071,0.491618
1,0.516776,0.529686,0.494552
2,0.5163,0.512929,0.492456
3,0.518262,0.516562,0.493294
4,0.517846,0.519898,0.496647
mean,0.517679,0.517957,0.493713


In [None]:
########## try different numbers of variables

In [16]:
# Same forest settings as the defaults, but with v=17 variables instead of 6.
multipltrun(a=5,v=17,md=5,mf=5,ne=25)

Unnamed: 0,train,test,oot
0,0.550506,0.542209,0.523051
1,0.551179,0.563613,0.528919
2,0.552087,0.557016,0.528919
3,0.553083,0.551363,0.528919
4,0.546739,0.549958,0.521375
mean,0.550718,0.552832,0.526236


In [17]:
# v=15 variables; forest settings unchanged (max_depth=5, min_samples_leaf=5, 25 trees).
multipltrun(a=5,v=15,md=5,mf=5,ne=25)

Unnamed: 0,train,test,oot
0,0.519532,0.51155,0.489941
1,0.51391,0.526293,0.491199
2,0.516725,0.516744,0.492037
3,0.516239,0.515723,0.491618
4,0.517102,0.514818,0.492875
mean,0.516701,0.517026,0.491534


In [18]:
# Back to v=17 variables, with the forest grown from 25 to 100 trees.
multipltrun(a=5,v=17,md=5,mf=5,ne=100)

Unnamed: 0,train,test,oot
0,0.555615,0.550189,0.530176
1,0.551391,0.562341,0.529757
2,0.551025,0.553201,0.529338
3,0.553509,0.551782,0.528919
4,0.554918,0.555038,0.530176
mean,0.553291,0.55451,0.529673


In [19]:
%%time 
# Deeper trees: max_depth=7 instead of 5 (v=17, min_samples_leaf=5, 100 trees).
multipltrun(a=5,v=17,md=7,mf=5,ne=100)

Wall time: 2min 46s


Unnamed: 0,train,test,oot
0,0.559021,0.556069,0.533529
1,0.553621,0.563189,0.531852
2,0.554848,0.559135,0.530595
3,0.557342,0.553459,0.532691
4,0.554918,0.555038,0.530176
mean,0.55595,0.557378,0.531769


In [20]:
%%time 
# max_depth=10 (v=17, min_samples_leaf=5, 100 trees).
multipltrun(a=5,v=17,md=10,mf=5,ne=100)

Wall time: 3min 22s


Unnamed: 0,train,test,oot
0,0.56115,0.554809,0.537301
1,0.568167,0.570823,0.545683
2,0.568971,0.568037,0.54694
3,0.566287,0.559748,0.544426
4,0.562354,0.556732,0.539816
mean,0.565386,0.56203,0.542833


In [21]:
%%time 
# Same depth (max_depth=10) with min_samples_leaf raised from 5 to 7.
multipltrun(a=5,v=17,md=10,mf=7,ne=100)

Wall time: 3min 18s


Unnamed: 0,train,test,oot
0,0.562533,0.556909,0.538558
1,0.566893,0.571671,0.545683
2,0.569714,0.565918,0.550712
3,0.564903,0.554717,0.539816
4,0.563735,0.557155,0.539816
mean,0.565556,0.561274,0.542917


In [22]:
%%time 
# max_depth=15 with min_samples_leaf=7 (v=17, 100 trees).
multipltrun(a=5,v=17,md=15,mf=7,ne=100)     # take this as the best model setting

Wall time: 4min 16s


Unnamed: 0,train,test,oot
0,0.574029,0.568249,0.553227
1,0.573264,0.577184,0.552389
2,0.573643,0.570157,0.552389
3,0.575551,0.566038,0.552808
4,0.573189,0.568586,0.553227
mean,0.573935,0.570043,0.552808


In [24]:
%%time 
# 10 splits instead of 5, with min_samples_leaf=8 (v=17, max_depth=15, 100 trees).
multipltrun(a=10,v=17,md=15,mf=8,ne=100)

Wall time: 8min 33s


Unnamed: 0,train,test,oot
0,0.573816,0.567409,0.55197
1,0.572627,0.577608,0.551132
2,0.57375,0.571005,0.55197
3,0.575551,0.565618,0.554484
4,0.57287,0.567739,0.551132
5,0.572196,0.578234,0.552808
6,0.574576,0.568921,0.55197
7,0.579172,0.55097,0.552389
8,0.572429,0.577561,0.55197
9,0.573102,0.576101,0.550712


In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_tree

In [27]:
# import necessary packages
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.base import clone
from sklearn.datasets import make_classification


In [28]:
# Final random forest: 100 trees, max_depth=15, min_samples_leaf=8.
# This only constructs the estimator; it still has to be fitted before use.
# random_state is pinned so the result is reproducible, matching the seed used
# for the estimator inside multipltrun.
model=RandomForestClassifier(n_estimators = 100, max_depth = 15, min_samples_leaf = 8, random_state = 42)

In [None]:
# ROC curve of the final random forest on a held-out test split.
# `metrics` was never imported in this notebook, so import the two functions directly.
from sklearn.metrics import roc_auc_score, roc_curve

# The train/test frames used inside multipltrun are local to that function and
# `model` has only been constructed, so build one split here and fit before scoring.
N_VARS = 17  # number of variables (columns 1..N_VARS of trte_df; column 0 is fraud_label)
train, test = train_test_split(trte_df, test_size=0.2, random_state=0)
train_fea, train_lab = train.iloc[:, 1:N_VARS+1], train["fraud_label"]
test_fea, test_lab = test.iloc[:, 1:N_VARS+1], test["fraud_label"]
model.fit(train_fea, train_lab)

# Column 1 of predict_proba is the probability of the fraud class (label 1).
y_pred_proba = model.predict_proba(test_fea)[:, 1]
fpr, tpr, _ = roc_curve(test_lab, y_pred_proba)
auc = round(roc_auc_score(test_lab, y_pred_proba), 3)
plt.plot(fpr, tpr, label="ROC curve, AUC="+str(auc))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
# The title is derived from N_VARS so it cannot disagree with the features actually used.
plt.title('Receiver operating characteristic (ROC) for Random Forest (V=%d)' % N_VARS)
plt.legend(loc="lower right")
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.8)

plt.show()
