<a href="https://colab.research.google.com/github/yoichinaka/card-fraud-detection-by-logistic-regression/blob/master/week3_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import time
import datetime
import json
import gc
from numba import jit

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

import lightgbm as lgb
import xgboost as xgb
#from catboost import CatBoostRegressor, CatBoostClassifier

from sklearn import metrics
from sklearn import preprocessing
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report

from itertools import product

import altair as alt
from altair.vega import v5
from IPython.display import HTML

In [0]:
def normalize(X):
    """
    Make the distribution of the values of each variable similar by subtracting the mean and by dividing by the standard deviation.
    """
    for feature in X.columns:
        X[feature] -= X[feature].mean()
        X[feature] /= X[feature].std()
    return X

In [0]:
train_identity = pd.read_csv('train_identity.csv')
train_transaction = pd.read_csv('train_transaction.csv')
test_identity = pd.read_csv('test_identity.csv')
test_transaction = pd.read_csv('test_transaction.csv')
#sub = pd.read_csv('sample_submission.csv')
# let's combine the data and work with the whole dataset
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [0]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']
X_test = test.sort_values('TransactionDT').drop(['TransactionDT', 'TransactionID'], axis=1)

In [119]:
X.shape, X_test.shape, y.shape, 

((63567, 431), (63518, 431), (63567,))

In [120]:
list(y ==1).count(True)

1695

In [121]:
print('percent of fraud is', 620/21973*100)

percent of fraud is 2.821644745824421


In [122]:
X.head()

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,...,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,...,0.0,70787.0,,,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [0]:
cols_to_drop = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'R_emaildomain'
           ]
X_num = X.drop(cols_to_drop, axis=1)
X_num_test = X_test.drop(cols_to_drop, axis=1)

In [124]:
X_num.head()

Unnamed: 0,TransactionAmt,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,V3,V4,V5,V6,V7,V8,...,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11
0,68.5,19.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,29.0,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,59.0,287.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,50.0,,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,,111.0,,,,,,,,,...,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,50.0,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,,,,,,100.0


In [0]:
X_num = normalize(X_num)
X_nu_test = normalize(X_num_test)

In [128]:
many_null_cols = [col for col in X_num.columns if X_num[col].isnull().sum() / X_num.shape[0] > 0.9]
many_null_cols_test = [col for col in X_num_test.columns if X_num_test[col].isnull().sum() / X_num_test.shape[0] > 0.9]
cols_to_drop = list(set(many_null_cols + many_null_cols_test ))
len(cols_to_drop)

60

In [0]:
# by https://www.kaggle.com/dimartinot
def clean_inf_nan(df):
    return df.replace([np.inf, -np.inf], np.nan)   

# Cleaning infinite values to NaN
X_num = clean_inf_nan(X_num)
X_num_test = clean_inf_nan(X_num_test )

X_num = X_num.fillna(X.mean())
X_num_test = X_num_test.fillna(X_test.mean())

In [130]:
X_num.head()

Unnamed: 0,TransactionAmt,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,V3,V4,V5,V6,V7,V8,...,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11
0,-0.287041,-0.278873,254.333784,-0.144249,-0.145976,-0.10459,-0.136524,-0.204559,-0.154359,-0.129302,-0.132556,-0.180274,-0.136031,-0.143781,-0.128988,-0.214748,-0.167974,-0.468336,163.305117,-0.257662,130.168206,39.935003,68.033799,57.212308,177.387468,0.564152,-0.650617,-0.683777,53.677414,19.373049,44.036594,-0.851974,0.016024,-0.143511,-0.200751,0.423374,0.325906,-0.152142,-0.205584,-0.111717,...,-0.072899,-0.258397,-0.196443,-0.227488,-0.21008,-0.119129,0.008827,-0.096938,-0.027767,-0.050653,-0.034922,0.252224,0.520999,0.34411,0.031736,0.233954,0.095504,0.10102,0.139222,0.108494,39.261196,60.499947,46.66109,4.510322,13.783121,7.756045,15.720741,20.625059,16.931501,-8.111852,132234.346664,0.108756,-0.069513,1.967759,-5.907565,13.373099,-40.881871,0.146,-0.369974,99.674775
1,-0.472918,125.054179,254.333784,-0.144249,-0.145976,-0.10459,-0.136524,-0.204559,-0.154359,-0.129302,-0.132556,-0.248806,-0.136031,-0.147576,-0.128988,-0.214748,-0.167974,-0.569715,163.305117,28.251621,-0.770877,39.935003,68.033799,57.212308,177.387468,0.564152,-0.72981,118.890178,53.677414,19.373049,44.036594,-0.851974,0.999743,1.024131,1.050162,0.815577,0.847102,1.028084,1.053037,1.015095,...,-0.072899,-0.258397,-0.196443,-0.227488,-0.21008,-0.119129,-0.073512,-0.096938,-0.027767,-0.050653,-0.034922,0.252224,0.520999,0.34411,0.031736,0.233954,0.095504,0.10102,0.139222,0.108494,39.261196,60.499947,46.66109,4.510322,13.783121,7.756045,15.720741,20.625059,16.931501,-8.111852,132234.346664,0.108756,-0.069513,1.967759,-5.907565,13.373099,-40.881871,0.146,-0.369974,99.674775
2,-0.331746,0.425843,254.333784,-0.144249,-0.145976,-0.10459,-0.136524,-0.204559,-0.154359,-0.129302,-0.132556,-0.180274,-0.136031,-0.147576,-0.128988,-0.214748,-0.167974,-0.569715,163.305117,28.251621,-0.770877,39.935003,68.033799,57.212308,177.387468,0.564152,-0.72981,1.266362,53.677414,19.373049,44.036594,0.90293,0.016024,-0.143511,-0.200751,0.423374,0.325906,-0.152142,-0.205584,-0.111717,...,-0.072899,-0.258397,-0.196443,-0.227488,-0.21008,-0.119129,-0.073512,-0.096938,-0.027767,-0.050653,-0.034922,0.252224,0.520999,0.34411,0.031736,0.233954,0.095504,0.10102,0.139222,0.108494,39.261196,60.499947,46.66109,4.510322,13.783121,7.756045,15.720741,20.625059,16.931501,-8.111852,132234.346664,0.108756,-0.069513,1.967759,-5.907565,13.373099,-40.881871,0.146,-0.369974,99.674775
3,-0.374097,125.054179,254.333784,-0.141606,-0.136924,-0.10459,-0.136524,-0.204559,-0.138479,-0.129302,-0.132556,-0.180274,-0.136031,-0.147576,-0.128988,-0.123887,-0.167974,0.241317,-0.321813,-0.477284,-0.214194,-0.502187,68.033799,57.212308,177.387468,0.564152,-0.2181,118.890178,53.677414,19.373049,44.036594,-0.233579,0.999743,1.024131,1.050162,0.815577,0.847102,1.028084,1.053037,1.015095,...,-0.072899,0.852952,-0.196443,-0.227488,-0.21008,0.117812,0.914559,1.333712,-0.027767,-0.050653,-0.034922,0.252224,0.520999,0.34411,0.031736,0.233954,0.095504,0.10102,0.139222,0.108494,39.261196,60.499947,46.66109,4.510322,13.783121,7.756045,15.720741,20.625059,16.931501,-8.111852,132234.346664,0.108756,-0.069513,1.967759,-5.907565,13.373099,-40.881871,0.146,-0.369974,99.674775
4,-0.374097,125.054179,254.333784,-0.144249,-0.145976,-0.10459,-0.136524,-0.204559,-0.154359,-0.129302,-0.129036,-0.248806,-0.132528,-0.147576,-0.128988,-0.214748,-0.167974,-0.569715,163.305117,28.251621,130.168206,39.935003,68.033799,57.212308,177.387468,0.564152,119.802343,118.890178,53.677414,19.373049,44.036594,152.926747,0.999743,1.024131,1.050162,0.815577,0.847102,1.028084,1.053037,1.015095,...,-0.072899,-0.258397,-0.196443,-0.227488,-0.21008,-0.119129,-0.073512,-0.096938,-0.027767,-0.050653,-0.034922,-0.29326,-0.213223,-0.279995,-0.15292,-0.128641,-0.148012,-0.228911,-0.189255,-0.237271,-0.203105,-0.149557,-0.209373,-0.083834,-0.13622,-0.121496,-0.165651,-0.159906,-0.166601,0.591799,-0.509862,0.108756,-0.069513,1.967759,-5.907565,13.373099,-40.881871,0.146,-0.369974,0.259719


## Split X data to train and test

In [131]:
# Define the model
model = LogisticRegression()

# Define the splitter for splitting the data in a train set and a test set
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)

# Loop through the splits (only one)
for train_indices, test_indices in splitter.split(X_num, y):
    # Select the train and test data
    X_train, y_train = X_num.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X_num.iloc[test_indices], y.iloc[test_indices]
    
    # Fit and predict!
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # And finally: show the results
    print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30936
           1       0.85      0.09      0.16       848

    accuracy                           0.98     31784
   macro avg       0.91      0.54      0.57     31784
weighted avg       0.97      0.98      0.97     31784

