# IEEE-CIS Fraud Detection -- Model

In [84]:
import pandas as pd
import numpy as np
import pickle
import gc
import warnings
warnings.filterwarnings('ignore')

In [60]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: start from matplotlib's 'classic' style sheet, then apply
# seaborn's 'deep' colour palette and 'white' axes style.
plt.style.use('classic')
sns.set_palette('deep')
sns.set_style('white')
# Render figures inline in the notebook output.
%matplotlib inline

In [13]:
%%time
# Load the preprocessed table. Train and test rows live in the same file and are
# told apart by the `is_train` column further down.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the preview look like
# index columns written out by earlier to_csv calls -- TODO confirm and drop upstream.
df = pd.read_csv('./Data/processed_data.csv')

CPU times: user 50.3 s, sys: 20.8 s, total: 1min 11s
Wall time: 1min 17s


In [8]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0.1,Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,...,DeviceType,DeviceInfo,is_train,Date,YearMonth,Year,Month,Weekday,Day,Hour
0,0,2987000,0,86400,68.5,W,13926,,150.0,discover,...,,,1,2017-12-02 00:00:00,2017-12,2017,12,5,2,0
1,1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,...,,,1,2017-12-02 00:00:01,2017-12,2017,12,5,2,0
2,2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,...,,,1,2017-12-02 00:01:09,2017-12,2017,12,5,2,0
3,3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,...,,,1,2017-12-02 00:01:39,2017-12,2017,12,5,2,0
4,4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,...,mobile,SAMSUNG SM-G892A Build/NRD90M,1,2017-12-02 00:01:46,2017-12,2017,12,5,2,0


In [7]:
# Do not delete `df` here: the feature-engineering cells below still read it,
# so removing it breaks a top-to-bottom (Restart & Run All) execution.
# Only ask the garbage collector to reclaim unreachable objects.
gc.collect()

20

## Feature Engineering

### Create feature: Card ID
- A credit or debit card number has around 16 digits.
- Among the card-identifying features, the combination of [*card1*, *card2*, *card3* and *card5*] gives a number with around 14-16 digits, so we may use this combination as a card ID.

In [14]:
def corret_card_id(x):
    """Clean a space-separated card ID string.

    Each space-separated token is handled on its own:

    * a trailing ``.0`` (left over from float-to-string conversion) is removed;
    * a token that is exactly ``-999`` after that step is replaced by ``nan``.

    Working per token avoids corrupting values that merely contain these
    substrings (e.g. ``10.05`` or ``-9995``), which a plain ``str.replace``
    over the whole string would do.

    Parameters
    ----------
    x : str
        Card ID made of tokens joined by single spaces.

    Returns
    -------
    str
        The cleaned card ID, with the same number of tokens and separators.
    """
    cleaned_tokens = []
    for token in x.split(' '):
        if token.endswith('.0'):
            token = token[:-2]
        if token == '-999':
            token = 'nan'
        cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)

def definie_indexes(df):
    """Add a cleaned ``Card_ID`` column and return the frame sorted by card and date.

    ``Card_ID`` is the string form of card1, card2, card3 and card5 joined by
    single spaces and then passed through ``corret_card_id``.

    The column is written to ``df`` itself, so a caller that ignores the return
    value still ends up with the cleaned IDs (in the original row order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``card1``, ``card2``, ``card3``, ``card5``
        and ``Date``. Modified in place: gains/overwrites ``Card_ID``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df``, sorted ascending by
        ``Card_ID`` and then by ``Date``.
    """
    cards_cols = ['card1', 'card2', 'card3', 'card5']

    # Build the ID from the first column and append the rest by position,
    # instead of deciding which column is "first" from its name.
    card_id = df[cards_cols[0]].map(str)
    for card in cards_cols[1:]:
        card_id = card_id + ' ' + df[card].map(str)

    # Clean before attaching and before sorting, so that both the caller's
    # frame and the sort key use the final Card_ID values.
    df['Card_ID'] = card_id.apply(corret_card_id)

    # sort_values returns a new frame; `df` keeps its row order.
    return df.sort_values(['Card_ID', 'Date'], ascending=[True, True])

In [15]:
%%capture
# Called for its side effect: the function writes the Card_ID column into `df`.
# The sorted frame it returns is not kept here.
definie_indexes(df)

In [24]:
# Show the new Card_ID next to a handful of descriptive columns.
preview_cols = ['Card_ID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD',
                'P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo']
df[preview_cols].head()

Unnamed: 0,Card_ID,isFraud,TransactionDT,TransactionAmt,ProductCD,P_emaildomain,R_emaildomain,DeviceType,DeviceInfo
0,13926 nan 150 142,0,86400,68.5,W,,,,
1,2755 404 150 102,0,86401,29.0,W,gmail.com,,,
2,4663 490 150 166,0,86469,59.0,W,outlook.com,,,
3,18132 567 150 117,0,86499,50.0,W,yahoo.com,,,
4,4497 514 150 102,0,86506,50.0,H,gmail.com,,mobile,SAMSUNG SM-G892A Build/NRD90M


In [47]:
# Card IDs that occur in more than 10000 transactions (counts computed once).
card_id_counts = df['Card_ID'].value_counts()
card_id_counts[card_id_counts.values > 10000]

9500 321 150 226     26121
15885 545 185 138    22580
7919 194 150 166     21866
17188 321 150 226    19499
15066 170 150 102    14542
6019 583 150 226     13212
12695 490 150 226    12690
12544 321 150 226    12658
2803 100 150 226     10989
7585 553 150 226     10063
Name: Card_ID, dtype: int64

In [43]:
# Number of distinct values per column among the rows of the most frequent Card_ID.
top_card_rows = df['Card_ID'] == '9500 321 150 226'
profile_cols = ['isFraud', 'ProductCD', 'P_emaildomain', 'R_emaildomain',
                'card4', 'card6', 'DeviceType', 'DeviceInfo']
df.loc[top_card_rows, profile_cols].nunique()

isFraud            3
ProductCD          5
P_emaildomain     43
R_emaildomain     25
card4              1
card6              1
DeviceType         2
DeviceInfo       115
dtype: int64

- There are 10 cards with more than 10,000 transaction records each.
- For the *Card_ID* with the greatest number of transactions, *P_emaildomain* has 43 different values and *DeviceInfo* has 115 unique values, which is really weird for a single card.
- Let's see whether *Card_ID* is valid.

### Drop columns
- Drop columns in which 90% or more of the values are missing (in either the train or the test split).
- Drop columns that have only one distinct value. Such columns contain no information for detecting fraud.

In [98]:
# Split the combined table back into its train and test parts via the is_train flag.
train = df.loc[df['is_train'].eq(1)]
test = df.loc[df['is_train'].eq(0)]

In [99]:
# Separate the isFraud target from the remaining feature columns for each split.
train_target, test_target = train['isFraud'], test['isFraud']
train_features = train.drop(columns='isFraud')
test_features = test.drop(columns='isFraud')

In [100]:
def _null_heavy_columns(frame, min_null_share=0.9):
    """Return the names of columns whose share of null values is at least ``min_null_share``."""
    n_rows = frame.shape[0]
    return [name for name in frame.columns
            if frame[name].isnull().sum() / n_rows >= min_null_share]


train_null_cols = _null_heavy_columns(train_features)
test_null_cols = _null_heavy_columns(test_features)

In [101]:
# Columns holding at most one distinct (non-null) value in each split.
train_distinct_counts = train_features.nunique()
test_distinct_counts = test_features.nunique()
train_one_value_cols = train_distinct_counts[train_distinct_counts <= 1].index.tolist()
test_one_value_cols = test_distinct_counts[test_distinct_counts <= 1].index.tolist()

In [102]:
# Union of every column flagged in either split. Sorted so that the list (and
# anything printed from it) comes out in the same order on every run; a bare
# set has no guaranteed iteration order.
cols_to_drop = sorted(set(train_null_cols + test_null_cols +
                          train_one_value_cols + test_one_value_cols))

In [103]:
# Report how many columns will be removed and which ones.
n_cols_to_drop = len(cols_to_drop)
print('Number of columns that are to drop: ', n_cols_to_drop)
print('Columns names: ', cols_to_drop)

Number of columns that are to drop:  15
Columns names:  ['dist2', 'id_24', 'id_22', 'D7', 'id_25', 'id_27', 'id_18', 'V107', 'Year', 'id_08', 'id_23', 'id_07', 'is_train', 'id_26', 'id_21']


In [104]:
# Remove the flagged columns from both splits so they keep identical schemas.
train_features = train_features.drop(columns=cols_to_drop)
test_features = test_features.drop(columns=cols_to_drop)

In [105]:
# (rows, columns) of the training features after the drop step.
train_features.shape

(590540, 428)

In [106]:
# (rows, columns) of the test features after the drop step.
test_features.shape

(506691, 428)

### Encode categorical features

In [107]:
# Columns to be label-encoded below.
# Some names listed here (e.g. 'id_18') were already removed in the drop step;
# the encoding loop skips any name that is no longer a column.
cat_cols = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
            'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
            'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 
            'DeviceType', 'DeviceInfo', 'Card_ID']

In [108]:
from sklearn import preprocessing

In [109]:
# Replace each remaining categorical column with integer codes.
for col in cat_cols:
    if col not in train_features.columns:
        continue  # column was dropped earlier
    # Cast to str first so missing values become the literal 'nan' label.
    train_labels = train_features[col].astype(str).tolist()
    test_labels = test_features[col].astype(str).tolist()
    le = preprocessing.LabelEncoder()
    # Fit on train and test together so transform never meets an unseen label.
    le.fit(train_labels + test_labels)
    train_features[col] = le.transform(train_labels)
    test_features[col] = le.transform(test_labels)

### Drop highly correlated features

In [None]:
# Absolute pairwise correlations over the rows of `train` that have a label.
# `train` still contains string columns (e.g. ProductCD, card4, Date), and
# DataFrame.corr() raises on those in pandas >= 2.0, so restrict the frame to
# numeric/bool columns first. Older pandas silently ignored the non-numeric
# columns, so the resulting matrix is the same there.
corr_matrix = (
    train[train['isFraud'].notnull()]
    .select_dtypes(include=['number', 'bool'])
    .corr()
    .abs()
)

## Model

## Reference
- https://www.kaggle.com/smerllo/identify-unique-cards-id