# Reading the dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_identity = pd.read_csv('train_identity.csv')
train = pd.read_csv('train_transaction.csv')
test = pd.read_csv('test_transaction.csv')

target = train.pop('isFraud')

In [3]:
!ls

ieee-fraud-detection.zip
sample_submission.csv
submissions
test_identity.csv
test_transaction.csv
train_identity.csv
train_transaction.csv
Untitled.ipynb


# EDA

In [4]:
target.value_counts()

0    569877
1     20663
Name: isFraud, dtype: int64

In [5]:
print(train.shape)
print(train_identity.shape)

(590540, 393)
(144233, 41)


In [4]:
list(train_identity.columns)

['TransactionID',
 'id_01',
 'id_02',
 'id_03',
 'id_04',
 'id_05',
 'id_06',
 'id_07',
 'id_08',
 'id_09',
 'id_10',
 'id_11',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']

In [5]:
train.isnull().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
card1                  0
card2               8933
card3               1565
card4               1577
card5               4259
card6               1571
addr1              65706
addr2              65706
dist1             352271
dist2             552913
P_emaildomain      94456
R_emaildomain     453249
C1                     0
C2                     0
C3                     0
C4                     0
C5                     0
C6                     0
C7                     0
C8                     0
C9                     0
C10                    0
C11                    0
C12                    0
C13                    0
                   ...  
V310                  12
V311                  12
V312                  12
V313                1269
V314                1269
V315                1269
V316                  12
V317                  12
V318                  12


In [9]:
print(len(train.columns))
print(len(test.columns))

394
393


In [4]:
list(train.columns)

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V

## Removing categorical columns

In [12]:
cat_col = ['ProductCD', 'P_emaildomain', 'R_emaildomain', 'addr1', 'addr2']
cat_col.extend(['card' + str(i) for i in range(1, 7)])
cat_col.extend(['M' + str(i) for i in range(1, 10)])

In [13]:
train = train[[col for col in train.columns if col not in set(cat_col)]]
test = test[[col for col in test.columns if col not in set(cat_col)]]

In [14]:
txn_id = train.pop('TransactionID')

In [16]:
train = train.fillna(0)

# Training the models

### RanomForest

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
classifier = RandomForestClassifier()

In [21]:
train = train.fillna(0)

In [22]:
classifier.fit(train, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# Submitting predictions

In [23]:
testids = test.pop('TransactionID')

test = test.fillna(0)

preds = classifier.predict(test)

In [26]:
sample = pd.read_csv('sample_submission.csv')

In [27]:
sample.head(2)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5


In [28]:
submission = pd.DataFrame({'TransactionID': testids, 'isFraud': preds})

submission.to_csv('first_submission.csv', index=None)

# LightGBM

#### Splitting validation set

In [20]:
from sklearn.model_selection import train_test_split

In [27]:
trainx, testx, trainy, testy = train_test_split(train, target, train_size=0.8, random_state=1, stratify=target)

In [6]:
import lightgbm as lgb

In [24]:
dataset = lgb.Dataset(trainx, label=trainy)

In [30]:
validation = lgb.Dataset(testx, label=testy, reference=dataset)

In [87]:
param = {'num_leaves': 50, 'objective': 'binary'}
param['metric'] = 'auc'

In [88]:
classifier = lgb.train(param, train_set=dataset, num_boost_round=1000, valid_sets=[validation])

[1]	valid_0's auc: 0.810389
[2]	valid_0's auc: 0.848919
[3]	valid_0's auc: 0.857641
[4]	valid_0's auc: 0.863629
[5]	valid_0's auc: 0.866604
[6]	valid_0's auc: 0.869362
[7]	valid_0's auc: 0.871716
[8]	valid_0's auc: 0.873916
[9]	valid_0's auc: 0.876742
[10]	valid_0's auc: 0.878031
[11]	valid_0's auc: 0.879705
[12]	valid_0's auc: 0.880919
[13]	valid_0's auc: 0.882434
[14]	valid_0's auc: 0.884813
[15]	valid_0's auc: 0.886144
[16]	valid_0's auc: 0.888281
[17]	valid_0's auc: 0.891005
[18]	valid_0's auc: 0.892475
[19]	valid_0's auc: 0.893871
[20]	valid_0's auc: 0.894509
[21]	valid_0's auc: 0.895686
[22]	valid_0's auc: 0.896125
[23]	valid_0's auc: 0.897275
[24]	valid_0's auc: 0.898012
[25]	valid_0's auc: 0.898378
[26]	valid_0's auc: 0.898887
[27]	valid_0's auc: 0.90042
[28]	valid_0's auc: 0.901011
[29]	valid_0's auc: 0.902496
[30]	valid_0's auc: 0.90304
[31]	valid_0's auc: 0.90341
[32]	valid_0's auc: 0.903903
[33]	valid_0's auc: 0.904563
[34]	valid_0's auc: 0.905621
[35]	valid_0's auc: 0.9063

[279]	valid_0's auc: 0.964886
[280]	valid_0's auc: 0.964943
[281]	valid_0's auc: 0.964976
[282]	valid_0's auc: 0.965012
[283]	valid_0's auc: 0.96512
[284]	valid_0's auc: 0.965158
[285]	valid_0's auc: 0.965174
[286]	valid_0's auc: 0.965204
[287]	valid_0's auc: 0.965253
[288]	valid_0's auc: 0.965317
[289]	valid_0's auc: 0.965362
[290]	valid_0's auc: 0.965385
[291]	valid_0's auc: 0.965402
[292]	valid_0's auc: 0.965497
[293]	valid_0's auc: 0.96552
[294]	valid_0's auc: 0.965565
[295]	valid_0's auc: 0.965622
[296]	valid_0's auc: 0.965668
[297]	valid_0's auc: 0.965747
[298]	valid_0's auc: 0.96578
[299]	valid_0's auc: 0.965906
[300]	valid_0's auc: 0.965931
[301]	valid_0's auc: 0.9661
[302]	valid_0's auc: 0.966144
[303]	valid_0's auc: 0.966205
[304]	valid_0's auc: 0.966223
[305]	valid_0's auc: 0.966253
[306]	valid_0's auc: 0.96635
[307]	valid_0's auc: 0.966425
[308]	valid_0's auc: 0.966441
[309]	valid_0's auc: 0.966471
[310]	valid_0's auc: 0.966489
[311]	valid_0's auc: 0.966754
[312]	valid_0's 

[555]	valid_0's auc: 0.979725
[556]	valid_0's auc: 0.979815
[557]	valid_0's auc: 0.979836
[558]	valid_0's auc: 0.979863
[559]	valid_0's auc: 0.979874
[560]	valid_0's auc: 0.979921
[561]	valid_0's auc: 0.979948
[562]	valid_0's auc: 0.979957
[563]	valid_0's auc: 0.979966
[564]	valid_0's auc: 0.979976
[565]	valid_0's auc: 0.979992
[566]	valid_0's auc: 0.980047
[567]	valid_0's auc: 0.980063
[568]	valid_0's auc: 0.98008
[569]	valid_0's auc: 0.980086
[570]	valid_0's auc: 0.980121
[571]	valid_0's auc: 0.98017
[572]	valid_0's auc: 0.980174
[573]	valid_0's auc: 0.980184
[574]	valid_0's auc: 0.980191
[575]	valid_0's auc: 0.980211
[576]	valid_0's auc: 0.980243
[577]	valid_0's auc: 0.980253
[578]	valid_0's auc: 0.980291
[579]	valid_0's auc: 0.980321
[580]	valid_0's auc: 0.980359
[581]	valid_0's auc: 0.980402
[582]	valid_0's auc: 0.980417
[583]	valid_0's auc: 0.980455
[584]	valid_0's auc: 0.980464
[585]	valid_0's auc: 0.980475
[586]	valid_0's auc: 0.980551
[587]	valid_0's auc: 0.980604
[588]	valid_

In [74]:
lgb_train = classifier.predict(testx)

In [89]:
testids = test.pop('TransactionID')

test = test.fillna(0)

preds = classifier.predict(test)

KeyError: 'TransactionID'

In [90]:
preds = classifier.predict(test)

In [91]:
submission = pd.DataFrame({'TransactionID': testids, 'isFraud': preds})

submission.to_csv('lightgbm.csv', index=None)

In [36]:
submission.head(2)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.021297
1,3663550,0.021891


## XGBoost

In [37]:
import xgboost as xgb

In [45]:
trainxgb = xgb.DMatrix(trainx, label=trainy)

In [53]:
testxgb = xgb.DMatrix(testx)

In [65]:
param = {'max_depth':5, 'eta':1, 'objective':'binary:logistic' }

In [66]:
num_round = 200

In [67]:
xgb_classifier = xgb.train(param, trainxgb, num_round)

In [75]:
xgb_train = xgb_classifier.predict(testxgb)

In [58]:
from sklearn.metrics import roc_auc_score

In [76]:
roc_auc_score(y_true=testy, y_score=xgb_train)

0.9229017467940699

In [71]:
testset = xgb.DMatrix(test)

In [72]:
preds = xgb_classifier.predict(testset)

In [73]:
submission = pd.DataFrame({'TransactionID': testids, 'isFraud': preds})

submission.to_csv('xgb.csv', index=None)

# Ensemble

In [77]:
from sklearn.linear_model import LogisticRegression

In [78]:
logreg = LogisticRegression()

In [79]:
dataframe = pd.DataFrame({'xgb': xgb_train, 'lgb': lgb_train})

In [80]:
logreg.fit(dataframe, testy)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [82]:
lgb_preds = classifier.predict(test)
xgb_preds = xgb_classifier.predict(testset)

In [84]:
final_pred = logreg.predict_proba(pd.DataFrame({'xgb': xgb_preds, 'lgb': lgb_preds}))

In [85]:
submission = pd.DataFrame({'TransactionID': testids, 'isFraud': preds})

submission.to_csv('ensemble.csv', index=None)

# Kernel attempt

In [1]:
import os
import gc
import numpy as np
import pandas as pd

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
!ls

ieee-fraud-detection.zip
sample_submission.csv
submissions
test_identity.csv
test_transaction.csv
train_identity.csv
train_transaction.csv
Untitled.ipynb


In [36]:
train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
sample_submission = pd.read_csv('sample_submission.csv', index_col='TransactionID')

In [42]:
train_transaction.memory_usage().sum()/(1024*1024)

1775.1522827148438

In [37]:
train_transaction.head(2)

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,


In [47]:
train_identity.head(2)

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device


In [48]:
list(train_identity.columns)

['id_01',
 'id_02',
 'id_03',
 'id_04',
 'id_05',
 'id_06',
 'id_07',
 'id_08',
 'id_09',
 'id_10',
 'id_11',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']

In [43]:
list(train_transaction.columns)

['isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67

In [38]:
pd.DataFrame.merge?

In [49]:
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
del train_transaction, train_identity
gc.collect()
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
del test_transaction, test_identity
gc.collect

train=reduce_mem_usage(train)
test=reduce_mem_usage(test)
print('training set shape:', train.shape)
print('test set shape:', test.shape)

Mem. usage decreased to 668.22 Mb (66.2% reduction)
Mem. usage decreased to 583.43 Mb (65.6% reduction)
training set shape: (590540, 433)
test set shape: (506691, 432)


In [4]:
cols_to_drop=['V300','V309','V111','C3','V124','V106','V125','V315','V134','V102','V123','V316','V113',
              'V136','V305','V110','V299','V289','V286','V318','V103','V304','V116','V29','V284','V293',
              'V137','V295','V301','V104','V311','V115','V109','V119','V321','V114','V133','V122','V319',
              'V105','V112','V118','V117','V121','V108','V135','V320','V303','V297','V120']


print('{} features are going to be dropped for being useless'.format(len(cols_to_drop)))

train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

50 features are going to be dropped for being useless


In [5]:
train['P_isproton']=(train['P_emaildomain']=='protonmail.com')
train['R_isproton']=(train['R_emaildomain']=='protonmail.com')
test['P_isproton']=(test['P_emaildomain']=='protonmail.com')
test['R_isproton']=(test['R_emaildomain']=='protonmail.com')

In [6]:
train['nulls1'] = train.isnull().sum(axis=1)
test['nulls1'] = test.isnull().sum(axis=1)

In [7]:
a = np.zeros(train.shape[0])
train["lastest_browser"] = a
a = np.zeros(test.shape[0])
test["lastest_browser"] = a
def setbrowser(df):
    df.loc[df["id_31"]=="samsung browser 7.0",'lastest_browser']=1
    df.loc[df["id_31"]=="opera 53.0",'lastest_browser']=1
    df.loc[df["id_31"]=="mobile safari 10.0",'lastest_browser']=1
    df.loc[df["id_31"]=="google search application 49.0",'lastest_browser']=1
    df.loc[df["id_31"]=="firefox 60.0",'lastest_browser']=1
    df.loc[df["id_31"]=="edge 17.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 69.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 67.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for ios",'lastest_browser']=1
    return df
train=setbrowser(train)
test=setbrowser(test)

In [8]:
train['card1_count_full'] = train['card1'].map(pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False))
test['card1_count_full'] = test['card1'].map(pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False))

train['card2_count_full'] = train['card2'].map(pd.concat([train['card2'], test['card2']], ignore_index=True).value_counts(dropna=False))
test['card2_count_full'] = test['card2'].map(pd.concat([train['card2'], test['card2']], ignore_index=True).value_counts(dropna=False))

train['card3_count_full'] = train['card3'].map(pd.concat([train['card3'], test['card3']], ignore_index=True).value_counts(dropna=False))
test['card3_count_full'] = test['card3'].map(pd.concat([train['card3'], test['card3']], ignore_index=True).value_counts(dropna=False))

train['card4_count_full'] = train['card4'].map(pd.concat([train['card4'], test['card4']], ignore_index=True).value_counts(dropna=False))
test['card4_count_full'] = test['card4'].map(pd.concat([train['card4'], test['card4']], ignore_index=True).value_counts(dropna=False))

train['card5_count_full'] = train['card5'].map(pd.concat([train['card5'], test['card5']], ignore_index=True).value_counts(dropna=False))
test['card5_count_full'] = test['card5'].map(pd.concat([train['card5'], test['card5']], ignore_index=True).value_counts(dropna=False))

train['card6_count_full'] = train['card6'].map(pd.concat([train['card6'], test['card6']], ignore_index=True).value_counts(dropna=False))
test['card6_count_full'] = test['card6'].map(pd.concat([train['card6'], test['card6']], ignore_index=True).value_counts(dropna=False))


train['addr1_count_full'] = train['addr1'].map(pd.concat([train['addr1'], test['addr1']], ignore_index=True).value_counts(dropna=False))
test['addr1_count_full'] = test['addr1'].map(pd.concat([train['addr1'], test['addr1']], ignore_index=True).value_counts(dropna=False))

train['addr2_count_full'] = train['addr2'].map(pd.concat([train['addr2'], test['addr2']], ignore_index=True).value_counts(dropna=False))
test['addr2_count_full'] = test['addr2'].map(pd.concat([train['addr2'], test['addr2']], ignore_index=True).value_counts(dropna=False))


In [9]:
train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')

test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std')
test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('std')

train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')
test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean')
test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std')

train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

In [10]:
train['Transaction_day_of_week'] = np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)
test['Transaction_day_of_week'] = np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)

train['Transaction_hour_of_day'] = np.floor(train['TransactionDT'] / 3600) % 24
test['Transaction_hour_of_day'] = np.floor(test['TransactionDT'] / 3600) % 24

train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)

In [11]:
from sklearn import preprocessing

for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:

    f1, f2 = feature.split('__')
    train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
    test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)

    le =preprocessing.LabelEncoder()
    le.fit(list(train[feature].astype(str).values) + list(test[feature].astype(str).values))
    train[feature] = le.transform(list(train[feature].astype(str).values))
    test[feature] = le.transform(list(test[feature].astype(str).values))
    
for feature in ['id_01', 'id_31', 'id_33', 'id_35']:
    # Count encoded separately for train and test
    train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
    test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))
    
category_features=["ProductCD","P_emaildomain",
                   "R_emaildomain","M1","M2","M3","M4","M5","M6","M7","M8","M9","DeviceType","DeviceInfo","id_12",
                   "id_13","id_14","id_15","id_16","id_17","id_18","id_19","id_20","id_21","id_22","id_23","id_24",
                   "id_25","id_26","id_27","id_28","id_29","id_30","id_32","id_34", 'id_36'
                   "id_37","id_38"]
for c in category_features:
    train[feature + '_count_full'] = train[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))
    test[feature + '_count_full'] = test[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))

In [12]:
y_train = train['isFraud'].copy()


X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

del train, test

#fill in mean for floats
for c in X_train.columns:
    if X_train[c].dtype=='float16' or  X_train[c].dtype=='float32' or  X_train[c].dtype=='float64':
        X_train[c].fillna(X_train[c].mean())
        X_test[c].fillna(X_train[c].mean())

#fill in -999 for categoricals
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)
# Label Encoding
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))  
        

print('Labelling done.')

Labelling done.


Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain',
       'addr1__card1', 'id_01_count_dist', 'id_31_count_dist',
       'id_33_count_dist', 'id_35_count_dist', 'id_35_count_full'],
      dtype='object', length=426)

In [13]:
from sklearn.model_selection import TimeSeriesSplit,KFold
n_fold = 4
folds = KFold(n_splits=n_fold,shuffle=True)

print(folds)

KFold(n_splits=4, random_state=None, shuffle=True)


In [14]:
params = {'num_leaves': 500,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47,
         }

In [15]:
lgb_submission=sample_submission.copy()
lgb_submission['isFraud'] = 0
import lightgbm as lgb
from sklearn.metrics import roc_auc_score


lgb_submission=sample_submission.copy()
lgb_submission['isFraud'] = 0
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)
    
    lgbclf = lgb.LGBMClassifier(
        num_leaves= 512,
        n_estimators=512,
        max_depth=9,
        learning_rate=0.064,
        subsample=0.85,
        colsample_bytree=0.85,
        boosting_type= "gbdt",
        reg_alpha=0.3,
        reg_lamdba=0.243
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    lgbclf.fit(X_train_,y_train_)
    
    del X_train_,y_train_
    print('finish train')
    pred=lgbclf.predict_proba(X_test)[:,1]
    val=lgbclf.predict_proba(X_valid)[:,1]
    print('finish pred')
    del lgbclf, X_valid
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    lgb_submission['isFraud'] = lgb_submission['isFraud']+pred/n_fold
    del pred
    gc.collect()

0
finish train
finish pred
ROC accuracy: 0.9705323718088666
1
finish train
finish pred
ROC accuracy: 0.9717637292446697
2
finish train
finish pred
ROC accuracy: 0.9671940772443912
3
finish train
finish pred
ROC accuracy: 0.972095848929921


In [16]:
lgb_submission

Unnamed: 0_level_0,isFraud
TransactionID,Unnamed: 1_level_1
3663549,0.000188
3663550,0.000806
3663551,0.000268
3663552,0.000423
3663553,0.000763
3663554,0.001786
3663555,0.004743
3663556,0.013763
3663557,0.000116
3663558,0.005744


In [31]:
lgb_submission.to_csv('submissions/kernel_submission2.csv', index=None)

In [21]:
lgb_submission['TransactionID'] = lgb_submission.index

In [23]:
lgb_submission = lgb_submission.reset_index(drop=True)

In [28]:
lgb_submission = lgb_submission[['TransactionID', 'isFraud']]

In [29]:
lgb_submission.head(2)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000188
1,3663550,0.000806


In [27]:
sample_submission

Unnamed: 0_level_0,isFraud
TransactionID,Unnamed: 1_level_1
3663549,0.5
3663550,0.5
3663551,0.5
3663552,0.5
3663553,0.5
3663554,0.5
3663555,0.5
3663556,0.5
3663557,0.5
3663558,0.5


In [24]:
lgb_submission.index

RangeIndex(start=0, stop=506691, step=1)

In [None]:
xgb_submission=sample_submission.copy()
xgb_submission['isFraud'] = 0
import xgboost as xgb
from sklearn.metrics import roc_auc_score
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    xgbclf = xgb.XGBClassifier(
        n_estimators=512,
        max_depth=16,
        learning_rate=0.014,
        subsample=0.85,
        colsample_bytree=0.85,
        missing=-999,
        tree_method='gpu_hist',
        reg_alpha=0.3,
        reg_lamdba=0.243
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    xgbclf.fit(X_train_,y_train_)
    del X_train_,y_train_
    pred=xgbclf.predict_proba(X_test)[:,1]
    val=xgbclf.predict_proba(X_valid)[:,1]
    del xgbclf, X_valid
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    xgb_submission['isFraud'] = xgb_submission['isFraud']+pred/n_fold
    del pred
    gc.collect()

0


In [None]:
ensemble=sample_submission.copy()
ensemble.isFraud=lgb_submission*0.5+xgb_submission*0.5
ensemble.to_csv('xgb_lgb.csv')