# Loading the dataset and reducing memory usage

In [1]:
import os
import gc
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    print('Starting memory: {}'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    print('End memory: {}'.format(end_mem))
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
sample_submission = pd.read_csv('sample_submission.csv', index_col='TransactionID')

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
del train_transaction, train_identity
gc.collect()
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
del test_transaction, test_identity
gc.collect

train=reduce_mem_usage(train)
test=reduce_mem_usage(test)
print('training set shape:', train.shape)
print('test set shape:', test.shape)

# Replacing transaction time

* Transactions are in order of time
* One month difference between train and test

In [None]:
(test['TransactionDT'].min() - train['TransactionDT'].max()) / (3600.0 * 24)

In [None]:
# Day of week
train['Dayofweek'] = train['TransactionDT'].apply(lambda x: int(np.floor((x / (3600*24) - 1) % 7)))
test['Dayofweek'] = test['TransactionDT'].apply(lambda x: int(np.floor((x / (3600*24) - 1) % 7)))

# Hour of day
train['Hourofday'] = train['TransactionDT'].apply(lambda x: int(np.floor((x / 3600) % 24)))
test['Hourofday'] = test['TransactionDT'].apply(lambda x: int(np.floor((x / 3600) % 24)))

In [None]:
del train['TransactionDT']

del test['TransactionDT']

# Transaction amount

In [None]:
list(train.columns)

In [None]:
train['Amtdecimal'] = train['TransactionAmt'].apply(lambda x: x - int(x))
test['Amtdecimal'] = test['TransactionAmt'].apply(lambda x: x - int(x))

In [None]:
test['Amtdecimal'] = test['TransactionAmt'].apply(lambda x: x - int(x))
test['decimalpresent'] = test['Amtdecimal'] > 0

In [None]:
train['decimalpresent'] = train['Amtdecimal'] > 0
test['decimalpresent'] = test['Amtdecimal'] > 0

In [None]:
train[train['isFraud'] == 0]['decimalpresent'].value_counts()

In [None]:
train[train['isFraud'] == 1]['decimalpresent'].value_counts()

In [None]:
list(test.columns)

In [None]:
list(train.columns)

# Saving here

In [None]:
train.to_pickle('processed_train.pkl')

In [None]:
test.to_pickle('processed_test.pkl')

In [None]:
import pandas as pd

In [2]:
train = pd.read_pickle('processed_train.pkl')
test = pd.read_pickle('processed_test.pkl')

In [3]:
def describe_column(column, threshold=10):
    print('Number of unique values: {}'.format(column.nunique()))
    if column.nunique() < threshold:
        print('Value counts: {}'.format(column.value_counts()))
    print('Missing values: {}'.format(column.isnull().sum()))

# Categorical columns

In [9]:
cats = ['ProductCD', 'Dayofweek', 'Hourofday', 'decimalpresent', 'card4', 'card6']

# Resume

In [10]:
train = train[train['TransactionAmt'] < 10000]

# Cards

* Card 4 and card 6 have only 4 unique values, others are troublesome
* selecting only cards 4 and 6

In [11]:
describe_column(train['card1'])

Number of unique values: 13553
Missing values: 0


In [13]:
pd.concat([train['card1'], test['card1']]).value_counts()

7919     28015
9500     26243
15885    22691
17188    19606
15066    14606
6019     13268
12695    12732
12544    12694
2803     11043
7585     10097
12839     9593
10616     9419
3154      9253
9633      8909
2884      7683
18132     7635
15497     7266
16132     7253
16075     7051
5812      6697
7508      6659
2616      6196
10112     6058
4461      5959
12501     5886
10057     5642
16659     5627
11207     5595
7664      5237
16136     5155
         ...  
1438         1
8603         1
17815        1
13721        1
11674        1
16787        1
7087         1
12200        1
1432         1
7574         1
8594         1
7086         1
9618         1
10642        1
7572         1
8111         1
8604         1
17808        1
5530         1
13727        1
7579         1
1964         1
16785        1
16299        1
14250        1
14750        1
12703        1
17809        1
3480         1
12204        1
Name: card1, Length: 17091, dtype: int64

In [14]:
train['card1'].map(pd.concat([train['card1'], test['card1']]).value_counts())

TransactionID
2987000       56
2987001     1338
2987002     1794
2987003     7635
2987004       30
2987005       10
2987006      320
2987007    12732
2987008    11043
2987009     3319
2987010        4
2987011     5959
2987012      403
2987013     1076
2987014     3561
2987015       16
2987016       29
2987017       85
2987018     1794
2987019     1460
2987020      104
2987021      139
2987022     2985
2987023     2604
2987024     6058
2987025     1131
2987026     1009
2987027        5
2987028     1201
2987029     4906
           ...  
3577510    26243
3577511       10
3577512      598
3577513       31
3577514       29
3577515      115
3577516    19606
3577517      598
3577518      610
3577519     2786
3577520      147
3577521       77
3577522      693
3577523      199
3577524     5021
3577525      259
3577526      939
3577527       98
3577528    14606
3577529        4
3577530    14606
3577531    13268
3577532     1373
3577533        7
3577534     6697
3577535     2110
3577536       25


In [6]:
dontwant = set(['card' + str(i) for i in [1,2,3,5]])

In [7]:
train = train[[col for col in train.columns if col not in dontwant]]
test = test[[col for col in test.columns if col not in dontwant]]

In [8]:
describe_column(train['card6'], 10)

Number of unique values: 4
Value counts: debit              439938
credit             148984
debit or credit        30
charge card            15
Name: card6, dtype: int64
Missing values: 1571


In [9]:
describe_column(train['card4'], 10)

Number of unique values: 4
Value counts: visa                384767
mastercard          189215
american express      8328
discover              6651
Name: card4, dtype: int64
Missing values: 1577


In [8]:
train['card4'] = train['card4'].fillna('visa')
test['card4'] = test['card4'].fillna('visa')

In [9]:
train['card6'] = train['card6'].fillna('debit')
test['card6'] = test['card6'].fillna('debit')

# Address

Removing addresses due to cardinality

In [16]:
describe_column(train['addr2'], 10)

KeyError: 'addr2'

In [10]:
train = train[[col for col in train.columns if col not in {'addr1', 'addr2'}]]
test = test[[col for col in test.columns if col not in {'addr1', 'addr2'}]]

# Distances

Removing distances due to lot of missing values

In [13]:
describe_column(train['dist2'], 10)

Number of unique values: 1699
Missing values: 552911


In [11]:
train = train[[col for col in train.columns if col not in {'dist1', 'dist2'}]]

test = test[[col for col in test.columns if col not in {'dist1', 'dist2'}]]

# Email

Removing emails too

In [12]:
train = train[[col for col in train.columns if col not in {'P_emaildomain', 'R_emaildomain'}]]

test = test[[col for col in test.columns if col not in {'P_emaildomain', 'R_emaildomain'}]]

In [38]:
describe_column(train['P_emaildomain'], 65)

Number of unique values: 59
Value counts: gmail.com           228355
yahoo.com           100932
hotmail.com          45250
anonymous.com        36998
aol.com              28289
comcast.net           7888
icloud.com            6267
outlook.com           5096
msn.com               4092
att.net               4033
live.com              3041
sbcglobal.net         2970
verizon.net           2705
ymail.com             2396
bellsouth.net         1909
yahoo.com.mx          1543
me.com                1522
cox.net               1393
optonline.net         1011
charter.net            816
live.com.mx            749
rocketmail.com         664
mail.com               559
earthlink.net          514
gmail                  496
outlook.es             438
mac.com                436
juno.com               322
aim.com                315
hotmail.es             305
windstream.net         305
roadrunner.com         305
hotmail.fr             295
frontier.com           280
embarqmail.com         260
web.de       

In [37]:
describe_column(train['P_emaildomain'][train['isFraud'] == 1], 65)

Number of unique values: 42
Value counts: gmail.com          9943
hotmail.com        2396
yahoo.com          2297
anonymous.com       859
aol.com             617
outlook.com         482
comcast.net         246
icloud.com          197
mail.com            106
msn.com              90
live.com             84
outlook.es           57
bellsouth.net        53
ymail.com            50
live.com.mx          41
aim.com              40
protonmail.com       31
att.net              30
cox.net              29
me.com               27
charter.net          25
verizon.net          22
hotmail.es           20
optonline.net        17
yahoo.com.mx         16
mac.com              14
sbcglobal.net        12
earthlink.net        11
gmail                11
embarqmail.com        9
frontier.com          8
juno.com              6
frontiernet.net       5
yahoo.fr              5
suddenlink.net        4
roadrunner.com        3
cableone.net          3
yahoo.es              2
rocketmail.com        2
netzero.net           

# C values

All look fine

In [56]:
describe_column(train['D1'])

Number of unique values: 641
Missing values: 1269


In [60]:
len(train)

590538

# D values

In [13]:
unwanted_columns = []
for col in ['D' + str(i) for i in range(1, 15)]:
    if train[col].isnull().sum() > 400000:
        print(col)
        unwanted_columns.append(col)

D6
D7
D8
D9
D12
D13
D14


In [14]:
train = train[[col for col in train.columns if col not in unwanted_columns]]

test = test[[col for col in test.columns if col not in unwanted_columns]]

# M values

Removing all M values, will include later

In [23]:
describe_column(train['M9'])

Number of unique values: 2
Value counts: T    205654
F     38632
Name: M9, dtype: int64
Missing values: 346252


In [15]:
unwanted_columns = ['M' + str(i) for i in range(1, 10)]

In [16]:
train = train[[col for col in train.columns if col not in unwanted_columns]]

test = test[[col for col in test.columns if col not in unwanted_columns]]

# V values

In [17]:
unwanted_columns = []
for col in ['V' + str(i) for i in range(1, 340)]:
    if train[col].isnull().sum() > 200000:
        unwanted_columns.append(col)

In [18]:
train = train[[col for col in train.columns if col not in unwanted_columns]]

test = test[[col for col in test.columns if col not in unwanted_columns]]

# ID

Excluding all IDs

In [19]:
train = train[[col for col in train.columns if 'id_' not in col]]

test = test[[col for col in test.columns if 'id_' not in col]]

# Device

In [29]:
describe_column(train['DeviceInfo'])

Number of unique values: 1786
Missing values: 471872


In [35]:
describe_column(train['DeviceType'])

Number of unique values: 2
Value counts: desktop    85165
mobile     55645
Name: DeviceType, dtype: int64
Missing values: 449728


In [20]:
train = train[[col for col in train.columns if 'Device' not in col]]

test = test[[col for col in test.columns if 'Device' not in col]]

In [21]:
cats = ['ProductCD', 'Dayofweek', 'Hourofday', 'decimalpresent', 'card4', 'card6']

In [39]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
encoder = LabelEncoder()

In [23]:
test['card6'].value_counts()

debit          388028
credit         118662
charge card         1
Name: card6, dtype: int64

In [24]:
len(train)

590538

In [25]:
train = train[train['card6'] != 'debit or credit']

In [26]:
len(train)

590508

In [27]:
train['card6'].value_counts()

debit          441509
credit         148984
charge card        15
Name: card6, dtype: int64

In [28]:
train_index = train.index
test_index = test.index

In [29]:
for col in cats:
    dums = pd.get_dummies(train[col])
    train = pd.concat([train, dums], axis=1)
    dums = pd.get_dummies(test[col])
    test = pd.concat([test, dums], axis=1)
    
    del train[col]
    del test[col]

In [30]:
len(train.columns)

239

In [31]:
len(test.columns)

238

# Modeling

In [32]:
target = train.pop('isFraud')

In [33]:
params = {'num_leaves': 500,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47,
         }

In [34]:
from sklearn.model_selection import TimeSeriesSplit,KFold
n_fold = 5
folds = KFold(n_splits=n_fold,shuffle=True)

print(folds)

KFold(n_splits=5, random_state=None, shuffle=True)


In [35]:
X_train = train.copy()

In [36]:
y_train = target.copy()

In [37]:
X_test = test.copy()

In [44]:
merged = pd.concat([X_train, X_test])

In [45]:
merged = merged.fillna(merged.mean())

In [51]:
X_train = merged.loc[train.index]
X_test = merged.loc[test.index]

In [39]:
sample_submission = pd.read_csv('sample_submission.csv')

In [54]:
# My training
lgb_submission=sample_submission.copy()
lgb_submission['isFraud'] = 0
import lightgbm as lgb
from sklearn.metrics import roc_auc_score


lgb_submission=sample_submission.copy()
lgb_submission['isFraud'] = 0
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)
    
    lgbclf = lgb.LGBMClassifier(
        num_leaves= 512,
        n_estimators=512,
        max_depth=12,
        learning_rate=0.06,
        subsample=0.85,
        colsample_bytree=0.85,
        boosting_type= "gbdt",
        reg_alpha=0.3,
        reg_lamdba=0.243,
        importance_type='gain'
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    lgbclf.fit(X_train_,y_train_)
    
    del X_train_,y_train_
    print('finish train')
    pred=lgbclf.predict_proba(X_test)[:,1]
    val=lgbclf.predict_proba(X_valid)[:,1]
    print('finish pred')
    del lgbclf, X_valid
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    lgb_submission['isFraud'] = lgb_submission['isFraud']+pred/n_fold
    del pred
    gc.collect()

0
finish train
finish pred
ROC accuracy: 0.9508079123065232
1
finish train
finish pred
ROC accuracy: 0.9483916434555311
2
finish train
finish pred
ROC accuracy: 0.9509886980463349
3
finish train
finish pred
ROC accuracy: 0.947414281874355
4
finish train
finish pred
ROC accuracy: 0.9483125660043741


In [55]:
lgb_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000221
1,3663550,0.000145
2,3663551,0.00043
3,3663552,9.9e-05
4,3663553,0.000715


In [56]:
lgb_submission.to_csv('submissions/my_features.csv', index=None)