In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pylab as plt
import sys, gc, warnings, random, math, time, datetime 
from tqdm import tqdm

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
import lightgbm as lgb

import os

# Notebook-wide display/diagnostic settings.
# Blanket-silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# None lifts the cap on how many columns pandas renders for a frame.
pd.options.display.max_columns = None
# Optional: lift the row cap too with pd.set_option('display.max_rows', None)

# Show which files and folders are available under the input directory.
input_entries = os.listdir("../input/")
print(input_entries)

['train_transaction.csv', 'train_identity.csv', 'ieee-fe-with-some-eda', 'separated', 'sample_submission.csv', 'ieee-data-minification', 'reduced', 'standalone-train-and-test-preprocessing', 'test_identity.csv', 'mini-model-data', 'test_transaction.csv', 'ieee-fe-for-local-test']


In [2]:
from utils import *

In [3]:
# Single seed value shared by the notebook.
SEED = 42
# seed_everything is not defined in this notebook; it arrives through the
# star import from utils. NOTE(review): presumably seeds random/numpy (and
# possibly more) -- confirm against utils before relying on it.
seed_everything(SEED)

In [4]:
# loading data
# All four CSVs are read with TransactionID as the row index.
csv_options = dict(index_col='TransactionID')

# Transaction tables.
train_df = pd.read_csv('../input/train_transaction.csv', **csv_options)
test_df = pd.read_csv('../input/test_transaction.csv', **csv_options)
# Placeholder isFraud column on the test frame, filled with 0.
test_df['isFraud'] = 0

# Identity tables.
train_identity = pd.read_csv('../input/train_identity.csv', **csv_options)
test_identity = pd.read_csv('../input/test_identity.csv', **csv_options)

In [5]:
# Run every loaded frame through reduce_mem_usage (comes from the utils star
# import) and rebind the name to whatever it returns. Each call prints one
# "Mem. usage decreased to ..." line; calls are evaluated left to right, so
# the order is train/test transactions, then train/test identity.
# NOTE(review): presumably it downcasts column dtypes -- confirm in utils.
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

train_identity, test_identity = (
    reduce_mem_usage(train_identity),
    reduce_mem_usage(test_identity),
)

Mem. usage decreased to 544.60 Mb (69.3% reduction)
Mem. usage decreased to 475.00 Mb (68.8% reduction)
Mem. usage decreased to 26.41 Mb (41.5% reduction)
Mem. usage decreased to 25.98 Mb (41.5% reduction)


In [6]:
# Distinct ProductCD values present in the training transactions.
product_codes = train_df['ProductCD'].unique()
print(product_codes)

['W' 'H' 'C' 'S' 'R']


In [7]:
# W
# Keep only the rows whose ProductCD equals 'W', for train and test alike.
train_df_w = train_df.loc[train_df['ProductCD'] == 'W']
test_df_w = test_df.loc[test_df['ProductCD'] == 'W']
for w_frame in (train_df_w, test_df_w):
    print(w_frame.shape)

# Percentage of training rows in this subset with isFraud set.
w_fraud_pct = train_df_w['isFraud'].sum() / len(train_df_w) * 100
print('W fraud rate is: ', w_fraud_pct, '%')

# Persist both subsets as pickles (train first, then test).
for w_frame, w_path in (
    (train_df_w, '../input/mini-model-data/train_df_w.pkl'),
    (test_df_w, '../input/mini-model-data/test_df_w.pkl'),
):
    w_frame.to_pickle(w_path)

(439670, 393)
(360987, 393)
W fraud rate is:  2.0399390451929857 %


In [8]:
# H
# Keep only the rows whose ProductCD equals 'H', for train and test alike.
train_df_h = train_df.loc[train_df['ProductCD'] == 'H']
test_df_h = test_df.loc[test_df['ProductCD'] == 'H']
for h_frame in (train_df_h, test_df_h):
    print(h_frame.shape)

# Percentage of training rows in this subset with isFraud set.
h_fraud_pct = train_df_h['isFraud'].sum() / len(train_df_h) * 100
print('H fraud rate is: ', h_fraud_pct, '%')

# Persist both subsets as pickles (train first, then test).
for h_frame, h_path in (
    (train_df_h, '../input/mini-model-data/train_df_h.pkl'),
    (test_df_h, '../input/mini-model-data/test_df_h.pkl'),
):
    h_frame.to_pickle(h_path)

(33024, 393)
(29373, 393)
H fraud rate is:  4.766230620155039 %


In [9]:
# C
# Keep only the rows whose ProductCD equals 'C', for train and test alike.
train_df_c = train_df.loc[train_df['ProductCD'] == 'C']
test_df_c = test_df.loc[test_df['ProductCD'] == 'C']
for c_frame in (train_df_c, test_df_c):
    print(c_frame.shape)

# Percentage of training rows in this subset with isFraud set.
c_fraud_pct = train_df_c['isFraud'].sum() / len(train_df_c) * 100
print('C fraud rate is: ', c_fraud_pct, '%')

# Persist both subsets as pickles (train first, then test).
for c_frame, c_path in (
    (train_df_c, '../input/mini-model-data/train_df_c.pkl'),
    (test_df_c, '../input/mini-model-data/test_df_c.pkl'),
):
    c_frame.to_pickle(c_path)

(68519, 393)
(69266, 393)
C fraud rate is:  11.687269224594637 %


In [10]:
# S
# Keep only the rows whose ProductCD equals 'S', for train and test alike.
train_df_s = train_df.loc[train_df['ProductCD'] == 'S']
test_df_s = test_df.loc[test_df['ProductCD'] == 'S']
for s_frame in (train_df_s, test_df_s):
    print(s_frame.shape)

# Percentage of training rows in this subset with isFraud set.
s_fraud_pct = train_df_s['isFraud'].sum() / len(train_df_s) * 100
print('S fraud rate is: ', s_fraud_pct, '%')

# Persist both subsets as pickles (train first, then test).
for s_frame, s_path in (
    (train_df_s, '../input/mini-model-data/train_df_s.pkl'),
    (test_df_s, '../input/mini-model-data/test_df_s.pkl'),
):
    s_frame.to_pickle(s_path)

(11628, 393)
(11418, 393)
S fraud rate is:  5.8995528035775715 %


In [11]:
# R
# Keep only the rows whose ProductCD equals 'R', for train and test alike.
train_df_r = train_df.loc[train_df['ProductCD'] == 'R']
test_df_r = test_df.loc[test_df['ProductCD'] == 'R']
for r_frame in (train_df_r, test_df_r):
    print(r_frame.shape)

# Percentage of training rows in this subset with isFraud set.
r_fraud_pct = train_df_r['isFraud'].sum() / len(train_df_r) * 100
print('R fraud rate is: ', r_fraud_pct, '%')

# Persist both subsets as pickles (train first, then test).
for r_frame, r_path in (
    (train_df_r, '../input/mini-model-data/train_df_r.pkl'),
    (test_df_r, '../input/mini-model-data/test_df_r.pkl'),
):
    r_frame.to_pickle(r_path)

(37699, 393)
(35647, 393)
R fraud rate is:  3.782593702750736 %


In [12]:
# Store the identity tables as pickles in the same folder as the per-product
# transaction subsets (train first, then test).
for identity_frame, identity_path in (
    (train_identity, '../input/mini-model-data/train_identity.pkl'),
    (test_identity, '../input/mini-model-data/test_identity.pkl'),
):
    identity_frame.to_pickle(identity_path)