In [1]:
from sklearn import preprocessing
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import datetime
import sys
import os
sys.path.append(os.path.abspath('../src'))
import data_prep, ai_config
import importlib
importlib.reload(data_prep)

<module 'data_prep' from '/Users/wihill/git/AI_Based_Hyperparameter_Tuning/src/data_prep.py'>

# LOAD RAW

In [63]:
%%time
# 45 seconds
# Timestamp the run so the recorded output can be matched to a session.
print(datetime.datetime.now())

# Load the raw dataset through the project helper.
df = data_prep.get_ieee_fraud_data()

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

2020-02-09 11:28:45.561849


(1097231, 433)

3.55 

float64    399
object      31
int64        3
dtype: int64 

CPU times: user 35.8 s, sys: 11.8 s, total: 47.6 s
Wall time: 48.7 s


In [64]:
%%time
# 11 SECONDS
# Checkpoint the raw frame to 'raw.ftr'. reset_index() moves the index into a
# regular column before writing; the reload cell restores it with
# set_index('TransactionID').
df.reset_index().to_feather('raw.ftr')

CPU times: user 8.43 s, sys: 8.49 s, total: 16.9 s
Wall time: 12.9 s


In [65]:
%%time
# 10 SECONDS

# Reload the raw checkpoint and restore TransactionID as the index.
df = pd.read_feather('raw.ftr')
df = df.set_index('TransactionID')

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

(1097231, 433)

3.55 

float64    399
object      31
int64        3
dtype: int64 

CPU times: user 5.1 s, sys: 11.2 s, total: 16.3 s
Wall time: 11 s


In [None]:
# Preview only the first rows instead of rendering the whole frame
# (the load cell reports 1,097,231 rows x 433 columns).
df.head()

# OBJECT TO NUMERIC

In [5]:
%%time
# 3 SECONDS
print(datetime.datetime.now())

# Try to parse every object column as numeric. Columns that contain
# non-numeric values are left exactly as they are. This mirrors the old
# errors='ignore' behaviour, which is deprecated in pandas, by catching
# the parse failure explicitly.
for c in tqdm(df.select_dtypes(include='object').columns):
    try:
        df[c] = pd.to_numeric(df[c])
    except (ValueError, TypeError):
        # Not a numeric column; keep the original object values.
        pass

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round((df.memory_usage().sum() / 1024**3), 2), '\n')
print(df.dtypes.value_counts(), '\n')

2020-02-09 10:41:38.775758


HBox(children=(FloatProgress(value=0.0, max=433.0), HTML(value='')))




(1097231, 433)

3.55 

float64    399
object      31
int64        3
dtype: int64 

CPU times: user 711 ms, sys: 32.8 ms, total: 744 ms
Wall time: 746 ms


In [6]:
%%time
# 7 SECONDS
# Checkpoint the frame after the object-to-numeric pass to 'obj2num.ftr'.
# reset_index() moves the index into a regular column before writing; the
# reload cell restores it with set_index('TransactionID').
df.reset_index().to_feather('obj2num.ftr')

CPU times: user 5.88 s, sys: 3.43 s, total: 9.31 s
Wall time: 6.88 s


In [7]:
%%time
# 7 SECONDS

# Reload the object-to-numeric checkpoint and restore TransactionID as the index.
df = pd.read_feather('obj2num.ftr')
df = df.set_index('TransactionID')

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

(1097231, 433)

3.55 

float64    399
object      31
int64        3
dtype: int64 

CPU times: user 4.28 s, sys: 5.33 s, total: 9.61 s
Wall time: 6.52 s


# IMPUTE

In [8]:
%%time
# 13 SECONDS

# Fill missing values column by column:
#   * numeric columns     -> column median (when NUM_IMPUTE_TYPE == 'MEDIAN')
#   * non-numeric columns -> most frequent value (first mode)
# Columns that cannot be imputed are collected and reported instead of being
# skipped silently or raising from an empty mode().
unimputed = []

for c in tqdm(df.columns):
    col = df[c]

    if not col.isnull().any():
        continue

    if np.issubdtype(col.dtype, np.number):
        if ai_config.NUM_IMPUTE_TYPE == 'MEDIAN':
            df[c] = col.fillna(col.median())
        else:
            # No strategy implemented for this setting; keep the NaNs but say so.
            unimputed.append(c)
    else:
        modes = col.mode()
        if modes.empty:
            # Column is entirely null, so there is no most-frequent value.
            unimputed.append(c)
        else:
            df[c] = col.fillna(modes.iloc[0])

if unimputed:
    print('WARNING: columns left with missing values:', unimputed)

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round((df.memory_usage().sum() / 1024**3), 2), '\n')
print(df.dtypes.value_counts())

HBox(children=(FloatProgress(value=0.0, max=433.0), HTML(value='')))




(1097231, 433)

3.55 

float64    399
object      31
int64        3
dtype: int64
CPU times: user 12.5 s, sys: 1.57 s, total: 14.1 s
Wall time: 14.1 s


# CHECK FOR MISSING

In [9]:
# Sanity check after imputation: print the name of every column that still
# contains at least one null. No printed names means nothing is missing.
for col_name in tqdm(df.columns.tolist()):
    null_count = df[col_name].isnull().sum()
    if null_count > 0:
        print(col_name)

HBox(children=(FloatProgress(value=0.0, max=433.0), HTML(value='')))




In [10]:
%%time
# 6 SECONDS
# Checkpoint the imputed frame to 'imputed.ftr'. reset_index() moves the index
# into a regular column before writing; the reload cell restores it with
# set_index('TransactionID').
df.reset_index().to_feather('imputed.ftr')

CPU times: user 5.12 s, sys: 2.62 s, total: 7.74 s
Wall time: 5.81 s


In [11]:
%%time
# 7 SECONDS

# Reload the imputed checkpoint and restore TransactionID as the index.
df = pd.read_feather('imputed.ftr')
df = df.set_index('TransactionID')

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

(1097231, 433)

3.55 

float64    399
object      31
int64        3
dtype: int64 

CPU times: user 3.69 s, sys: 4.49 s, total: 8.18 s
Wall time: 6.96 s


# FLOAT TO INT

In [21]:
%%time
# 6.5 MINUTES
print(datetime.datetime.now())

# Downcast float columns to the smallest integer dtype that can hold them.
# Columns with fractional values cannot be represented as integers and are
# returned unchanged (the recorded output shows 81 columns staying float64).
pbar = tqdm(df.columns)
for c in pbar:
    pbar.set_description("Processing %s" % c)
    if pd.api.types.is_float_dtype(df[c]):
        # The input is already numeric, so no parse error can occur and the
        # deprecated errors='ignore' flag is not needed.
        df[c] = pd.to_numeric(df[c], downcast='integer')


# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round((df.memory_usage().sum() / 1024**3), 2), '\n')
print(df.dtypes.value_counts())

HBox(children=(FloatProgress(value=0.0, max=433.0), HTML(value='')))


1.37 

int8       228
int16       90
float64     81
object      31
int64        3
dtype: int64
CPU times: user 3min 35s, sys: 2min 49s, total: 6min 25s
Wall time: 6min 25s


In [22]:
%%time
# 4 SECONDS
# Checkpoint the frame after the float-to-int downcast to 'flt_2_int.ftr'.
# reset_index() moves the index into a regular column before writing; the
# reload cell restores it with set_index('TransactionID').
df.reset_index().to_feather('flt_2_int.ftr')

CPU times: user 2.46 s, sys: 1.44 s, total: 3.91 s
Wall time: 3.61 s


In [14]:
%%time
# 2 SECONDS

# Reload the float-to-int checkpoint and restore TransactionID as the index.
df = pd.read_feather('flt_2_int.ftr')
df = df.set_index('TransactionID')

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

(1097231, 433)

1.37 

int8       228
int16       90
float64     81
object      31
int64        3
dtype: int64 

CPU times: user 1.94 s, sys: 1.05 s, total: 2.99 s
Wall time: 2.31 s


# MINIFY

In [15]:
%%time
# 15 seconds

# Shrink dtypes via the project helper. The return value is not assigned; the
# recorded summary shows df's dtypes changing afterwards, so the helper
# presumably modifies df in place (confirm in data_prep.minify).
data_prep.minify(df)

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

Mem. usage decreased to  1.02 GB (25.5% reduction)


(1097231, 433)

1.02 

int8       229
int16       91
float32     81
object      31
int32        1
dtype: int64 

CPU times: user 10.8 s, sys: 8.18 s, total: 19 s
Wall time: 19 s


In [16]:
%%time
# 4 SECONDS
# Checkpoint the minified frame to 'minified.ftr'. reset_index() moves the
# index into a regular column before writing; the reload cell restores it with
# set_index('TransactionID').
df.reset_index().to_feather('minified.ftr')

CPU times: user 2.54 s, sys: 1.6 s, total: 4.14 s
Wall time: 3.78 s


In [17]:
%%time
# 2 SECONDS

# Reload the minified checkpoint and restore TransactionID as the index.
df = pd.read_feather('minified.ftr')
df = df.set_index('TransactionID')

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

(1097231, 433)

1.02 

int8       229
int16       91
float32     81
object      31
int32        1
dtype: int64 

CPU times: user 1.69 s, sys: 982 ms, total: 2.67 s
Wall time: 2.23 s


# ENCODING  

In [18]:
%%time
# 70 SECONDS
print(datetime.datetime.now())

# Encode every object column:
#   * at most ai_config.MAX_N_OHE distinct values -> one-hot (first level dropped)
#   * otherwise                                   -> integer label encoding, in place
# One-hot blocks are collected and attached with a single concat after the
# loop. Concatenating inside the loop recopies the whole frame once per
# encoded column. The resulting column order is unchanged: remaining columns
# first, then the one-hot blocks in processing order.
dummy_frames = []
ohe_columns = []

pbar = tqdm(df.columns)

for c in pbar:

    pbar.set_description('encoding %s' % c)

    if df[c].dtype != 'object':
        continue

    if df[c].nunique() <= ai_config.MAX_N_OHE:
        pbar.set_description('OHE %s' % c)
        dummy_frames.append(pd.get_dummies(df[c], drop_first=True, prefix=c))
        ohe_columns.append(c)
    else:
        pbar.set_description('LE %s' % c)
        df[c] = preprocessing.LabelEncoder().fit_transform(df[c])

if ohe_columns:
    # Replace the source columns with their one-hot blocks in one pass.
    df = pd.concat([df.drop(columns=ohe_columns)] + dummy_frames, axis=1)


# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round((df.memory_usage().sum() / 1024**3), 2), '\n')
print(df.dtypes.value_counts(), '\n')

2020-02-09 10:51:28.186487


HBox(children=(FloatProgress(value=0.0, max=433.0), HTML(value='')))




(1097231, 445)

0.85 

int8       229
int16       91
float32     81
uint8       37
int64        6
int32        1
dtype: int64 

CPU times: user 41.7 s, sys: 25 s, total: 1min 6s
Wall time: 1min 6s


In [19]:
%%time
# ~1 SECOND (recorded wall time: 1.22 s)
# Checkpoint the encoded frame to 'encoded.ftr'. reset_index() moves the index
# into a regular column before writing; the reload cell restores it with
# set_index('TransactionID').
df.reset_index().to_feather('encoded.ftr')

CPU times: user 991 ms, sys: 983 ms, total: 1.97 s
Wall time: 1.22 s


In [54]:
%%time
# 2 SECONDS

# Reload the encoded checkpoint and restore TransactionID as the index.
df = pd.read_feather('encoded.ftr')
df = df.set_index('TransactionID')

# Summary: (rows, cols), memory footprint in GiB, and a dtype histogram.
display(df.shape)
print(round(df.memory_usage().sum() / 2**30, 2), '\n')
print(df.dtypes.value_counts(), '\n')

(1097231, 445)

0.85 

int8       229
int16       91
float32     81
uint8       37
int64        6
int32        1
dtype: int64 

CPU times: user 957 ms, sys: 1.44 s, total: 2.39 s
Wall time: 738 ms


# FOLDS

In [55]:
%%time

# Split the encoded frame into shuffled K-fold train/test partitions and write
# each partition to its own feather file (train_<i>.ftr / test_<i>.ftr).
from sklearn.model_selection import KFold

N_FOLDS = 10
# Fixed seed: with shuffle=True and no random_state the fold membership would
# differ on every run, so the files on disk could not be regenerated.
FOLD_SEED = 42

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=FOLD_SEED)

for i, (train, test) in enumerate(kf.split(df)):
    # kf.split yields positional row indices, hence iloc. reset_index() moves
    # the index into a regular column before writing.
    df.iloc[train].reset_index().to_feather('train_' + str(i) + '.ftr')
    df.iloc[test].reset_index().to_feather('test_' + str(i) + '.ftr')

In [56]:
# List the working directory to confirm the train_*/test_* fold files were written.
!ls

0_Data_Prep.ipynb             test_5.ftr
0_Docker_PostgreSQL.ipynb     test_6.ftr
1_AutoML_Simple_Example.ipynb test_7.ftr
2_Random_Forest_Example.ipynb test_8.ftr
3_Neural_Networks.ipynb       test_9.ftr
4_NN_Optimization.ipynb       train_0.ftr
5_Mac_GPUs_2.ipynb            train_1.ftr
DATA_PREP.ipynb               train_2.ftr
catboost_example.ipynb        train_3.ftr
encoded.ftr                   train_4.ftr
imgs                          train_5.ftr
test_0.ftr                    train_6.ftr
test_1.ftr                    train_7.ftr
test_2.ftr                    train_8.ftr
test_3.ftr                    train_9.ftr
test_4.ftr


In [59]:
# Read the ten test-fold files back, restore TransactionID as the index on
# each, and stack them row-wise into a single frame.
trs = [
    pd.read_feather('test_' + str(fold) + '.ftr').set_index('TransactionID')
    for fold in tqdm(range(10))
]
all_trs = pd.concat(trs, axis=0)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


