In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from tensorflow.keras import metrics
from tensorflow import keras
import os
import tensorflow as tf
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import pickle as pkl
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional, Dropout, concatenate, SpatialDropout1D, GlobalMaxPooling1D, Reshape, MaxPooling1D, Flatten, Conv1D
from tensorflow.keras.models import Model
from tensorflow.keras import Input

df = pd.read_csv("Train/cleaned_bureau.csv")
tdf = pd.read_csv("Test/cleaned_bureau.csv")
train_df = pd.read_csv("Train/cleaned_train.csv")
test_df = pd.read_csv("Test/cleaned_train.csv")
df.head()

Unnamed: 0,ID,SELF-INDICATOR,MATCH-TYPE,ACCT-TYPE,CONTRIBUTOR-TYPE,DATE-REPORTED,OWNERSHIP-IND,ACCOUNT-STATUS,DISBURSED-DT,CLOSE-DT,...,OVERDUE-AMT,WRITE-OFF-AMT,ASSET_CLASS,REPORTED DATE - HIST,DPD - HIST,CUR BAL - HIST,AMT OVERDUE - HIST,AMT PAID - HIST,TENURE,INSTALLMENT-TYPE
0,1,False,PRIMARY,Overdraft,NAB,2018-04-30,Individual,Delinquent,2015-10-05,,...,37873.0,0.0,Standard,2018043020180331,030000,3787312820,"37873,,",",,",,
1,1,False,PRIMARY,Auto Loan (Personal),NAB,2019-12-31,Individual,Active,2018-03-19,,...,,0.0,Standard,"20191231,20191130,20191031,20190930,20190831,2...",0000000000000000000000000000000000000000000000...,"20797,21988,23174,24341,25504,26648,27780,2891...",",,,,,,,,,,,,,,,,,,,,1452,,",",,,,,,,,,,,,,,,,,,,,,,",36.0,Monthly
2,1,True,PRIMARY,Tractor Loan,NBF,2020-01-31,Individual,Active,2019-08-30,,...,0.0,0.0,,"20200131,20191231,20191130,20191031,20190930,2...",000000000000000000,116087116087145000145000145000145000,000000,",,,,,,",,
3,1,True,PRIMARY,Auto Loan (Personal),NBF,2017-09-30,Individual,Closed,2013-09-27,2017-09-21,...,0.0,0.0,,"20170930,20170801,20170731,20170630,20170531,2...",000DDD0270260270260270240270270000320000000000...,"0,,15925,23754,31494,39147,46713,54194,61590,6...","0,,1014,1014,1014,1014,1014,1014,1014,983,0,92...",",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,",,
4,1,True,PRIMARY,Tractor Loan,NBF,2016-02-29,Individual,Closed,2012-02-10,2016-02-01,...,0.0,0.0,,"20160229,20160131,20151231,20151130,20151031,2...",0000000000000000000000000000000000000000000000...,"0,0,23658,23321,22989,46321,45662,45012,68030,...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,",,


In [2]:
df.ID.value_counts(sort=True, ascending=False)

71060     420
141732    165
51786     152
1167      138
97794     124
         ... 
107223      1
109270      1
116205      1
103121      1
9436        1
Name: ID, Length: 128655, dtype: int64

In [3]:
tdf.ID.value_counts(sort=True, ascending=False)

114111    93
114063    60
69953     59
26305     59
114125    56
          ..
78473      1
141962     1
62097      1
141994     1
98272      1
Name: ID, Length: 14745, dtype: int64

In [4]:
cat_cols = ['SELF-INDICATOR', 'MATCH-TYPE', 'ACCT-TYPE', 'CONTRIBUTOR-TYPE',
       'OWNERSHIP-IND', 'ACCOUNT-STATUS', 'INSTALLMENT-TYPE',
       'ASSET_CLASS', 'INSTALLMENT-FREQUENCY',
       'DPD - HIST']    #should not be here
date_cols = ['DATE-REPORTED',  'DISBURSED-DT', 'CLOSE-DT', 
             'LAST-PAYMENT-DATE']
reg_cols = ['CREDIT-LIMIT/SANC AMT', 'DISBURSED-AMT/HIGH CREDIT', 'INSTALLMENT-AMT', 'CURRENT-BAL',
        'OVERDUE-AMT', 'WRITE-OFF-AMT', 'TENURE'] # , 'DPD - HIST']
array_cols = ['REPORTED DATE - HIST', 'CUR BAL - HIST',
       'AMT OVERDUE - HIST', 'AMT PAID - HIST']

In [5]:
len(cat_cols) + len(date_cols) + len(reg_cols) + len(array_cols), df.shape

(25, (560844, 26))

In [6]:
assert all([df[i].dtype in ("bool" ,"object") for i in cat_cols])

In [7]:
assert all([df[i].dtype == "float64" for i in reg_cols])

In [8]:
df[date_cols].isnull().sum()

DATE-REPORTED          3683
DISBURSED-DT          32150
CLOSE-DT             251827
LAST-PAYMENT-DATE    319283
dtype: int64

In [9]:
for col in date_cols:
    df[col] = pd.to_datetime(df[col])
    tdf[col] = pd.to_datetime(tdf[col])
df[date_cols].isnull().sum()

DATE-REPORTED          3683
DISBURSED-DT          32150
CLOSE-DT             251827
LAST-PAYMENT-DATE    319283
dtype: int64

In [10]:
def get_length(amt):    
    if not pd.isnull(amt):
        return len(amt.split(","))
    else: 0
        
for col in array_cols:
    print("max train {}".format(df[col].apply(lambda x: get_length(x)).max()))
    
    print("max test {}".format(tdf[col].apply(lambda x: get_length(x)).max()))

max train 37.0
max test 37.0
max train 37.0
max test 37.0
max train 42.0
max test 37.0
max train 38.0
max test 38.0


In [11]:
max_window = 420
ts_feature_vector = 205
max_array_size = 42
label_encoder_dict = {}
num_cores = 4

def encode_reg_cols(x):
    if pd.isnull(x):
        return 0.
    return x

def encode_array_cols(x):
    if pd.isnull(x):
        return [0] * max_array_size
    else:
        ret = []
        for val in x.split(","):
            try:
                ret.append(float(val))
            except:
                ret.append(0.)
        while len(ret) < max_array_size:
            ret.append(0.)
        return ret


def encode_date_cols(x):
    if pd.isnull(x):
        return [-1, -1, -1, -1, -1]
    else:
        return [x.hour, x.minute, x.day, x.month, x.year]


def encode_cat_cols(x, col):

    if pd.isnull(x): x = str(x)
    return label_encoder_dict[col].transform([x])



In [12]:


for col in tqdm(cat_cols):
    if col not in label_encoder_dict:
        label_encoder_dict[col] = LabelEncoder()
    print(col)
    label_encoder_dict[col].fit(df[col].append(tdf[col]).fillna("nan"))
    

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

SELF-INDICATOR
MATCH-TYPE
ACCT-TYPE
CONTRIBUTOR-TYPE
OWNERSHIP-IND
ACCOUNT-STATUS
INSTALLMENT-TYPE
ASSET_CLASS
INSTALLMENT-FREQUENCY
DPD - HIST



In [13]:
temp_df = df[df.ID == 141732]

In [14]:
def encode_df_for_user(dframe):
    
    final = []
    for index, row in dframe.iterrows():
        ret = []
    
        # 10 * 1
        for col in cat_cols:
            ret.extend(encode_cat_cols(row[col], col))
        
        # 7 * 1 = 7
        for col in reg_cols:
            ret.append(encode_reg_cols(row[col]))
        
        # 5 * 4 = 20
        for col in date_cols:
            
            ret.extend(encode_date_cols(row[col]))
        
        
        # 4 * 42 = 168
        for col in array_cols:
            ret.extend(encode_array_cols(row[col]))
        
        assert len(ret) == ts_feature_vector
        final.append(ret)
    while len(final) < max_window:
        final.insert(0, [0.] * ts_feature_vector)
    assert len(final) == max_window
    return np.array(final)

In [15]:
encode_df_for_user(temp_df).shape

(420, 205)

In [16]:
epochs = 100
batch_size = 32

In [17]:
train_label_encoders = {}
target_encoder = LabelEncoder()



train_cat_cols = ['Frequency', 'InstlmentMode', 'LoanStatus', 'PaymentMode', 'BranchID', 'Area', 
            'ManufacturerID', 'SupplierID', 'SEX', 'City', 'State', 'ZiPCODE']
target_col = ['Top-up Month']
train_reg_cols = ['AmountFinance', 'DisbursalAmount', 'EMI', 'AssetID', 'MonthlyIncome', 'Tenure', 'AssetCost', 'LTV', 'AGE']
train_date_cols = ['DisbursalDate', 'MaturityDAte', 'AuthDate']

for col in train_date_cols:
    train_df[col] = pd.to_datetime(train_df[col], errors="coerce")
    test_df[col] = pd.to_datetime(test_df[col], errors="coerce")
    
for col in tqdm(train_cat_cols):
    if col not in train_label_encoders:
        train_label_encoders[col] = LabelEncoder()
    print(col)
    fill_val = -1 if train_df[col].dtype == "int64" else "nan"
    if col == target_col[0]:
        train_label_encoders[col].fit(train_df[col].fillna(fill_val))

    else: train_label_encoders[col].fit(train_df[col].append(test_df[col]).fillna(fill_val))

target_encoder.fit(train_df[target_col])

def train_encode_cat_cols(x, col, tpe):

    if pd.isnull(x): 
        if tpe == "object": x = str(x)
        elif x == "int64": x = 0
        else: assert False
        
    return train_label_encoders[col].transform([x])   

def encode_target(x):
    return target_encoder.transform(x)


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

Frequency
InstlmentMode
LoanStatus
PaymentMode
BranchID
Area
ManufacturerID
SupplierID
SEX
City
State
ZiPCODE



  return f(**kwargs)


In [18]:
train_max_len = 36
def generate_training_data(row):
    row = row[0]
    ret = []
    columns = ['ID', 'Frequency', 'InstlmentMode', 'LoanStatus', 'PaymentMode',
       'BranchID', 'Area', 'Tenure', 'AssetCost', 'AmountFinance',
       'DisbursalAmount', 'EMI', 'DisbursalDate', 'MaturityDAte', 'AuthDate',
       'AssetID', 'ManufacturerID', 'SupplierID', 'LTV', 'SEX', 'AGE',
       'MonthlyIncome', 'City', 'State', 'ZiPCODE']
    column_tpes = ['int64', 'object', 'object', 'object', 'object', 
                   'int64', 'object', 'int64', 'int64','float64', 
                   'float64', 'float64',  '<M8[ns]', '<M8[ns]', '<M8[ns]',
                   'int64', 'int64', 'int64', 'float64', 'object', 
                   'float64', 'float64', 'object', 'object', 'int64', 'object']
    
    for index in range(len(columns)):
        if columns[index] in train_cat_cols:
            
            ret.extend(train_encode_cat_cols(row[index], columns[index], column_tpes[index]))

        elif columns[index] in train_reg_cols:
            ret.append(encode_reg_cols(row[index]))

        elif columns[index] in train_date_cols:
            ret.extend(encode_date_cols(row[index]))
        else: pass
    return np.array(ret)
    
def generate_datasets_to_train(train_dframe, bureau_df, val_size=.2):
    ids = train_dframe["ID"].unique()
    np.random.shuffle(ids)
    sp = int((1. - val_size) * ids.shape[0])
    tr_ids, val_ids = ids[: sp], ids[sp:]
    X_br, X_val_br = [], []
    X, X_val = [], []
    y, y_val = [], []
    for i in tqdm(tr_ids):
        
#         X_br = Parallel(n_jobs=num_cores)(delayed(encode_df_for_user)(bureau_df[bureau_df.ID == i]))
#         X = Parallel(n_jobs=num_cores)(delayed(generate_training_data)(train_dframe[train_dframe.ID == i].to_numpy()))
        
        
        X_br.append(encode_df_for_user(bureau_df[bureau_df.ID == i]))
        X.append(generate_training_data(train_dframe[train_dframe.ID == i].to_numpy()))
        y.append(target_encoder.transform(train_dframe[train_dframe.ID == i][target_col].values))
        
    for i in tqdm(val_ids):
#         X_val_br = Parallel(n_jobs=num_cores)(delayed(encode_df_for_user)(bureau_df[bureau_df.ID == i]))
#         X_val = Parallel(n_jobs=num_cores)(delayed(generate_training_data)(train_dframe[train_dframe.ID == i].to_numpy()))
        
        
        X_val_br.append(encode_df_for_user(bureau_df[bureau_df.ID == i]))
        
        X_val.append(generate_training_data(train_dframe[train_dframe.ID == i].to_numpy()))
        y_val.append(target_encoder.transform(train_dframe[train_dframe.ID == i][target_col].values))
    
    
    return np.array(X), np.array(X_val), np.array(X_br), np.array(X_val_br), np.array(y), np.array(y_val)
    

    
    
    

In [19]:
z = generate_datasets_to_train(train_df.sample(100), df)

HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [20]:
z[0].shape, z[1].shape, z[2].shape, z[3].shape, z[4].shape, z[5].shape

((80, 36), (20, 36), (80, 420, 205), (20, 420, 205), (80, 1), (20, 1))

In [21]:
pkl.dump(z, open("data.pkl", "wb"))

In [22]:
# early_stop = tf.keras.callbacks.EarlyStopping(
#     monitor='val_loss', min_delta=1e-2, patience=5, verbose=0, mode='auto',
#     baseline=None, restore_best_weights=True)



# model = keras.Sequential()
# model.add(keras.layers.LSTM(64, kernel_initializer='he_uniform', batch_input_shape=(None, max_window, ts_feature_vector), return_sequences=True, name='encoder_1'))
# model.add(keras.layers.LSTM(32, kernel_initializer='he_uniform', return_sequences=True, name='encoder_2'))
# model.add(keras.layers.LSTM(16, kernel_initializer='he_uniform', return_sequences=False, name='encoder_3'))


# model.add(keras.layers.TimeDistributed(keras.layers.Dense(ts_feature_vector)))
# model.compile(loss="mse",optimizer='adam')
# print(model.summary())


# model.fit(x=z[0], y=z[0], validation_data=(z[1], z[1]), epochs=10, batch_size=batch_size, shuffle=True, callbacks=[early_stop])

In [23]:
train_input = Input

In [27]:
def get_model():
    train_in = Input(shape=(train_max_len, ))
    bureau_in = Input(shape=(max_window, ts_feature_vector))
    bureau_in = LSTM(128, kernel_initializer='he_uniform', return_sequences=True)(bureau_in)
    bureau_in = LSTM(64, kernel_initializer='he_uniform', return_sequences=True)(bureau_in)
    bureau_in = LSTM(36, kernel_initializer='he_uniform', return_sequences=True)(bureau_in)
    
    x = concatenate([train_in, bureau_in])
    x = SpatialDropout1D(0.6)(x)
    output = Dense(1, activation="sigmoid")(x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss='binary_crossentropy',
        metrics=["acc"],
    )
    print(model.summary())
    return model

model = get_model()


model.fit(x=[z[0], z[2]], y=z[4], validation_data=([z[1], z[3]], z[5]), epochs=10, batch_size=batch_size, shuffle=True, callbacks=[early_stop])

ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concat axis. Got inputs shapes: [(None, 36), (None, 420, 36)]