In [1]:
import os
# Move into the project folder so the relative 'Data/...' and 'Model/...'
# paths used by the later cells resolve.
# The target path is relative, so running this cell a second time (when the
# working directory is already the project folder) would raise
# FileNotFoundError; the guard keeps the cell safe to re-run.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm the
# drive is mounted before this cell runs.
if os.path.basename(os.getcwd()) != 'MLProject_20202':
    os.chdir('drive/My Drive/MLProject_20202')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from gensim.models.doc2vec import Doc2Vec
from tensorflow import keras

from utils import get_doc2vec_vectors, get_lstm_vectors

In [22]:
# Read the categorical feature table; it also carries the 'fraudulent' label column.
df_cat = pd.read_csv('Data/data_all_categorical.csv', encoding='utf-8')

# Labels stay a pandas Series: the later cells rely on its row index.
y = df_cat['fraudulent']

# Everything except the label is a feature; expose it as a NumPy array.
df_cat_no_target = df_cat.drop(columns='fraudulent')
categorical_vectors = df_cat_no_target.to_numpy()

In [34]:
# Fixed seed: the splits are written to CSV further down, and an unseeded
# split would assign rows to different files on every run of the notebook.
SPLIT_SEED = 42

# Only the labels are split here; the row index each label carries is what the
# later cells use to pick the matching feature rows.
# First 80/20 into (train + val) / test, then 80/20 into train / val.
# Stratifying on the label keeps the class ratio the same in every split.
# NOTE(review): stratify needs at least two rows of each class — confirm this
# holds for the 'fraudulent' column.
y_train_val, y_test = train_test_split(
    y, test_size=0.2, random_state=SPLIT_SEED, stratify=y
)
y_train, y_val = train_test_split(
    y_train_val, test_size=0.2, random_state=SPLIT_SEED, stratify=y_train_val
)

Embedding for LSTM/BiLSTM

In [24]:
# Read the stored text embeddings and keep the 'embedded_text' column as a list.
df = pd.read_json('Data/data_text_embedded.json')
embedded_text = df['embedded_text'].tolist()

In [25]:
# Load the saved LSTM model from its .h5 file.
lstm_model = keras.models.load_model('Model/LSTM/model_lstm_256_8.h5')

In [26]:
# Text features from the LSTM model. 8 and 256 are handed to get_lstm_vectors
# as-is (presumably they match the "256_8" in the model file name — TODO confirm).
context_vectors_lstm = get_lstm_vectors(lstm_model, 8, 256, embedded_text)

# Join text features and categorical features column-wise.
X_lstm = np.concatenate([context_vectors_lstm, categorical_vectors], axis=1)

In [27]:
# Load the saved BiLSTM model from its .h5 file.
bilstm_model = keras.models.load_model('Model/LSTM/model_bilstm_512_8.h5')

In [28]:
# Text features from the BiLSTM model. 8 and 512 are handed to get_lstm_vectors
# as-is (presumably they match the "512_8" in the model file name — TODO confirm).
context_vectors_bilstm = get_lstm_vectors(bilstm_model, 8, 512, embedded_text)

# Join text features and categorical features column-wise.
X_bilstm = np.concatenate([context_vectors_bilstm, categorical_vectors], axis=1)

Embedding for DM/DBOW

In [29]:
# Read the cleaned text table and keep the 'clean_text' column as a list;
# this list is the input for the Doc2Vec feature extraction below.
df_text = pd.read_csv('Data/data_text_clean.csv', encoding='utf-8')
clean_text_column = df_text['clean_text']
text_all = clean_text_column.tolist()

In [30]:
# Load the saved Doc2Vec model stored under the "pv_dm" file name.
dm_model = Doc2Vec.load('Model/Doc2Vec/doc2vec_pv_dm_100.model')

In [31]:
# Text features from the DM model. 100 is handed to get_doc2vec_vectors as-is
# (presumably it matches the "_100" in the model file name — TODO confirm).
context_vectors_dm = get_doc2vec_vectors(dm_model, 100, text_all)

# Join text features and categorical features column-wise.
X_dm = np.concatenate([context_vectors_dm, categorical_vectors], axis=1)

In [32]:
# Load the saved Doc2Vec model stored under the "pv_dbow" file name.
dbow_model = Doc2Vec.load('Model/Doc2Vec/doc2vec_pv_dbow_100.model')

In [33]:
# Text features from the DBOW model. 100 is handed to get_doc2vec_vectors as-is
# (presumably it matches the "_100" in the model file name — TODO confirm).
context_vectors_dbow = get_doc2vec_vectors(dbow_model, 100, text_all)

# Join text features and categorical features column-wise.
X_dbow = np.concatenate([context_vectors_dbow, categorical_vectors], axis=1)

Store data

In [41]:
def prepare_data(X, y_train, y_val, y_test, name):
    """Write the train/val/test tables for one feature matrix to CSV.

    Each output file contains the rows of ``X`` selected by one label split,
    with the labels appended as the last column.

    Args:
        X: 2-D array-like of features, wrapped in a ``pandas.DataFrame``.
        y_train, y_val, y_test: pandas Series of labels. Their index values
            are used as row *positions* into ``X`` (``iloc``), so they must
            come from a Series with a default 0..n-1 index whose rows line up
            with the rows of ``X``.
        name: tag inserted into the output file name ``data_{name}.csv``.

    Side effects:
        Writes ``Data/Train data/data_{name}.csv``,
        ``Data/Val data/data_{name}.csv`` and
        ``Data/Test data/data_{name}.csv`` relative to the current working
        directory, creating the three directories when they are missing.
    """
    df_X = pd.DataFrame(X)

    # Build every table before touching the disk, so a bad split raises
    # before any file has been written.
    tables = [
        (out_dir, pd.concat([df_X.iloc[labels.index.tolist()], labels], axis=1))
        for out_dir, labels in (
            ('Data/Train data', y_train),
            ('Data/Val data', y_val),
            ('Data/Test data', y_test),
        )
    ]

    for out_dir, table in tables:
        # to_csv does not create missing parent directories.
        os.makedirs(out_dir, exist_ok=True)
        table.to_csv(f'{out_dir}/data_{name}.csv', index=False)

In [42]:
# Write the train/val/test CSVs for each of the four feature matrices,
# using the same label split for all of them.
for features, tag in (
    (X_lstm, 'LSTM'),
    (X_bilstm, 'BiLSTM'),
    (X_dm, 'DM'),
    (X_dbow, 'DBOW'),
):
    prepare_data(features, y_train, y_val, y_test, tag)