In [70]:
import numpy as np # linear algebra
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import GlobalMaxPool1D
from keras.layers import Bidirectional

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [71]:
train_df = pd.read_csv('./preprocessing/train.csv')
test_df = pd.read_csv('./preprocessing/test.csv')

In [72]:
train_text = train_df.text
test_text = test_df.text

In [73]:
max_features = 5000
tokens = Tokenizer(max_features)
tokens.fit_on_texts(train_text)
index_train = tokens.texts_to_sequences(train_text)
index_test = tokens.texts_to_sequences(test_text)

In [74]:
max_len = 200
tr_x = pad_sequences(index_train,maxlen = 200)
te_x = pad_sequences(index_test,maxlen = 200)

In [75]:
model = Sequential()
model.add(Embedding(input_dim = 5000,output_dim = 256,input_length = 200))
model.add(Bidirectional(LSTM(256,return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(150,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2,activation='sigmoid'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 200, 256)          1280000   
                                                                 
 bidirectional_6 (Bidirectio  (None, 200, 512)         1050624   
 nal)                                                            
                                                                 
 global_max_pooling1d_6 (Glo  (None, 512)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_12 (Dense)            (None, 150)               76950     
                                                                 
 dropout_6 (Dropout)         (None, 150)               0         
                                                                 
 dense_13 (Dense)            (None, 2)                

In [76]:
import tensorflow.keras.optimizers as optimizers

model.compile(loss= 'binary_crossentropy',  optimizer=optimizers.Adam(learning_rate=.0001), metrics=[ 'accuracy' ])

In [77]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
callbacks = [
    EarlyStopping(patience=3, verbose=1),
    ReduceLROnPlateau(factor=0.25, patience=2, min_lr=0.00001, verbose=1),
    # ModelCheckpoint('model_lstm.h5', verbose=1, save_best_only=True, save_weights_only=True)
]

In [78]:
from keras.utils import to_categorical

X = tr_x
y = train_df.target
y = to_categorical(y)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [79]:
model.fit(X_train, y_train, epochs=10, callbacks=callbacks, validation_data=(X_val,y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 7/10
Epoch 7: early stopping


<keras.callbacks.History at 0x2785e7b33d0>

In [92]:
predictions = model.predict(te_x)
y_pred = predictions.argmax(axis = 1)



In [101]:
from sklearn.metrics import accuracy_score

ans = pd.read_csv('./dataset/ans.csv')['target'].values
accuracy_score(y_pred= y_pred, y_true= ans)

0.7704566349984676