In [86]:
import numpy as np # linear algebra
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

import matplotlib.pyplot as plt
import seaborn as sns

# run GPU
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[]

In [87]:
df = pd.read_csv('./preprocessing/train.csv')
# df_train['text'] = df_train['text'].fillna('na')

df_test = pd.read_csv('./preprocessing/test.csv')
# df_test['text'] = df_test['text'].fillna('na')

In [88]:
x_train,x_val,y_train,y_val=train_test_split(df['text'].values,df['target'].values,test_size=0.2,random_state=123)

In [89]:
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index
num_words = len(tokenizer.word_index) + 1

In [90]:
train_sequences = tokenizer.texts_to_sequences(x_train)
train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences,padding='post')

max_len = len(train_padded[0])

validation_sequences = tokenizer.texts_to_sequences(x_val)
validation_padded = tf.keras.preprocessing.sequence.pad_sequences(validation_sequences,padding='post',maxlen=max_len)

x_test = df_test['text']
test_sequences = tokenizer.texts_to_sequences(x_test)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(test_sequences,padding='post',maxlen=max_len)

In [91]:
model = tf.keras.Sequential([
tf.keras.layers.Embedding(num_words, 100,mask_zero=True),
#tf.keras.layers.LSTM(64),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,recurrent_dropout=0.1)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])

In [92]:
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])

In [93]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 100)         1541800   
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,634,601
Trainable params: 1,634,601
Non-trainable params: 0
_________________________________________________________________


In [94]:
num_epochs = 30

In [95]:
history = model.fit(train_padded, y_train, epochs=num_epochs,validation_data=(validation_padded, y_val), verbose=2)

Epoch 1/30
191/191 - 11s - loss: 0.6859 - accuracy: 0.5668 - val_loss: 0.6686 - val_accuracy: 0.6244 - 11s/epoch - 57ms/step
Epoch 2/30
191/191 - 6s - loss: 0.5341 - accuracy: 0.7741 - val_loss: 0.4944 - val_accuracy: 0.8004 - 6s/epoch - 32ms/step
Epoch 3/30
191/191 - 6s - loss: 0.3215 - accuracy: 0.8852 - val_loss: 0.4998 - val_accuracy: 0.7945 - 6s/epoch - 33ms/step
Epoch 4/30
191/191 - 7s - loss: 0.2216 - accuracy: 0.9305 - val_loss: 0.5442 - val_accuracy: 0.7945 - 7s/epoch - 35ms/step
Epoch 5/30
191/191 - 7s - loss: 0.1590 - accuracy: 0.9524 - val_loss: 0.6180 - val_accuracy: 0.7899 - 7s/epoch - 38ms/step
Epoch 6/30
191/191 - 7s - loss: 0.1188 - accuracy: 0.9645 - val_loss: 0.6839 - val_accuracy: 0.7840 - 7s/epoch - 35ms/step
Epoch 7/30
191/191 - 7s - loss: 0.0936 - accuracy: 0.9729 - val_loss: 0.7514 - val_accuracy: 0.7774 - 7s/epoch - 36ms/step
Epoch 8/30
191/191 - 7s - loss: 0.0781 - accuracy: 0.9767 - val_loss: 0.8094 - val_accuracy: 0.7708 - 7s/epoch - 36ms/step
Epoch 9/30
191

In [99]:
y_pred = model.predict(test_padded)
y_pred = list(map(lambda x: 1 if x >= 0.5 else 0, y_pred))



In [97]:
# predictions = model.predict(data_test)
# predictions = predictions.reshape(1,-1)
# predictions = predictions[0]
# predictions = list(map(lambda x: 1 if x>=0.5 else 0,predictions))

In [100]:
from sklearn.metrics import accuracy_score

ans = pd.read_csv('./dataset/ans.csv')['target'].values
# accuracy_score(y_pred= predictions, y_true= ans)
accuracy_score(y_pred= y_pred, y_true= ans)

0.7581979773214833