In [60]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import pickle

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, \
                                precision_score, f1_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
%matplotlib inline

### Loading Preprocessed Data

In [45]:
# Read the pickled dataset dictionary and unpack its train / validation /
# test splits into separate feature (X_*) and label (y_*) variables.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
dataset_path = os.path.join('pickles', 'dataset.pickle')
with open(dataset_path, 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

X_train, y_train = dataset['X_train'], dataset['y_train']
X_valid, y_valid = dataset['X_valid'], dataset['y_valid']
X_test, y_test = dataset['X_test'], dataset['y_test']

In [46]:
# Restore the pickled text encoder; later cells read its `vocab_size` and
# `subwords` attributes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
encoder_path = os.path.join('pickles', 'encoder.pickle')
with open(encoder_path, 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)

## Building the Model

In [50]:
# Length of the vector learned for each vocabulary entry.
embedding_dim = 16

# Binary classifier assembled layer by layer: token ids are embedded,
# averaged over the sequence axis, passed through one hidden layer, and
# squashed to a single probability by the sigmoid output unit.
model = keras.Sequential()
model.add(layers.Embedding(encoder.vocab_size, embedding_dim))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 131,249
Trainable params: 131,249
Non-trainable params: 0
_________________________________________________________________


In [51]:
# Configure training: Adam optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit), and accuracy reported per epoch.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Stop once validation loss has not improved for `patience` consecutive
# epochs. The patience must be smaller than the number of epochs passed to
# fit(); with patience equal to epochs (10 and 10 previously) the callback
# could never trigger, because the first epoch always counts as an
# improvement. restore_best_weights rolls the model back to the epoch with
# the lowest val_loss instead of keeping the last, possibly worse, weights.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Train on the training split, evaluating on the validation split after
# every epoch; `history` keeps the per-epoch metrics returned by fit().
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stop]
)

Train on 127000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [82]:
# Score the test inputs with the trained model. The model ends in a single
# sigmoid unit, so each row of the result is one value in [0, 1]; the next
# cell flattens and thresholds these values.
y_pred = model.predict(X_test)

In [83]:
# Flatten the predictions to one dimension, then binarise them in place
# at a 0.5 cut-off: scores of 0.5 or more become 1, lower scores become 0.
y_pred = y_pred.ravel()
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0

In [84]:
# Print each test-set classification metric, comparing the thresholded
# predictions against the true labels.
for label, metric in (('Accuracy:', accuracy_score),
                      ('Recall:', recall_score),
                      ('Precision:', precision_score),
                      ('F1 Score:', f1_score)):
    print(label, metric(y_test, y_pred))

Accuracy: 0.87065625
Recall: 0.8268977208513845
Precision: 0.9050302363936229
F1 Score: 0.8642015814167131


In [85]:
# Export the learned embedding matrix as two parallel TSV files: one line
# of tab-separated vector components per subword in `vecs.tsv`, and the
# matching subword text on the same line number in `meta.tsv`.
# The first layer of the model is the Embedding layer; its only weight
# array is the embedding matrix.
weights = model.layers[0].get_weights()[0]
vect_file = os.path.join('embeddings', 'vecs.tsv')
meta_file = os.path.join('embeddings', 'meta.tsv')

with open(vect_file, 'w') as out_vect, open(meta_file, 'w') as out_meta:
    for i, word in enumerate(encoder.subwords):
        # Index with the loop counter. The previous code used an undefined
        # name (`num`), which raises NameError on a fresh kernel.
        # The +1 offset skips row 0 of the matrix -- presumably the
        # encoder reserves id 0 for padding; verify against the encoder.
        vec = weights[i + 1]
        out_meta.write(word + '\n')
        out_vect.write('\t'.join(str(w) for w in vec) + '\n')