In [1]:
import numpy as np
from time import time

import tensorflow as tf

import tensorflow_hub as hub

In [2]:
from pyspark import SparkConf, SparkContext

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import functions as F

import pandas as pd

spark = SparkSession \
    .builder \
    .appName("tensorflow") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/hotel.clean") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/hotel.tensorflow") \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1') \
    .getOrCreate()

In [3]:
# get reviews dataframe
reviews = spark.read.format("mongo").load()
df = reviews.toPandas()
df.review.values

array([' The room was really completely sound proof We appreciated the bathroom with both tube and big shower cabin together with the super cozy bath ropes And if you like to listen to music properly don t forget to bring a little audio cable jack to jack The superior rooms are equipped with a Bose sound system but it doesn t have a bluethooth connection and you will need this cable ',
       ' Excellent location good standard of hotel Professional service',
       ' Location was reason for booking and was spot on Staff were all very pleasant Time was spent out and about as we were visiting friends and family so didn t use facilities or eat breakfast Room was clean and tidy with great black out curtains Check out swift and easy ',
       ..., ' Area around hotel was a little too quiet',
       ' It was horrible',
       ' Pool was small and not heated rooms are quite small '],
      dtype=object)

In [4]:
# pandas dataframe to tf dataset
dataset = tf.data.Dataset.from_tensor_slices((df["review"].values, df["sentiment"].values))

In [5]:
# split into train and test
DATASET_SIZE = len(df)

train_size = int(0.7 * DATASET_SIZE)
test_size = int(0.3 * DATASET_SIZE)

shuffled_dataset = dataset.shuffle(DATASET_SIZE)
train_data = dataset.take(train_size)
test_dataset = dataset.skip(train_size)
test_data = test_dataset.take(test_size)

In [6]:
# use transfer learning
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

In [7]:
# setup neural network
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [8]:
# loss function and optimizer
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [9]:
# record training time
t0 = time()

# train model using 20 epochs in mini-batches of 512 samples
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=20,
                    validation_data=test_data.batch(512),
                    verbose=1)

tt = time() - t0
print("It took {} seconds to train.".format(tt))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
It took 14.295791864395142 seconds to train.


In [10]:
# evaluate model
results = model.evaluate(test_data.batch(512), verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

19/19 - 0s - loss: 0.3441 - accuracy: 0.8938
loss: 0.344
accuracy: 0.894


In [11]:
# predict test set
prediction = model.predict_classes(test_data.batch(512))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [12]:
# get numpy array of reviews of test_data
test_data_values, test_data_labels = next(iter(test_data.batch(test_size)))

In [13]:
# evaluating our model along more metrics
predicted = prediction.flatten()
actual = test_data_labels
TP = tf.math.count_nonzero(predicted * actual).numpy()
TN = tf.math.count_nonzero((predicted - 1) * (actual - 1)).numpy()
FP = tf.math.count_nonzero(predicted * (actual - 1)).numpy()
FN = tf.math.count_nonzero((predicted - 1) * actual).numpy()

In [21]:
FN

0

In [14]:
# check to avoid divide-by-zero
if TP == 0:
    TP = 0.0001
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)
print("Precision: {} \nRecall: {} \n F1: {}".format(precision, recall, f1))

Precision: 9.699320106758477e-08 
Recall: 1.0 
 F1: 1.9398638331980927e-07


In [16]:
# removes 'b' char in front of every review
test_data_values = np.array([x.decode() for x in test_data_values.numpy()])

In [17]:
# create dataframe of reviews and sentiment to write to mongo
final_df = pd.DataFrame({"review": test_data_values, "sentiment": prediction[:, 0]})

In [18]:
# write predicted reviews to mongo
spark \
    .createDataFrame(final_df) \
    .write.format("mongo").mode("append").save()