In [24]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType
from pyspark.ml.feature import StringIndexer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding

In [4]:
conf = SparkConf()
sc = SparkContext(conf=conf)
spark = SparkSession(sc)


24/08/03 19:40:54 WARN Utils: Your hostname, thenn-5570 resolves to a loopback address: 127.0.1.1; using 192.168.1.121 instead (on interface wlp3s0)
24/08/03 19:40:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/03 19:40:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Reading the Data
rtData = spark.read.option("header","true").option("inferSchema","true").csv("rt_reviews.csv").toDF("label","text").select("text","label").limit(10000)

# Modifying the Labels
rtData = rtData.withColumn('label', F.when(col("label") == "fresh", 1).otherwise(col("label")))
rtData = rtData.withColumn('label', F.when(col("label") == "rotten", -1).otherwise(col("label")))

# Tokenization
tokenizer = Tokenizer(inputCol="text", outputCol="words")
rtWordsData = tokenizer.transform(rtData)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
rtFilteredData = remover.transform(rtWordsData).select("label", "filtered")
rtFilteredData = spark.createDataFrame(rtFilteredData.rdd.map(lambda x: (x[0], [word for word in x[1] if len(word) > 0])).collect(), ['label','filtered'])

# Run StringIndexer on all the words in the data
allWords = spark.createDataFrame(rtFilteredData.select("filtered").rdd.flatMap(lambda x: x[0]), StringType())
indexer = StringIndexer(inputCol = "value", outputCol="categoryIndex")
indexed = indexer.fit(allWords).transform(allWords)

# Create a dictionary with the word as key and the index as value
index_dict = dict(indexed.distinct().rdd.map(lambda x: (x["value"], x["categoryIndex"])).collect())

# Convert the columns with list of words to list of idexes, using the index_dict
rtFinalData = spark.createDataFrame(rtFilteredData.rdd.map(lambda x: (x[0], x[1], [index_dict[word] for word in x[1] if len(word) > 0])).collect(), ['label','filtered', 'filteredIndex'])
rtFinalData = rtFinalData.select("label", "filteredIndex")

24/08/03 19:42:38 WARN DAGScheduler: Broadcasting large task binary with size 1118.7 KiB
24/08/03 19:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1124.6 KiB
                                                                                

In [6]:
# Split the data into training and testing sets
(rtTrainingData, rtValidationData, rtTestingData) = rtFinalData.randomSplit([0.7, 0.1, 0.2], seed=1234)

rt_train_label = rtTrainingData.select("label")
rt_train_data = rtTrainingData.select("filteredIndex")

rt_valid_label = rtValidationData.select("label")
rt_valid_data = rtValidationData.select("filteredIndex")

rt_test_label = rtTestingData.select("label")
rt_test_data = rtTestingData.select("filteredIndex")

In [7]:
# Convert the DataFrames to lists of arrays
rt_train_data = rt_train_data.rdd.flatMap(lambda x: np.array(x)).collect()
rt_train_label = np.array(rt_train_label.rdd.flatMap(lambda x: x).collect())

rt_valid_data = rt_valid_data.rdd.flatMap(lambda x: np.array(x)).collect()
rt_valid_label = np.array(rt_valid_label.rdd.flatMap(lambda x: x).collect())

rt_test_data = rt_test_data.rdd.flatMap(lambda x: np.array(x)).collect()
rt_test_label = np.array(rt_test_label.rdd.flatMap(lambda x: x).collect())

                                                                                

In [8]:
# Pad the data, so each column is of the same length
rt_train_data = sequence.pad_sequences(rt_train_data, maxlen = 40)
rt_valid_data = sequence.pad_sequences(rt_valid_data, maxlen = 40)
rt_test_data = sequence.pad_sequences(rt_test_data, maxlen = 40)

In [11]:
rnn_model1 = Sequential()
rnn_model1.add(Embedding(50000, 64))
rnn_model1.add(SimpleRNN(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
rnn_model1.add(Dense(1, activation = 'sigmoid'))

rnn_model1.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics=['accuracy'])

history_rnn1 = rnn_model1.fit(rt_train_data, rt_train_label.astype(int), batch_size = 32, epochs = 10, validation_data = (rt_valid_data, rt_valid_label.astype(int)))

Epoch 1/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 84ms/step - accuracy: 0.0535 - loss: 0.3580 - val_accuracy: 0.0000e+00 - val_loss: 0.1486
Epoch 2/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 73ms/step - accuracy: 4.5551e-04 - loss: 0.0044 - val_accuracy: 0.0000e+00 - val_loss: 0.1363
Epoch 3/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 68ms/step - accuracy: 0.0123 - loss: -0.1182 - val_accuracy: 0.0051 - val_loss: 0.0656
Epoch 4/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 70ms/step - accuracy: 0.0871 - loss: -1.1464 - val_accuracy: 0.0213 - val_loss: -0.0536
Epoch 5/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 70ms/step - accuracy: 0.1925 - loss: -3.0635 - val_accuracy: 0.0711 - val_loss: -0.0814
Epoch 6/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 72ms/step - accuracy: 0.2898 - loss: -6.0063 - val_accuracy: 0.0863 - val_loss: -0.03

In [15]:
rnn_model1.evaluate(rt_test_data, rt_test_label.astype(int), verbose=0)

[-0.8564020395278931, 0.14098690450191498]

In [17]:
rnn_model2 = Sequential(name="Simple_RNN")
rnn_model2.add(Embedding(50000, 64))
rnn_model2.add(SimpleRNN(128, activation='tanh', return_sequences=True))
rnn_model2.add(SimpleRNN(64, activation='tanh', return_sequences=False))
rnn_model2.add(Dense(1, activation='sigmoid'))

rnn_model2.compile(loss="binary_crossentropy",optimizer='adam',metrics=['accuracy'])

history_rnn2 = rnn_model2.fit(rt_train_data, rt_train_label.astype(int), batch_size=64, epochs=10, verbose=1, validation_data = (rt_valid_data, rt_valid_label.astype(int)))

Epoch 1/10
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 140ms/step - accuracy: 0.0142 - loss: 0.1109 - val_accuracy: 0.0000e+00 - val_loss: 0.1721
Epoch 2/10
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 132ms/step - accuracy: 0.0419 - loss: -0.6192 - val_accuracy: 0.0284 - val_loss: 0.2305
Epoch 3/10
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 136ms/step - accuracy: 0.4360 - loss: -8.4634 - val_accuracy: 0.0985 - val_loss: 0.8879
Epoch 4/10
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 131ms/step - accuracy: 0.4792 - loss: -15.7845 - val_accuracy: 0.1442 - val_loss: 1.8224
Epoch 5/10
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 126ms/step - accuracy: 0.4939 - loss: -20.4512 - val_accuracy: 0.1706 - val_loss: 3.4088
Epoch 6/10
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 108ms/step - accuracy: 0.4955 - loss: -24.7023 - val_accuracy: 0.1706 - val_loss: 4.248

In [None]:
rnn_model2.evaluate(rt_test_data, rt_test_label.astype(int), verbose=0)

[8.237541198730469, 0.21802617609500885]

In [21]:
rnn_model2.save("./model_checkpoint/rnn_model2.keras")

In [25]:
reconstructed_model = tf.keras.models.load_model("./model_checkpoint/rnn_model2.keras")

In [26]:
reconstructed_model.evaluate(rt_test_data, rt_test_label.astype(int), verbose=0)

[10.195833206176758, 0.22809667885303497]

In [27]:
rnn_model3 = Sequential(name="Simple_RNN")
rnn_model3.add(Embedding(1000, 64))
rnn_model3.add(SimpleRNN(128, activation='tanh', return_sequences=True))
rnn_model3.add(SimpleRNN(64, activation='tanh', return_sequences=False))
rnn_model3.add(Dense(1, activation='sigmoid'))

rnn_model3.compile(loss="binary_crossentropy",optimizer='adam',metrics=['accuracy'])

history_rnn3 = rnn_model3.fit(rt_train_data, rt_train_label.astype(int), batch_size=64, epochs=10, verbose=1, validation_data = (rt_valid_data, rt_valid_label.astype(int)))

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node Simple_RNN_1/embedding_5_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/asyncio/base_events.py", line 641, in run_forever

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/asyncio/base_events.py", line 1987, in _run_once

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_160203/1808241564.py", line 9, in <module>

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 318, in fit

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 51, in train_step

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/layers/layer.py", line 882, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/models/sequential.py", line 209, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/models/functional.py", line 175, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/models/functional.py", line 556, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/layers/layer.py", line 882, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/layers/core/embedding.py", line 140, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/numpy.py", line 4875, in take

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/numpy.py", line 1951, in take

indices[60,34] = 2002 is not in [0, 1000)
	 [[{{node Simple_RNN_1/embedding_5_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_66420]

In [28]:
rnn_model4 = tf.keras.models.load_model("sentiment_analysis_model.h5")



In [29]:
rnn_model4.evaluate(rt_test_data, rt_test_label.astype(int), verbose=0)

2024-08-04 10:20:26.624127: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[29,36] = 6020 is not in [0, 5000)
	 [[{{node sequential_2_1/embedding_2_1/GatherV2}}]]


InvalidArgumentError: Graph execution error:

Detected at node sequential_2_1/embedding_2_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/asyncio/base_events.py", line 641, in run_forever

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/asyncio/base_events.py", line 1987, in _run_once

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_160203/4083094222.py", line 1, in <module>

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 429, in evaluate

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 165, in one_step_on_iterator

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 154, in one_step_on_data

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 82, in test_step

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/layers/layer.py", line 882, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/models/sequential.py", line 209, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/models/functional.py", line 175, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/models/functional.py", line 556, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/layers/layer.py", line 882, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/layers/core/embedding.py", line 140, in call

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/ops/numpy.py", line 4875, in take

  File "/home/thenn/anaconda3/envs/ytproject/lib/python3.12/site-packages/keras/src/backend/tensorflow/numpy.py", line 1951, in take

indices[29,36] = 6020 is not in [0, 5000)
	 [[{{node sequential_2_1/embedding_2_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_67635]