# Sentiment Analysis using - RNN model

In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Read the JSON file into a list of JSON objects with the specified encoding
with open("yelp_academic_dataset_review.json", "r", encoding="utf-8") as json_file:
    data = [json.loads(line) for line in json_file]


In [3]:
# Create a DataFrame from the list of JSON objects
df = pd.DataFrame(data)
# Assuming you want to perform sentiment analysis based on the text data alone, create a binary sentiment label
df['sentiment'] = df['stars'].apply(lambda x: 1 if x > 3 else 0)


In [4]:
# Split the dataset into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)


In [5]:
# Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data)

In [6]:
# Convert text data to sequences
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)


In [7]:
# Pad sequences to a fixed length
max_sequence_length = 100  # you can adjust this based on your data
train_data_padded = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_data_padded = pad_sequences(test_sequences, maxlen=max_sequence_length)

In [8]:
# Initialize the RNN model
def init_model():
    model = Sequential()
    model.add(layers.Masking())
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', # compiling the model with rmsprop optimizer
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

# Initialize the model
model = init_model()

In [9]:
# Define a data generator
def data_generator(data, labels, batch_size):
    num_batches = len(data) // batch_size
    while True:
        for i in range(num_batches):
            batch_data = data[i * batch_size: (i + 1) * batch_size]
            batch_labels = labels[i * batch_size: (i + 1) * batch_size]
            yield batch_data, batch_labels


In [10]:
# Usage:
batch_size = 32
train_generator = data_generator(train_data_padded, train_labels, batch_size)

In [11]:
# Fitting the RNN model on padded data with early stopping
es = EarlyStopping(patience=5, restore_best_weights=True)
history = model.fit(train_generator, epochs=100, steps_per_epoch=len(train_data_padded)//batch_size, validation_split=0.3, callbacks=[es])

ValueError: `validation_split` is only supported for Tensors or NumPy arrays, found following types in the input: [<class 'generator'>]

In [None]:
# Save the trained model
model.save("sentiment_analysis_rnn_model.h5")

In [None]:
# Evaluate the model on test data
test_loss, test_acc = model.evaluate(test_data_padded, test_labels)
print(f"Test Accuracy: {test_acc}")