<a href="https://colab.research.google.com/github/xhavien/CCDEPLRL_EXERCISES_COM222ML/blob/main/Exercise6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 6

In [198]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [199]:
import numpy as np
import pandas as pd

path = "https://github.com/robitussin/CCDEPLRL_EXERCISES/blob/9b8ac1c5683abecc144f0af47eb7cda0688e12b7/dataset/reviews.json?raw=true"

dataset = pd.read_json(path)

In [200]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5


## 1. Tokenize the data

In [201]:
# Extract the review sentences from the dataset
training_sentences = dataset['review'].tolist()  # Get all reviews as a list

# Set tokenizer parameters
vocab_size = 1000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = ""

# Create and fit the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

## 2. Sequence the data

In [202]:
# Convert the text to sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences)

## 3. Pad the data

In [203]:
# Pad the sequences
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

# Convert to numpy arrays
training_padded = np.array(training_padded)

# Prepare the labels (ratings)
training_labels = np.array(dataset['rating'])

## 4. Train a sentiment model

In [204]:
# Build a basic sentiment network
# Note the embedding layer is first,
# and the output is only 1 node as it is either 0 or 1 (negative or positive)
# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation='softmax')  # 3 outputs: negative, neutral, positive
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Convert ratings to sentiment categories (0=negative, 1=neutral, 2=positive)
# Assuming ratings 1-2 = negative, 3 = neutral, 4-5 = positive
sentiment_labels = []
for rating in training_labels:
    if rating <= 2:
        sentiment_labels.append(0)  # Negative
    elif rating == 3:
        sentiment_labels.append(1)  # Neutral
    else:
        sentiment_labels.append(2)  # Positive

sentiment_labels = np.array(sentiment_labels)

# Train the model
num_epochs = 30
history = model.fit(training_padded, sentiment_labels,
                    epochs=num_epochs,
                    validation_split=0.2,
                    verbose=1)

Epoch 1/30




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.5144 - loss: 1.0594 - val_accuracy: 0.2139 - val_loss: 1.1364
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.5573 - loss: 0.9588 - val_accuracy: 0.2139 - val_loss: 1.1064
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.5741 - loss: 0.9277 - val_accuracy: 0.2139 - val_loss: 1.1156
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.5726 - loss: 0.8825 - val_accuracy: 0.6667 - val_loss: 0.9107
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.7308 - loss: 0.6990 - val_accuracy: 0.6816 - val_loss: 0.8953
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.7883 - loss: 0.5920 - val_accuracy: 0.6468 - val_loss: 1.0553
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━

## Get files for visualing the network

In [205]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(1000, 16)


In [206]:
import io

# Write out the embedding vectors and metadata

# Create the reverse word index
reverse_word_index = dict([(value, key) for (key, value) in tokenizer.word_index.items()])

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [207]:
# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 5. Predict sentiment with new reviews

In [208]:
# Test with the fake reviews from the screenshots
fake_reviews = [
    'sakto lang',
    'i hate you',
    'napakahigh quality ng product nito',
    'solid ng customer service',
    'Pinaka masarap na natikman ko',
    'Hoy, bibirahiin ko gumawa neto'
]

# Tokenize the fake reviews
fake_sequences = tokenizer.texts_to_sequences(fake_reviews)

# Pad the fake review sequences
fake_padded = pad_sequences(fake_sequences, maxlen=max_length,
                           padding=padding_type, truncating=trunc_type)

# Make predictions on the fake reviews
predictions = model.predict(fake_padded)

# Process the predictions into a list of dictionaries for printing
prediction_results = []
sentiment_labels_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

for i, review in enumerate(fake_reviews):
    # Get the sentiment scores for the current review
    scores = predictions[i]
    # Find the index of the highest score (predicted sentiment)
    predicted_sentiment_index = np.argmax(scores)
    # Get the corresponding sentiment label
    predicted_sentiment = sentiment_labels_map[predicted_sentiment_index]

    prediction_results.append({
        'review': review,
        'sentiment': predicted_sentiment,
        'scores': {
            'negative': scores[0],
            'neutral': scores[1],
            'positive': scores[2]
        }
    })


# Print the results in a nice format
print("\n===== SENTIMENT ANALYSIS RESULTS =====\n")
for result in prediction_results:
    print(f"Review: {result['review']}")
    print(f"Sentiment: {result['sentiment']}")
    print(f"Scores: Negative: {result['scores']['negative']:.4f}, Neutral: {result['scores']['neutral']:.4f}, Positive: {result['scores']['positive']:.4f}")
    print("-" * 50)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487ms/step

===== SENTIMENT ANALYSIS RESULTS =====

Review: sakto lang
Sentiment: Neutral
Scores: Negative: 0.2615, Neutral: 0.6307, Positive: 0.1078
--------------------------------------------------
Review: i hate you
Sentiment: Positive
Scores: Negative: 0.0821, Neutral: 0.2634, Positive: 0.6544
--------------------------------------------------
Review: napakahigh quality ng product nito
Sentiment: Positive
Scores: Negative: 0.0075, Neutral: 0.0442, Positive: 0.9484
--------------------------------------------------
Review: solid ng customer service
Sentiment: Neutral
Scores: Negative: 0.1883, Neutral: 0.5327, Positive: 0.2790
--------------------------------------------------
Review: Pinaka masarap na natikman ko
Sentiment: Positive
Scores: Negative: 0.0095, Neutral: 0.0454, Positive: 0.9451
--------------------------------------------------
Review: Hoy, bibirahiin ko gumawa neto
Sentiment: Negative
Scores: Negative: 0.6