<a href="https://colab.research.google.com/github/tadiwamark/CourseSentimentOracle/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Constants
MAX_LEN = 100
EMBEDDING_DIM = 16
TRUNC_TYPE = 'post'
PADDING_TYPE = 'post'
OOV_TOKEN = "<OOV>"
TRAINING_SIZE = int(40000*0.8)

In [4]:
# Load Data
url = "https://raw.githubusercontent.com/tadiwamark/CourseSentimentOracle/main/reviews.csv"
df = pd.read_csv(url)

In [5]:
df

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4
...,...,...,...
107013,107013,Trendy topic with talks from expertises in the...,4
107014,107014,"Wonderful! Simple and clear language, good ins...",5
107015,107015,an interesting and fun course. thanks. dr quincy,5
107016,107016,"very broad perspective, up to date information...",4


In [6]:
reviews = df['Review']
labels = df['Label']

In [7]:
labels = labels - 1

# One-hot encode
labels_encoded = to_categorical(labels)

In [8]:
# Tokenization
tokenizer = Tokenizer(oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(reviews)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

In [11]:
with open('/content/drive/My Drive/Colab Notebooks/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

In [None]:
# Data Split
training_sequences = np.array(padded_sequences[:TRAINING_SIZE])
training_labels = np.array(labels[:TRAINING_SIZE])
validation_sequences = np.array(padded_sequences[TRAINING_SIZE:])
validation_labels = np.array(labels[TRAINING_SIZE:])

In [None]:
# Model Architecture
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=padded_sequences.shape[1]))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation='softmax'))  # 5 neurons for 5 ratings (1 to 5)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Checkpoint to save the best model
checkpoint_path = "/content/drive/My Drive/Colab Notebooks/best_model.h5"
checkpoint = ModelCheckpoint(filepath=checkpoint_path, save_best_only=True, save_weights_only=False, verbose=1)

In [None]:
# Training the model
model.fit(padded_sequences, labels_encoded, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b54e83b98a0>

In [None]:
# Saving the final version of the model after all epochs
model.save("/content/drive/My Drive/Colab Notebooks/final_model.h5")

  saving_api.save_model(


In [None]:
loss, accuracy = model.evaluate(padded_sequences, labels_encoded)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 87.31%
