<a href="https://colab.research.google.com/github/thadduslee/Sentiment-Analysis-for-NUS-modules/blob/main/Sentiment_Analysis_for_NUS_modules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import tensorflow as tf
import pandas as pd
import numpy as np

### **Train sentiment analysis model**

In [28]:
data = pd.read_csv("/content/reviews.csv")
data = data.drop('Id', axis = 1)
vocab_size = 10000 #top 10k most frequent words
max_length = 1000 #max length of review, not the best for this dataset considering reviews are shorter but nusmods reviews or reddit reviews drag on to a few hundred words
embedding_dim = 64 #if we increase this, we better capture the nuance of every word
vectorize = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    standardize='lower_and_strip_punctuation', #clean text
    output_mode='int',
    output_sequence_length = max_length) #forces output to 1000

vectorize.adapt(data["Review"].astype(str)) #prevents crash if there are numbers in the dataset
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize,
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        mask_zero=True),
    tf.keras.layers.LSTM(128, activation = "tanh"),
    tf.keras.layers.Dense(1, activation = "relu")
])

model.compile(loss = "mean_squared_error", optimizer = "adam", metrics = ["mean_absolute_error"]) #i treat this as a regression problem not a multi-class classification

data['Label'] = data['Label'].astype(int)
model.fit(data['Review'].astype(str).values, data["Label"], epochs = 3) #too many epochs and model memorizes data

Epoch 1/3
[1m3345/3345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - loss: 1.2732 - mean_absolute_error: 0.6694
Epoch 2/3
[1m3345/3345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 11ms/step - loss: 0.3085 - mean_absolute_error: 0.3539
Epoch 3/3
[1m3345/3345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 11ms/step - loss: 0.2456 - mean_absolute_error: 0.3130


<keras.src.callbacks.history.History at 0x7bbbb68d8470>

## **Scrape NUSMods to get data for every module (save data to a csv and json file)**

In [2]:
import requests
import json
import csv
import time

# --- CONFIGURATION ---
DISQUS_API_KEY = "YOUR_DISQUS_PUBLIC_KEY_HERE"
FORUM_NAME = "nusmods-prod"
LIMIT = 100  # Max allowed by Disqus is 100

def get_all_items(endpoint, item_name):
    """
    Generic function to fetch all items (threads or posts) handling pagination.
    """
    url = f"https://disqus.com/api/3.0/{endpoint}"
    items = []
    cursor = None

    print(f"--- Starting download for: {item_name} ---")

    while True:
        params = {
            "api_key": DISQUS_API_KEY,
            "forum": FORUM_NAME,
            "limit": LIMIT,
            "order": "asc"
        }
        if cursor:
            params["cursor"] = cursor

        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        data = response.json()

        # Add new items to our list
        current_batch = data.get("response", [])
        items.extend(current_batch)
        print(f"Fetched {len(current_batch)} {item_name}. Total: {len(items)}")

        # Check pagination
        cursor_data = data.get("cursor", {})
        if cursor_data.get("hasNext"):
            cursor = cursor_data.get("next")
            # Be polite to the API (avoid rate limits)
            time.sleep(0.5)
        else:
            print(f"Finished fetching {item_name}.\n")
            break

    return items

def main():
    if DISQUS_API_KEY == "YOUR_DISQUS_PUBLIC_KEY_HERE":
        print("Please replace 'YOUR_DISQUS_PUBLIC_KEY_HERE' with your actual API key.")
        return

    # 1. Fetch ALL Threads (Courses) first
    # We need this to map the 'thread_id' in comments back to a Module Code (e.g., CS1010)
    threads = get_all_items("forums/listThreads.json", "Threads")

    # Create a lookup dictionary: { thread_id: module_title }
    # Thread titles in NUSMods are usually like "CS1010 Programming Methodology"
    thread_map = {t["id"]: t["title"] for t in threads}
    thread_link_map = {t["id"]: t["link"] for t in threads} # Store URL too if you want

    # 2. Fetch ALL Posts (Comments)
    posts = get_all_items("forums/listPosts.json", "Posts")

    # 3. Process and Clean the Data
    cleaned_data = []
    for post in posts:
        # Disqus comments include HTML tags (e.g., <p>text</p>).
        # You might want to strip these later, but we keep them raw for now.

        thread_id = post.get("thread")

        record = {
            "comment_id": post.get("id"),
            "module_title": thread_map.get(thread_id, "Unknown Module"),
            "module_link": thread_link_map.get(thread_id, ""),
            "author_name": post.get("author", {}).get("name"),
            "date": post.get("createdAt"),
            "message": post.get("message"), # The actual comment
            "likes": post.get("likes"),
            "dislikes": post.get("dislikes"),
            "is_approved": post.get("isApproved"),
            "parent_comment_id": post.get("parent") # If it's a reply to another comment
        }
        cleaned_data.append(record)

    # 4. Save to JSON
    with open("nusmods_reviews.json", "w", encoding="utf-8") as f:
        json.dump(cleaned_data, f, indent=4)
    print(f"Saved {len(cleaned_data)} reviews to nusmods_reviews.json")

    # 5. Save to CSV
    if cleaned_data:
        keys = cleaned_data[0].keys()
        with open("nusmods_reviews.csv", "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(cleaned_data)
        print(f"Saved {len(cleaned_data)} reviews to nusmods_reviews.csv")

if __name__ == "__main__":
    main()

--- Starting download for: Threads ---
Fetched 100 Threads. Total: 100
Fetched 100 Threads. Total: 200
Fetched 100 Threads. Total: 300
Fetched 100 Threads. Total: 400
Fetched 100 Threads. Total: 500
Fetched 100 Threads. Total: 600
Fetched 100 Threads. Total: 700
Fetched 100 Threads. Total: 800
Fetched 100 Threads. Total: 900
Fetched 100 Threads. Total: 1000
Fetched 100 Threads. Total: 1100
Fetched 100 Threads. Total: 1200
Fetched 100 Threads. Total: 1300
Fetched 100 Threads. Total: 1400
Fetched 100 Threads. Total: 1500
Fetched 100 Threads. Total: 1600
Fetched 100 Threads. Total: 1700
Fetched 100 Threads. Total: 1800
Fetched 100 Threads. Total: 1900
Fetched 100 Threads. Total: 2000
Fetched 100 Threads. Total: 2100
Fetched 100 Threads. Total: 2200
Fetched 100 Threads. Total: 2300
Fetched 100 Threads. Total: 2400
Fetched 100 Threads. Total: 2500
Fetched 100 Threads. Total: 2600
Fetched 100 Threads. Total: 2700
Fetched 100 Threads. Total: 2800
Fetched 100 Threads. Total: 2900
Fetched 100 T

### **Open and clean data from NUSMods**

In [9]:
nus_mods_data = pd.read_csv("/content/nusmods_reviews.csv")
new = []
for x in nus_mods_data["module_title"]:
  empty = x.index(' ') #remove the description of the course, include only the name
  new.append(x[:empty])
nus_mods_data["module_title"] = new
nus_mods_data

Unnamed: 0,comment_id,module_title,module_link,author_name,date,message,likes,dislikes,is_approved,parent_comment_id
0,1504826384,CS3216,http://nusmods.com/modules/CS3216/reviews,ahbengish,2014-06-29T23:43:01,<p>Awesomeness!</p>,2,0,True,
1,1504826420,CS1020E,http://nusmods.com/modules/CS1020E/reviews,Bhavesh .R,2014-07-06T17:08:51,<p>This looks interesting :). Anybody has any ...,0,0,True,
2,1504826421,CS1020E,http://nusmods.com/modules/CS1020E/reviews,Yangshun,2014-07-06T19:07:57,"<p>Hey Bhavesh, sure thing, you can find more ...",0,0,True,
3,1504826396,ACC1002X,http://nusmods.com/modules/ACC1002X/reviews,Toh Weiqing,2014-07-06T20:20:30,<p>Definitely a course to take for people who ...,1,0,True,
4,1504826381,CS3216,http://nusmods.com/modules/CS3216/reviews,Yangshun,2014-07-07T10:04:11,<p>For those who are thinking of taking CS3216...,3,0,True,
...,...,...,...,...,...,...,...,...,...,...
8819,6816294566,GET1029,http://nusmods.com/modules/GET1029/reviews,xzxzxz,2025-12-30T04:41:39,<p>AY25/26 sem1</p><p>My major: chemical engin...,0,0,True,
8820,6816318352,BT4014,https://nusmods.com/modules/BT4014/reviews,BT4014 Taker,2025-12-30T07:42:10,<p>i have</p>,0,0,True,6.805576e+09
8821,6816324926,CS3240,http://nusmods.com/modules/CS3240/reviews,user1230,2025-12-30T08:26:50,<p>Final Grade: A- got saved by individual des...,0,0,True,6.800773e+09
8822,6816325154,BT4212,https://nusmods.com/modules/BT4212/reviews,user1230,2025-12-30T08:28:04,<p>Taken in AY25/26 sem 1<br>Lecturer: Zhai Yi...,0,0,True,


### **Receive user input for module to conduct sentiment analysis on**

In [61]:
module = input("Enter the module you would like to hear about: ").upper()
data = nus_mods_data[nus_mods_data["module_title"] == module]
messages = data["message"]





Enter the module you would like to hear about: ma1521


### Helper function to clean **comments**

In [30]:
from bs4 import BeautifulSoup
import re

def clean_review_text(html_text):
    # 1. Parse HTML and separate tags with a space to prevent word merging
    soup = BeautifulSoup(html_text, "html.parser")
    text = soup.get_text(separator=' ')

    # 2. Remove Metadata/Noise (Specific to your dataset)
    # Remove "Module review by..." at the start
    text = re.sub(r'Module review by.*?:', '', text)
    # Remove "Taken in AY..." (Optional: keep if date matters, usually it doesn't for sentiment)
    text = re.sub(r'Taken in AY\d+/\d+ Sem \d+', '', text)
    # Remove "Module review also posted here" footer
    text = re.sub(r'Module review also posted here:.*', '', text)

    # 3. Remove URLs
    text = re.sub(r'http\S+', '', text)

    # 4. Remove Newlines and extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text



### **Clean all the comments and store them in a new list variable**

In [62]:
clean_messages = []
for message in messages:
  clean_messages.append(clean_review_text(message))

### **Conduct sentiment analysis on all text data obtained and return overall sentiment regarding module**

In [63]:
count = 0
total = 0
for clean_message in clean_messages:
  total += model.predict(tf.constant([clean_message]))[0][0]
  count +=1
average = float(total/count)
average = round(average, 2)
if average > 3.5:
  print(f"Overall sentiment of {module} is positive with a score of {average}/5")

elif average < 2.5:
  print(f"Overall sentiment of {module} is negative with a score of {average}/5")
else:
  print(f"Overall sentiment of {module} is neutral with a score of {average}/5")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30