In [1]:
!pip install pdfplumber pandas

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.5.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.1/68.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**PDF EXTRACTION TO DATAFRAME PIPELINE**

In [2]:
from google.colab import files
import pdfplumber
import pandas as pd
import re

uploaded = files.upload()
pdf_filename = list(uploaded.keys())[0]

rows = []

def extract_course_metadata(block_text):
    course_code = None
    academic_year = None
    section = None

    course_match = re.search(r"Essay Results for:\s+.*?((?:EECS|MATH|CSSD)\s+\d{4})", block_text) # Maybe consider all course codes in lassonde
    year_match = re.search(r"Academic Year:\s+(\d+)", block_text)
    section_match = re.search(r"Section\(s\):\s+([A-Z, ]+)", block_text)

    if course_match:
        course_code = course_match.group(1)
    if year_match:
        academic_year = year_match.group(1)
    if section_match:
        section = section_match.group(1).strip()

    return course_code, academic_year, section


with pdfplumber.open(pdf_filename) as pdf:
    pages_text = []
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            pages_text.append(text)

full_text = "\n".join(pages_text)

essay_blocks = full_text.split("ESSAY RESULTS")

for block in essay_blocks[1:]:
    course_code, academic_year, section = extract_course_metadata(block)

    # Find all questions in this block
    question_matches = list(re.finditer(r'(\d+\)\s+.+?:)', block))

    for i, q_match in enumerate(question_matches):
        current_question = q_match.group(1)

        # Find the text for this question (from question to next question or end)
        start_pos = q_match.end()
        if i + 1 < len(question_matches):
            end_pos = question_matches[i + 1].start()
            question_text = block[start_pos:end_pos]
        else:
            question_text = block[start_pos:]

        # Find all bullet points in this question's text
        # Use regex to find bullet points that may span multiple lines
        bullet_pattern = r'-\s*(.*?)(?=\s*\n\s*-|\s*\n\s*\d+\)|$)'
        bullets = re.findall(bullet_pattern, question_text, re.DOTALL)

        for bullet in bullets:
            # Clean up the bullet text
            review_text = re.sub(r'\s+', ' ', bullet.strip())
            if review_text and review_text != ".":
                rows.append({
                    "course_code": course_code,
                    "academic_year": academic_year,
                    "section": section,
                    "question_text": current_question,
                    "review": review_text
                })


df = pd.DataFrame(rows)
print("Total extracted reviews:", len(df))
display(df.head())

# output_filename = "course_eval_extracted.csv"
# df.to_csv(output_filename, index=False)
# files.download(output_filename)

Saving course evaluations.pdf to course evaluations.pdf
Total extracted reviews: 1837


Unnamed: 0,course_code,academic_year,section,question_text,review
0,EECS 2021,2022,B,1) The best things about this course are:,The grade distribution is very generous and en...
1,EECS 2021,2022,B,1) The best things about this course are:,- Begin able to learn more about how computers...
2,EECS 2021,2022,B,1) The best things about this course are:,The topics on how it focused a lot on assembly...
3,EECS 2021,2022,B,1) The best things about this course are:,Nothing
4,EECS 2021,2022,B,1) The best things about this course are:,How directly related the homework was to asses...


In [3]:
pd.set_option('display.max_colwidth', None)
df.head(10)

Unnamed: 0,course_code,academic_year,section,question_text,review
0,EECS 2021,2022,B,1) The best things about this course are:,The grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks.
1,EECS 2021,2022,B,1) The best things about this course are:,- Begin able to learn more about how computers work behind the scenes
2,EECS 2021,2022,B,1) The best things about this course are:,The topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level.
3,EECS 2021,2022,B,1) The best things about this course are:,Nothing
4,EECS 2021,2022,B,1) The best things about this course are:,How directly related the homework was to assessments. Made me feel that studying was effective.
5,EECS 2021,2022,B,1) The best things about this course are:,The professor was really nice and easy to approach.
6,EECS 2021,2022,B,1) The best things about this course are:,"Assembly (RISC-V) part of the labs was well constructed and assisted my learning. However, the verilog labs were difficult and hard to understand"
7,EECS 2021,2022,B,1) The best things about this course are:,Learnt alot of fundamentals.
8,EECS 2021,2022,B,1) The best things about this course are:,strictly follow the syllabus
9,EECS 2021,2022,B,1) The best things about this course are:,Many opportunities given to achieve success in the class. Homework groups were very useful as questions done by peers could be reviewed for assessments meaning less time was needed having to figure out steps for many questions. G4-12


**DATAFRAME CLEANING**

In [4]:
#Remove 1) and ":"
df['question_text'] = (
    df['question_text']
    .astype(str)
    .str.replace(r'^\s*\d+\s*[\)\.\-]\s*', '', regex=True)
    .str.replace(r':\s*$', '', regex=True)
    .str.strip()
)

#Remove page numbers that ended up under reviews during extraction
df['review'] = (
    df['review']
    .astype(str)
    .str.replace(r'\b[A-Z]\d+-\d+\b', '', regex=True)
    .str.strip()
)

#Remove hyphen symbol in from of reviews (for readability) that ended up there during extraction
df['review'] = (
    df['review']
    .str.replace(r'^\s*-\s*', '', regex=True)
    .str.strip()
)



In [5]:
df.head(10)

Unnamed: 0,course_code,academic_year,section,question_text,review
0,EECS 2021,2022,B,The best things about this course are,The grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks.
1,EECS 2021,2022,B,The best things about this course are,Begin able to learn more about how computers work behind the scenes
2,EECS 2021,2022,B,The best things about this course are,The topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level.
3,EECS 2021,2022,B,The best things about this course are,Nothing
4,EECS 2021,2022,B,The best things about this course are,How directly related the homework was to assessments. Made me feel that studying was effective.
5,EECS 2021,2022,B,The best things about this course are,The professor was really nice and easy to approach.
6,EECS 2021,2022,B,The best things about this course are,"Assembly (RISC-V) part of the labs was well constructed and assisted my learning. However, the verilog labs were difficult and hard to understand"
7,EECS 2021,2022,B,The best things about this course are,Learnt alot of fundamentals.
8,EECS 2021,2022,B,The best things about this course are,strictly follow the syllabus
9,EECS 2021,2022,B,The best things about this course are,Many opportunities given to achieve success in the class. Homework groups were very useful as questions done by peers could be reviewed for assessments meaning less time was needed having to figure out steps for many questions.


**DATAFRAME CLEANING: NULL REVIEW REMOVAL**

In [6]:
#Check Point 1
review_counts = (
    df['review']
    .astype(str)
    .str.strip()
    .value_counts()
    .reset_index()
)

review_counts.columns = ['review_text', 'count']

short_reviews = review_counts[
    review_counts['review_text'].str.len() <= 7
]

print("Initial Overview: ")
print(short_reviews)

Initial Overview: 
     review_text  count
0            N/A    105
1            n/a     60
2           None     25
3        Nothing     23
4           none     22
5        nothing     19
6                    17
8             NA     12
10         None.      8
11            no      6
12           N/a      5
13             A      4
14             n      4
15             -      4
16            Na      3
22            No      3
26           N o      2
27          hard      2
30           Nil      2
32           N.A      2
35           No.      2
36         none.      2
38        unsure      2
152       Strike      1
203          N\A      1
309      so hard      1
423      zybooks      1
461          216      1
462       N/A :)      1
525          244      1
683      Noting.      1
703            x      1
707           :)      1
738         No e      1
751          nil      1
794      Nothung      1
1012          --      1
1035          ..      1
1092     no idea      1
1166        labs     

In [7]:
import numpy as np

# Normalize text
df['review_norm'] = (
    df['review']
    .str.lower()
    .str.strip()
)

# Remove symbol such as --, .. or : ) that arent meaningful to the analysis
df.loc[
    ~df['review_norm'].str.contains(r'[a-z0-9]', regex=True),
    'review_norm'
] = np.nan

# Remove null-semantic words
NULL_WORDS = {'na', 'n/a', 'none', 'nil', 'n.a'}

normalized = df['review_norm'].str.replace(r'[^a-z]', '', regex=True)

df.loc[normalized.isin(NULL_WORDS), 'review_norm'] = np.nan

In [8]:
#Check Point 2
review_counts2 = (
    df['review_norm']
    .value_counts(dropna=False)
    .reset_index()
)

review_counts2.columns = ['review_text2', 'count']

short_reviews2 = review_counts2[
    review_counts2['review_text2'].astype(str).str.len() <= 7
]

print("After assigning null values: ")
print(short_reviews2)

After assigning null values: 
     review_text2  count
0             NaN    275
1         nothing     42
3              no      9
6               a      4
7               n      4
14         unsure      2
17           hard      2
25            n o      2
27            no.      2
122        strike      1
294       so hard      1
411       zybooks      1
448           216      1
510           244      1
666       noting.      1
686             x      1
720          no e      1
776       nothung      1
1069      no idea      1
1141         labs      1
1207         bash      1
1453      the tas      1


In [9]:
df.loc[df['review_norm'].str.fullmatch(r'\d+', na=False), 'review_norm'] = np.nan

# single letters only
df.loc[df['review_norm'].str.fullmatch(r'[a-z]', na=False), 'review_norm'] = np.nan


In [10]:
#Check Point 3
review_counts3 = (
    df['review_norm']
    .value_counts(dropna=False)
    .reset_index()
)

review_counts3.columns = ['review_text3', 'count']

short_reviews3 = review_counts3[
    review_counts3['review_text3'].astype(str).str.len() <= 7
]

print("After sorting single letter and numbers into numpy null: ")
print(short_reviews3)

After sorting single letter and numbers into numpy null: 
     review_text3  count
0             NaN    286
1         nothing     42
3              no      9
11         unsure      2
15           hard      2
22            no.      2
23            n o      2
144        strike      1
288       so hard      1
419       zybooks      1
660       noting.      1
714          no e      1
772       nothung      1
1065      no idea      1
1136         labs      1
1201         bash      1
1447      the tas      1


In [11]:
df['review_norm'].shape

(1837,)

**DATAFRAME CLEANING: SORTING MISPELLED NULL VALUES**

In [12]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [13]:
df['review_norm'] = df['review_norm'].str.replace(r'[^\w\s]', '', regex=True)

In [14]:
from rapidfuzz import fuzz

def is_nullish(text):
  if pd.isna(text):
        return np.nan

  #Isolate the removal of space in between text
  compact = re.sub(r'\s+', '', text.lower().strip())

  # Catch mispellings of no and none
  if len(compact) <= 3 and compact.startswith("no"):
      return np.nan

  # normalize misspellings of "nothing"
  if fuzz.ratio(compact, "nothing") >= 80:
      return "nothing"

  # other null-like responses
  if (
      fuzz.ratio(compact, "none") >= 90 or
      fuzz.ratio(compact, "na") >= 90 or
      fuzz.ratio(compact, "nil") >= 90
    ):
      return np.nan

  return text

df['review_norm'] = df['review_norm'].apply(is_nullish)

In [15]:
#Check Point 4
review_counts4 = (
    df['review_norm']
    .value_counts(dropna=False)
    .reset_index()
)

review_counts4.columns = ['review_text4', 'count']

short_reviews4 = review_counts4[
    review_counts4['review_text4'].astype(str).str.len() <= 7
]

print("After sorting null value mispellings: ")
print(short_reviews4)

After sorting null value mispellings: 
     review_text4  count
0             NaN    300
1         nothing     58
20         unsure      2
21           hard      2
123        strike      1
281       so hard      1
412       zybooks      1
798       quizzes      1
1051      no idea      1
1121         labs      1
1185         bash      1
1442      the tas      1


In [16]:
df['review_norm'].shape

(1837,)

**DATA CLEANING: FINAL ASSIGNMENT AND NAN VALUE DROP**

In [17]:
df['review'] = df['review_norm']
df = df.drop(columns=['review_norm'])

In [18]:
df[df.isna().any(axis=1)]
df.isna().sum()

Unnamed: 0,0
course_code,0
academic_year,0
section,0
question_text,0
review,300


In [19]:
df = df.dropna(subset=['review']).reset_index(drop=True)

In [20]:
df.isna().sum()

Unnamed: 0,0
course_code,0
academic_year,0
section,0
question_text,0
review,0


In [21]:
df.shape

(1537, 5)

**SENTIMENT ANALYSIS**

In [22]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


# Load model
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
device = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,
    truncation=True,
    padding=True,
    max_length=512
)

# Convert column to list
reviews = df["review"].tolist()

# Run analysis
results = sentiment_pipeline(reviews, batch_size=8)

# Extract scores
neg_scores, neu_scores, pos_scores, final_labels = [], [], [], []

# The pipeline with return_all_scores=True is returning a list of dictionaries
# where each dictionary represents the top sentiment and its score.
# It does not provide all three (negative, neutral, positive) scores explicitly.
# So, we populate the relevant score with the predicted score and others as 0.
for res_dict in results:
    # res_dict is a dictionary like {'label': 'positive', 'score': 0.9431226253509521}
    current_label = res_dict['label']
    current_score = res_dict['score']

    neg = 0.0
    neu = 0.0
    pos = 0.0

    if current_label == 'negative':
        neg = current_score
    elif current_label == 'neutral':
        neu = current_score
    elif current_label == 'positive':
        pos = current_score

    neg_scores.append(neg)
    neu_scores.append(neu)
    pos_scores.append(pos)

    final_labels.append(current_label.capitalize())

df["negative_score"] = neg_scores
df["neutral_score"] = neu_scores
df["positive_score"] = pos_scores
df["sentiment"] = final_labels

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.pooler.dense.bias       | UNEXPECTED |  | 
roberta.pooler.dense.weight     | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [23]:
df.head(60)

Unnamed: 0,course_code,academic_year,section,question_text,review,negative_score,neutral_score,positive_score,sentiment
0,EECS 2021,2022,B,The best things about this course are,the grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks,0.0,0.0,0.943123,Positive
1,EECS 2021,2022,B,The best things about this course are,begin able to learn more about how computers work behind the scenes,0.0,0.748756,0.0,Neutral
2,EECS 2021,2022,B,The best things about this course are,the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level,0.0,0.0,0.629354,Positive
3,EECS 2021,2022,B,The best things about this course are,nothing,0.0,0.468179,0.0,Neutral
4,EECS 2021,2022,B,The best things about this course are,how directly related the homework was to assessments made me feel that studying was effective,0.0,0.684208,0.0,Neutral
5,EECS 2021,2022,B,The best things about this course are,the professor was really nice and easy to approach,0.0,0.0,0.96693,Positive
6,EECS 2021,2022,B,The best things about this course are,assembly riscv part of the labs was well constructed and assisted my learning however the verilog labs were difficult and hard to understand,0.0,0.480507,0.0,Neutral
7,EECS 2021,2022,B,The best things about this course are,learnt alot of fundamentals,0.0,0.772886,0.0,Neutral
8,EECS 2021,2022,B,The best things about this course are,strictly follow the syllabus,0.0,0.921463,0.0,Neutral
9,EECS 2021,2022,B,The best things about this course are,many opportunities given to achieve success in the class homework groups were very useful as questions done by peers could be reviewed for assessments meaning less time was needed having to figure out steps for many questions,0.0,0.0,0.917252,Positive


In [24]:
sentiment_counts = df["sentiment"].value_counts()
print(sentiment_counts)

sentiment
Positive    650
Neutral     574
Negative    313
Name: count, dtype: int64


In [25]:
from google.colab import files
uploaded = files.upload()

import pandas as pd

# Get the uploaded filename
filename = list(uploaded.keys())[0]

# Read into dataframe
df = pd.read_csv(filename)

df["Sentiment_Label"].value_counts()

Saving sentiment_course_eval.csv to sentiment_course_eval.csv


Unnamed: 0_level_0,count
Sentiment_Label,Unnamed: 1_level_1
Positive,899
Neutral,395
Negative,243


ASPECT-BASED SENTIMENT ANALYSIS

In [26]:
aspect_keywords = {
    "instructor": [
        "professor", "instructor", "teacher", "lecturer", "ta", "teaching assistant", "tutor",
        "teaching style", "explanation", "clear", "engaging", "approachable", "helpful",
        "knowledgeable", "passionate", "pacing", "organized", "responsive", "feedback",
        "office hours", "answers questions", "supportive", "enthusiastic", "delivery"
    ],

    "course_content": [
        "content", "material", "topic", "subject", "curriculum", "syllabus", "concept",
        "organized", "structured", "clear progression", "easy to understand", "wide range",
        "logical flow", "relevant", "practical", "useful", "applicable", "interesting",
        "exciting", "enjoyable", "difficult", "complex", "challenging", "tips and tricks",
        "unique", "attractive", "fresh", "novel", "engaging", "depth", "coverage"
    ],

    "assignments_labs": [
        "assignment", "homework", "lab", "project", "exercise", "problem set", "task",
        "instructions", "guidelines", "clear", "vague", "confusing", "detailed",
        "time-consuming", "lengthy", "workload", "manageable", "overwhelming",
        "helpful", "practical", "relevant", "feedback", "submission", "deadline"
    ],

    "assessments": [
        "exam", "test", "midterm", "final", "quiz", "assessment", "evaluation",
        "grade", "mark", "score", "grading", "rubric", "fair", "unfair", "difficult",
        "easy", "challenging", "practice questions", "sample questions", "review",
        "weight", "curve", "feedback", "timely", "preparation", "aligned", "reflective"
    ],

    "workload_pace": [
        "workload", "pace", "speed", "fast", "slow", "rushed", "steady", "manageable",
        "overwhelming", "time-consuming", "hours", "effort", "heavy", "light",
        "balanced", "crammed", "spaced out", "deadline", "pressure", "stress"
    ],

    "learning_outcomes": [
        "learn", "understand", "grasp", "comprehend", "knowledge", "skill", "ability",
        "improvement", "growth", "development", "confidence", "mastery", "proficiency",
        "apply", "practical", "real-world", "foundation", "prepare", "ready"
    ],

    "engagement_interest": [
        "engaging", "interesting", "boring", "exciting", "fun", "enjoyable", "captivating",
        "attention", "focus", "concentration", "participate", "interactive", "active",
        "motivating", "inspiring", "stimulating", "dull", "monotonous", "attention-holding"
    ],

    "difficulty_challenge": [
        "difficult", "hard", "challenging", "tough", "demanding", "rigorous",
        "easy", "simple", "straightforward", "basic", "complex", "complicated",
        "manageable", "achievable", "stretching", "pushing", "struggle", "struggling"
    ],

    "resources_materials": [
        "textbook", "book", "reading", "slides", "notes", "handout", "video",
        "recording", "online", "platform", "website", "eclass", "zybooks",
        "resource", "material", "supplement", "additional", "reference", "source"
    ],

    "support_help": [
        "help", "support", "assistance", "guidance", "office hours", "tutoring",
        "extra help", "additional support", "available", "accessible", "responsive",
        "quick reply", "slow reply", "unavailable", "abandoned", "mentor", "guide"
    ],

    "overall_experience": [
        "overall", "experience", "enjoyed", "loved", "hated", "disliked", "recommend",
        "valuable", "worthwhile", "useful", "beneficial", "positive", "negative",
        "good course", "great course", "bad course", "average", "decent", "excellent"
    ],

    "practical_application": [
        "practical", "hands-on", "real-world", "application", "apply", "use",
        "relevant", "useful", "helpful", "beneficial", "skill", "technique",
        "practice", "experience", "implement", "execute"
    ]
}

In [27]:
import torch
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm
tqdm.pandas()


model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
device = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True,
    padding=True,
    max_length=512,
    return_all_scores=False
)


def extract_aspects(text):
    """Extract aspects from text based on keyword matching"""
    text_lower = text.lower()
    aspects_found = []

    for aspect, keywords in aspect_keywords.items():
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', text_lower):
                aspects_found.append(aspect)
                break

    return aspects_found

def analyze_aspect_for_row(row):
    """
    Process a single review and return a list of aspect-sentiment dictionaries
    """
    text = row['review']
    aspects = extract_aspects(text)

    results = []
    for aspect in aspects:
        # Create aspect-specific input
        aspect_text = f"When discussing {aspect}: {text}"

        # Simple truncation
        if len(aspect_text) > 2000:
            aspect_text = aspect_text[:2000]

        try:
            result = sentiment_pipeline(aspect_text)[0]
            sentiment = result['label'].capitalize()
            confidence = round(result['score'], 3)
        except:
            sentiment = "Unknown"
            confidence = 0.0

        result_dict = {
            'review': text,
            'aspect': aspect,
            'sentiment': sentiment,
            'confidence': confidence,
        }

        results.append(result_dict)

    return results

print("Analyzing aspects...")
all_aspect_rows = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    aspect_results = analyze_aspect_for_row(row)
    all_aspect_rows.extend(aspect_results)

aspect_df = pd.DataFrame(all_aspect_rows)

print(f"\nCreated new dataframe with {len(aspect_df)} rows")
print(f"Original dataframe had {len(df)} rows")
print(f"Average aspects per review: {len(aspect_df)/len(df):.2f}")


# Summary statistics
print("\n" + "="*80)
print("ASPECT DISTRIBUTION")
print("="*80)
print(aspect_df['aspect'].value_counts())

print("\n" + "="*80)
print("SENTIMENT BY ASPECT")
print("="*80)
print(pd.crosstab(aspect_df['aspect'], aspect_df['sentiment']))

# Save to file
# aspect_df.to_csv('aspect_sentiment_analysis.csv', index=False)



Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.pooler.dense.bias       | UNEXPECTED |  | 
roberta.pooler.dense.weight     | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Analyzing aspects...


  0%|          | 6/1537 [00:00<01:24, 18.15it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1537/1537 [00:41<00:00, 36.85it/s]



Created new dataframe with 3179 rows
Original dataframe had 1537 rows
Average aspects per review: 2.07

ASPECT DISTRIBUTION
aspect
instructor               492
course_content           412
assignments_labs         382
assessments              299
difficulty_challenge     269
practical_application    256
resources_materials      247
learning_outcomes        244
overall_experience       154
support_help             149
workload_pace            138
engagement_interest      137
Name: count, dtype: int64

SENTIMENT BY ASPECT
sentiment              Negative  Neutral  Positive
aspect                                            
assessments                  92      110        97
assignments_labs             97      140       145
course_content               91      127       194
difficulty_challenge        101       95        73
engagement_interest          19       33        85
instructor                   53      105       334
learning_outcomes            63       85        96
overall_experi

In [28]:
aspect_df.head(60)

Unnamed: 0,review,aspect,sentiment,confidence
0,the grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks,assessments,Positive,0.908
1,begin able to learn more about how computers work behind the scenes,learning_outcomes,Neutral,0.884
2,the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level,instructor,Positive,0.658
3,the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level,assignments_labs,Positive,0.655
4,the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level,practical_application,Positive,0.637
5,how directly related the homework was to assessments made me feel that studying was effective,assignments_labs,Neutral,0.659
6,the professor was really nice and easy to approach,instructor,Positive,0.957
7,the professor was really nice and easy to approach,assessments,Positive,0.966
8,the professor was really nice and easy to approach,difficulty_challenge,Positive,0.96
9,assembly riscv part of the labs was well constructed and assisted my learning however the verilog labs were difficult and hard to understand,course_content,Neutral,0.548


Emotion Analysis

In [29]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm
tqdm.pandas()

# Load your original dataframe
# df = pd.read_csv('sentiment_course_eval.csv')  # Your original file

# Load emotion model
model_name = "j-hartmann/emotion-english-distilroberta-base"
device = 0 if torch.cuda.is_available() else -1

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create pipeline once
emotion_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True,
    padding=True,
    max_length=512,
    return_all_scores=False
)

def get_dominant_emotion(text):
    """
    Get the dominant emotion from text
    """
    try:
        result = emotion_pipeline(text)[0]
        return {
            'dominant_emotion': result['label'],
            'emotion_confidence': round(result['score'], 3)
        }
    except Exception as e:
        print(f"Error processing: {text[:50]}... Error: {e}")
        return {
            'dominant_emotion': 'unknown',
            'emotion_confidence': 0.0
        }

# Test with a sample first
print("Testing emotion model...")
test_reviews = df['review'].head(3).tolist()
for i, review in enumerate(test_reviews):
    print(f"\nTest {i+1}: {review[:100]}...")
    result = get_dominant_emotion(review)
    print(f"Dominant: {result['dominant_emotion']} ({result['emotion_confidence']})")

# Apply to all reviews and add directly to original dataframe
print("\nAnalyzing emotions for all reviews...")

# Initialize empty lists
emotions = []
confidences = []

# Process each review
for review in tqdm(df['review'].tolist()):
    result = get_dominant_emotion(review)
    emotions.append(result['dominant_emotion'])
    confidences.append(result['emotion_confidence'])

# Add columns directly to original dataframe
df['dominant_emotion'] = emotions
df['emotion_confidence'] = confidences

# Display results
print("\n" + "="*80)
print("EMOTION ANALYSIS RESULTS ADDED TO ORIGINAL DATAFRAME")
print("="*80)
print(df[['review', 'dominant_emotion', 'emotion_confidence']].head(10).to_string())

# Summary statistics
print("\n" + "="*80)
print("DISTRIBUTION OF DOMINANT EMOTIONS")
print("="*80)
print(df['dominant_emotion'].value_counts())

print("\n" + "="*80)
print("AVERAGE CONFIDENCE BY EMOTION")
print("="*80)
print(df.groupby('dominant_emotion')['emotion_confidence'].mean().round(3))

# Save updated dataframe (includes original columns + new emotion columns)
# df.to_csv('sentiment_with_emotions.csv', index=False)

# Now you can do correlation analysis directly on df
print("\n" + "="*80)
print("READY FOR CORRELATION ANALYSIS")
print("="*80)
print(f"Original dataframe now has {len(df.columns)} columns")
print("New columns added: 'dominant_emotion', 'emotion_confidence'")

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: j-hartmann/emotion-english-distilroberta-base
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Testing emotion model...

Test 1: the grade distribution is very generous and encourages students to try their hardest in every instan...
Dominant: joy (0.52)

Test 2: begin able to learn more about how computers work behind the scenes...
Dominant: neutral (0.679)

Test 3: the topics on how it focused a lot on assembly coding which was helpful in seeing how each instructi...
Dominant: neutral (0.902)

Analyzing emotions for all reviews...


 10%|█         | 159/1537 [00:01<00:07, 175.78it/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

100%|██████████| 1537/1537 [00:10<00:00, 146.12it/s]



EMOTION ANALYSIS RESULTS ADDED TO ORIGINAL DATAFRAME
                                                                                                                                                                                                                              review dominant_emotion  emotion_confidence
0                                                                                                 the grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks              joy               0.520
1                                                                                                                                                                begin able to learn more about how computers work behind the scenes          neutral               0.679
2                                                                                              the topics on how it focused a lot on assembly coding

In [30]:
df.head(5)

Unnamed: 0,course_code,academic_year,section,question_text,review,Positive,Negative,Neutral,Compound,Sentiment_Label,dominant_emotion,emotion_confidence
0,EECS 2021,2022,B,The best things about this course are,the grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks,0.0,0.747,0.253,0.7755,Positive,joy,0.52
1,EECS 2021,2022,B,The best things about this course are,begin able to learn more about how computers work behind the scenes,0.0,1.0,0.0,0.0,Neutral,neutral,0.679
2,EECS 2021,2022,B,The best things about this course are,the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level,0.074,0.743,0.182,0.4939,Positive,neutral,0.902
3,EECS 2021,2022,B,The best things about this course are,nothing,0.0,1.0,0.0,0.0,Neutral,neutral,0.479
4,EECS 2021,2022,B,The best things about this course are,how directly related the homework was to assessments made me feel that studying was effective,0.0,0.819,0.181,0.4767,Positive,joy,0.981


CORRELATION CHECK INCLUDE RQ2

In [31]:
# Step 1: Create a new dataframe for correlation analysis
correlation_df = df[['review', 'dominant_emotion', 'Sentiment_Label']].copy()

correlation_df.head(20)

Unnamed: 0,review,dominant_emotion,Sentiment_Label
0,the grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks,joy,Positive
1,begin able to learn more about how computers work behind the scenes,neutral,Neutral
2,the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level,neutral,Positive
3,nothing,neutral,Neutral
4,how directly related the homework was to assessments made me feel that studying was effective,joy,Positive
5,the professor was really nice and easy to approach,joy,Positive
6,assembly riscv part of the labs was well constructed and assisted my learning however the verilog labs were difficult and hard to understand,neutral,Negative
7,learnt alot of fundamentals,neutral,Neutral
8,strictly follow the syllabus,fear,Neutral
9,many opportunities given to achieve success in the class homework groups were very useful as questions done by peers could be reviewed for assessments meaning less time was needed having to figure out steps for many questions,joy,Positive


In [32]:
# Step 2: Map emotions to sentiment categories
emotion_to_sentiment = {
    'neutral': 'neutral',
    'joy': 'positive',
    'sadness': 'negative',
    'surprise': 'positive',
    'anger': 'negative',
    'fear': 'negative',
    'disgust': 'negative'
}

correlation_df['emotion_sentiment'] = correlation_df['dominant_emotion'].str.lower().map(emotion_to_sentiment)

# Step 3: Map both sentiment categories to numbers
sentiment_to_num = {
    'negative': -1,
    'neutral': 0,
    'positive': 1
}

correlation_df['Sentiment_Label_numeric'] = correlation_df['Sentiment_Label'].str.lower().map(sentiment_to_num)
correlation_df['emotion_sentiment_numeric'] = correlation_df['emotion_sentiment'].str.lower().map(sentiment_to_num)

#correlation_df.head(20)

correlation_coefficient = correlation_df['Sentiment_Label_numeric'].corr(correlation_df['emotion_sentiment_numeric'])
print(f"Pearson correlation between sentiment and emotion-sentiment: {correlation_coefficient:.3f}")




Pearson correlation between sentiment and emotion-sentiment: 0.458


In [33]:
df.head(10)

Unnamed: 0,course_code,academic_year,section,question_text,review,Positive,Negative,Neutral,Compound,Sentiment_Label,dominant_emotion,emotion_confidence
0,EECS 2021,2022,B,The best things about this course are,the grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks,0.0,0.747,0.253,0.7755,Positive,joy,0.52
1,EECS 2021,2022,B,The best things about this course are,begin able to learn more about how computers work behind the scenes,0.0,1.0,0.0,0.0,Neutral,neutral,0.679
2,EECS 2021,2022,B,The best things about this course are,the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level,0.074,0.743,0.182,0.4939,Positive,neutral,0.902
3,EECS 2021,2022,B,The best things about this course are,nothing,0.0,1.0,0.0,0.0,Neutral,neutral,0.479
4,EECS 2021,2022,B,The best things about this course are,how directly related the homework was to assessments made me feel that studying was effective,0.0,0.819,0.181,0.4767,Positive,joy,0.981
5,EECS 2021,2022,B,The best things about this course are,the professor was really nice and easy to approach,0.0,0.528,0.472,0.7397,Positive,joy,0.949
6,EECS 2021,2022,B,The best things about this course are,assembly riscv part of the labs was well constructed and assisted my learning however the verilog labs were difficult and hard to understand,0.15,0.769,0.081,-0.2023,Negative,neutral,0.529
7,EECS 2021,2022,B,The best things about this course are,learnt alot of fundamentals,0.0,1.0,0.0,0.0,Neutral,neutral,0.599
8,EECS 2021,2022,B,The best things about this course are,strictly follow the syllabus,0.0,1.0,0.0,0.0,Neutral,fear,0.625
9,EECS 2021,2022,B,The best things about this course are,many opportunities given to achieve success in the class homework groups were very useful as questions done by peers could be reviewed for assessments meaning less time was needed having to figure out steps for many questions,0.0,0.782,0.218,0.8588,Positive,joy,0.793


In [36]:
# Assuming you have:
# - original_df: with course_code, academic_year, section, review, etc.
# - aspect_df: with review, aspect, sentiment, confidence


# ============================================
# FORMAT 1: Dictionary format (for LLM feeding)
# ============================================
print("Creating dictionary format for LLM...")

# Group aspects by review
aspect_dict = aspect_df.groupby('review').apply(
    lambda x: {
        row['aspect']: {
            'sentiment': row['sentiment'],
            'confidence': row['confidence']
        }
        for _, row in x.iterrows()
    }
).to_dict()

# Add aspect dictionary to original dataframe
llm_ready_df = df.copy()
llm_ready_df['aspect_data'] = llm_ready_df['review'].map(aspect_dict)

# Fill NaN with empty dict for reviews with no aspects
llm_ready_df['aspect_data'] = llm_ready_df['aspect_data'].apply(lambda x: x if isinstance(x, dict) else {})

# Add aspects_found column (list of aspects)
llm_ready_df['aspects_found'] = llm_ready_df['aspect_data'].apply(lambda x: list(x.keys()))

# Add num_aspects column
llm_ready_df['num_aspects'] = llm_ready_df['aspects_found'].apply(len)

print(f"LLM-ready dataframe: {len(llm_ready_df)} rows")
print("\nSample of LLM-ready format:")
print(llm_ready_df[['review', 'num_aspects', 'aspects_found', 'aspect_data']].head(3))

# ============================================
# FORMAT 2: Long format (for aggregation/analysis)
# ============================================
print("\n" + "="*80)
print("Creating long format for analysis...")

# Merge original metadata with aspect_df
analysis_long_df = df.merge(
    aspect_df,
    on='review_clean',
    how='left'  # Keep reviews with no aspects
)

# For reviews with no aspects, aspect columns will be NaN
print(f"Analysis-ready long dataframe: {len(analysis_long_df)} rows")
print("\nSample of long format:")
print(analysis_long_df[['course_code', 'academic_year', 'section', 'review', 'aspect', 'sentiment', 'confidence']].head(10))

# ============================================
# FORMAT 3: Pivot format (for cross-tabulation)
# ============================================
print("\n" + "="*80)
print("Creating pivot format for cross-tabulation...")

# Create aspect presence matrix
pivot_df = df[['course_code', 'academic_year', 'section', 'review_clean']].copy()

# Get unique aspects
all_aspects = aspect_df['aspect'].unique()

# For each aspect, add column with sentiment
for aspect in all_aspects:
    # Get reviews with this aspect
    aspect_reviews = aspect_df[aspect_df['aspect'] == aspect][['review_clean', 'sentiment']]
    aspect_reviews = aspect_reviews.rename(columns={'sentiment': f'aspect_{aspect}'})

    # Merge to pivot_df
    pivot_df = pivot_df.merge(
        aspect_reviews,
        on='review_clean',
        how='left'
    )

print(f"Pivot dataframe: {len(pivot_df)} rows, {len(pivot_df.columns)} columns")
print(pivot_df.head())

Creating dictionary format for LLM...
LLM-ready dataframe: 1537 rows

Sample of LLM-ready format:
                                                                                                                                  review  \
0     the grade distribution is very generous and encourages students to try their hardest in every instance of the course to earn marks   
1                                                                    begin able to learn more about how computers work behind the scenes   
2  the topics on how it focused a lot on assembly coding which was helpful in seeing how each instruction is executed on the lower level   

   num_aspects                                          aspects_found  \
0            1                                          [assessments]   
1            1                                    [learning_outcomes]   
2            3  [instructor, assignments_labs, practical_application]   

                                                

  aspect_dict = aspect_df.groupby('review').apply(


KeyError: "['review'] not in index"

In [None]:
original_df = df.copy()
original_df.head()

In [None]:
# Assuming you have:
# - original_df with all your columns + dominant_emotion, emotion_confidence
# - aspect_df with review, aspect, sentiment, confidence

# Clean reviews for matching
original_df['review_clean'] = original_df['review'].str.strip().str.lower()
aspect_df['review_clean'] = aspect_df['review'].str.strip().str.lower()

# Create a unique course identifier for grouping
original_df['course_id'] = original_df['course_code'] + '_' + original_df['academic_year'].astype(str) + '_' + original_df['section']
original_df['course_year'] = original_df['course_code'] + '_' + original_df['academic_year'].astype(str)

print(f"Total reviews: {len(original_df)}")
print(f"Unique course-year combinations: {original_df['course_year'].nunique()}")
print(f"Unique course-year-section combinations: {original_df['course_id'].nunique()}")

In [None]:
#original_df.head(1500)

In [None]:
# Group aspects by review
aspect_dict = aspect_df.groupby('review_clean').apply(
    lambda x: {
        row['aspect']: {
            'sentiment': row['sentiment'],
            'confidence': row['confidence']
        }
        for _, row in x.iterrows()
    }
).to_dict()

# Add to original dataframe
master_df = original_df.copy()
master_df['aspect_data'] = master_df['review_clean'].map(aspect_dict)
master_df['aspect_data'] = master_df['aspect_data'].apply(lambda x: x if isinstance(x, dict) else {})
master_df['aspects_found'] = master_df['aspect_data'].apply(lambda x: list(x.keys()))
master_df['num_aspects'] = master_df['aspects_found'].apply(len)

#master_df.head(10)

In [None]:
# Create long format for analysis (one row per aspect)
analysis_long = master_df.explode('aspects_found').reset_index(drop=True)
analysis_long['aspect'] = analysis_long['aspects_found']

# Add aspect sentiment and confidence from the dictionary
analysis_long['aspect_sentiment'] = analysis_long.apply(
    lambda x: x['aspect_data'].get(x['aspect'], {}).get('sentiment') if pd.notna(x['aspect']) else None,
    axis=1
)
analysis_long['aspect_confidence'] = analysis_long.apply(
    lambda x: x['aspect_data'].get(x['aspect'], {}).get('confidence') if pd.notna(x['aspect']) else None,
    axis=1
)

# Drop rows with no aspect (optional - keep for completeness)
# analysis_long = analysis_long[analysis_long['aspect'].notna()].reset_index(drop=True)

analysis_long.head(10)

In [None]:
def compare_course_sections(master_df, analysis_long, course_code, year1, year2=None):
    """
    Compare same course across different years/sections
    """
    if year2 is None:
        # Compare sections within same year
        filtered = analysis_long[
            (analysis_long['course_code'] == course_code) &
            (analysis_long['academic_year'] == year1)
        ]
        comparison_type = f"{course_code} - {year1} (by section)"
        group_cols = ['section']
    else:
        # Compare across different years
        filtered = analysis_long[
            (analysis_long['course_code'] == course_code) &
            (analysis_long['academic_year'].isin([year1, year2]))
        ]
        comparison_type = f"{course_code} - {year1} vs {year2}"
        group_cols = ['academic_year', 'section']

    # Aspect frequency by group
    aspect_freq = filtered.groupby(group_cols + ['aspect']).size().reset_index(name='mention_count')

    # Total reviews per group
    total_reviews = filtered.groupby(group_cols)['review'].nunique().reset_index(name='total_reviews')

    # Merge to get percentages
    result = aspect_freq.merge(total_reviews, on=group_cols)
    result['percentage'] = (result['mention_count'] / result['total_reviews'] * 100).round(1)

    # Aspect sentiment distribution
    sentiment_dist = pd.crosstab(
        [filtered[col] for col in group_cols + ['aspect']],
        filtered['aspect_sentiment']
    ).reset_index()

    return {
        'type': comparison_type,
        'frequency': result.sort_values(['aspect', 'mention_count'], ascending=False),
        'sentiment': sentiment_dist,
        'total_reviews': total_reviews
    }

def get_course_summary_stats(master_df, analysis_long, course_code, year=None, section=None):
    """
    Get comprehensive summary for a specific course/section/year
    """
    # Filter data
    filtered = master_df.copy()
    filter_desc = []

    if course_code:
        filtered = filtered[filtered['course_code'] == course_code]
        filter_desc.append(f"Course: {course_code}")
    if year:
        filtered = filtered[filtered['academic_year'] == year]
        filter_desc.append(f"Year: {year}")
    if section:
        filtered = filtered[filtered['section'] == section]
        filter_desc.append(f"Section: {section}")

    filter_str = ", ".join(filter_desc)

    # Basic stats
    n_reviews = len(filtered)
    sentiment_dist = filtered['Sentiment_Label'].value_counts()

    # Emotion stats
    emotion_dist = filtered['dominant_emotion'].value_counts()

    # Aspect stats from long format
    filtered_long = analysis_long[analysis_long['review_clean'].isin(filtered['review_clean'])]

    if len(filtered_long) > 0:
        top_aspects = filtered_long.groupby('aspect').size().sort_values(ascending=False).head(10)
        aspect_sentiment = pd.crosstab(filtered_long['aspect'], filtered_long['aspect_sentiment'])
    else:
        top_aspects = pd.Series()
        aspect_sentiment = pd.DataFrame()

    return {
        'description': filter_str,
        'n_reviews': n_reviews,
        'sentiment_distribution': sentiment_dist,
        'emotion_distribution': emotion_dist,
        'top_aspects': top_aspects,
        'aspect_sentiment_matrix': aspect_sentiment
    }

In [None]:
def generate_llm_summary(master_df, analysis_long, course_code, year1=None, year2=None, section=None):
    """
    Generate comprehensive LLM summary for comparison
    """
    if year2:
        # Compare two years
        data_year1 = get_course_summary_stats(master_df, analysis_long, course_code, year1)
        data_year2 = get_course_summary_stats(master_df, analysis_long, course_code, year2)

        prompt = f"""
COURSE COMPARISON: {course_code}
===========================================
Comparing {year1} vs {year2}

📊 {year1} SUMMARY:
- Total reviews: {data_year1['n_reviews']}
- Sentiment: {data_year1['sentiment_distribution'].to_dict()}
- Top emotions: {data_year1['emotion_distribution'].head(3).to_dict()}
- Top aspects mentioned: {data_year1['top_aspects'].head(5).to_dict()}

📊 {year2} SUMMARY:
- Total reviews: {data_year2['n_reviews']}
- Sentiment: {data_year2['sentiment_distribution'].to_dict()}
- Top emotions: {data_year2['emotion_distribution'].head(3).to_dict()}
- Top aspects mentioned: {data_year2['top_aspects'].head(5).to_dict()}

🔍 KEY CHANGES:
"""
        # Compare aspect frequencies
        aspects_year1 = set(data_year1['top_aspects'].index)
        aspects_year2 = set(data_year2['top_aspects'].index)

        new_aspects = aspects_year2 - aspects_year1
        disappeared_aspects = aspects_year1 - aspects_year2

        if new_aspects:
            prompt += f"\nNew aspects in {year2}: {list(new_aspects)}"
        if disappeared_aspects:
            prompt += f"\nAspects no longer mentioned in {year2}: {list(disappeared_aspects)}"

    elif section:
        # Compare sections within same year
        data_section1 = get_course_summary_stats(master_df, analysis_long, course_code, year1, section)
        other_sections = master_df[
            (master_df['course_code'] == course_code) &
            (master_df['academic_year'] == year1) &
            (master_df['section'] != section)
        ]['section'].unique()

        prompt = f"""
COURSE SECTION COMPARISON: {course_code} ({year1})
===========================================
Comparing Section {section} vs Other Sections

📊 SECTION {section}:
- Total reviews: {data_section1['n_reviews']}
- Sentiment: {data_section1['sentiment_distribution'].to_dict()}
- Top aspects: {data_section1['top_aspects'].head(5).to_dict()}

📊 OTHER SECTIONS ({list(other_sections)}):
"""
        for other_section in other_sections:
            data_other = get_course_summary_stats(master_df, analysis_long, course_code, year1, other_section)
            prompt += f"""
   Section {other_section}:
   - Reviews: {data_other['n_reviews']}
   - Sentiment: {data_other['sentiment_distribution'].to_dict()}
   - Top aspects: {data_other['top_aspects'].head(3).to_dict()}
"""
    else:
        # Single course summary
        data = get_course_summary_stats(master_df, analysis_long, course_code, year1)

        prompt = f"""
COURSE SUMMARY: {course_code} {f'({year1})' if year1 else ''}
===========================================

📊 OVERALL STATISTICS:
- Total reviews analyzed: {data['n_reviews']}
- Sentiment distribution: {data['sentiment_distribution'].to_dict()}
- Dominant emotions: {data['emotion_distribution'].head(3).to_dict()}

🔍 TOP ASPECTS MENTIONED:
"""
        for aspect, count in data['top_aspects'].head(7).items():
            sentiment_breakdown = data['aspect_sentiment_matrix'].loc[aspect].to_dict() if aspect in data['aspect_sentiment_matrix'].index else {}
            prompt += f"\n- {aspect}: mentioned {count} times"
            prompt += f"\n  Sentiment: {sentiment_breakdown}"

        prompt += f"""

💡 RECOMMENDATIONS FOR IMPROVEMENT:
Based on negative aspects and emotions:
"""
        # Add negative aspect analysis
        if len(data['aspect_sentiment_matrix']) > 0:
            negative_aspects = data['aspect_sentiment_matrix'][data['aspect_sentiment_matrix']['negative'] > 0].index.tolist() if 'negative' in data['aspect_sentiment_matrix'].columns else []
            if negative_aspects:
                prompt += f"\n- Address issues in: {negative_aspects[:5]}"

        # Add negative emotion analysis
        negative_emotions = ['anger', 'sadness', 'fear', 'disgust']
        emotion_neg = data['emotion_distribution'][data['emotion_distribution'].index.isin(negative_emotions)]
        if len(emotion_neg) > 0:
            prompt += f"\n- Negative emotions detected: {emotion_neg.to_dict()}"

    return prompt

# Example usage
print(generate_llm_summary(master_df, analysis_long, 'EECS 2021', 2022, 2023))
print("\n" + "="*80)
print(generate_llm_summary(master_df, analysis_long, 'EECS 2021', 2022, section='B'))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def create_radar_chart(data, categories, values1, values2, group1_name, group2_name, title):
    """
    Create a radar chart comparing two groups across categories

    Parameters:
    - data: dataframe
    - categories: list of aspect/emotion names
    - values1: list of values for first group
    - values2: list of values for second group
    - group1_name, group2_name: labels for the groups
    - title: chart title
    """
    # Number of categories
    N = len(categories)

    # What will be the angle of each axis in the plot
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # Close the loop

    # Initialize the spider plot
    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))

    # Add the values (and close the loop)
    values1 += values1[:1]
    values2 += values2[:1]

    # Draw one line per group
    ax.plot(angles, values1, 'o-', linewidth=2, label=group1_name, color='#1f77b4')
    ax.fill(angles, values1, alpha=0.1, color='#1f77b4')

    ax.plot(angles, values2, 'o-', linewidth=2, label=group2_name, color='#ff7f0e')
    ax.fill(angles, values2, alpha=0.1, color='#ff7f0e')

    # Set category labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, size=10)

    # Set y-axis limits (adjust based on your data)
    max_val = max(max(values1[:-1]), max(values2[:-1]))
    ax.set_ylim(0, max_val * 1.1)

    # Add gridlines
    ax.yaxis.grid(True)
    ax.xaxis.grid(True)

    # Add title and legend
    plt.title(title, size=14, pad=20)
    plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))

    plt.tight_layout()
    return fig, ax


def compare_sections_radar(analysis_long, course_code, year, section_a, section_b, metric='count', top_n=8):
    """
    Compare two sections using radar chart

    Parameters:
    - metric: 'count' for mention count, 'percentage' for % of reviews, 'sentiment' for sentiment score
    """
    # Filter data
    data = analysis_long[
        (analysis_long['course_code'] == course_code) &
        (analysis_long['academic_year'] == year) &
        (analysis_long['section'].isin([section_a, section_b]))
    ]

    if len(data) == 0:
        print("No data found")
        return

    # Get top aspects overall for these sections
    top_aspects = data['aspect'].value_counts().head(top_n).index.tolist()

    # Calculate values for each section
    values_a = []
    values_b = []

    total_reviews_a = len(data[data['section'] == section_a]['review'].unique())
    total_reviews_b = len(data[data['section'] == section_b]['review'].unique())

    for aspect in top_aspects:
        if metric == 'count':
            # Raw mention count
            val_a = len(data[(data['section'] == section_a) & (data['aspect'] == aspect)])
            val_b = len(data[(data['section'] == section_b) & (data['aspect'] == aspect)])

        elif metric == 'percentage':
            # Percentage of reviews mentioning this aspect
            val_a = len(data[(data['section'] == section_a) & (data['aspect'] == aspect)]) / total_reviews_a * 100
            val_b = len(data[(data['section'] == section_b) & (data['aspect'] == aspect)]) / total_reviews_b * 100

        elif metric == 'sentiment':
            # Average sentiment score (positive=1, neutral=0, negative=-1)
            sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}
            aspect_data_a = data[(data['section'] == section_a) & (data['aspect'] == aspect)]
            aspect_data_b = data[(data['section'] == section_b) & (data['aspect'] == aspect)]

            if len(aspect_data_a) > 0:
                val_a = aspect_data_a['aspect_sentiment'].map(sentiment_map).mean()
            else:
                val_a = 0

            if len(aspect_data_b) > 0:
                val_b = aspect_data_b['aspect_sentiment'].map(sentiment_map).mean()
            else:
                val_b = 0

        values_a.append(val_a)
        values_b.append(val_b)

    # Create chart
    title = f"{course_code} ({year}) - Section {section_a} vs Section {section_b}\nAspect {metric.capitalize()} Comparison"
    fig, ax = create_radar_chart(
        data, top_aspects, values_a, values_b,
        f"Section {section_a}", f"Section {section_b}", title
    )

    return fig, ax


def compare_years_radar(analysis_long, course_code, year_a, year_b, metric='count', top_n=8):
    """
    Compare two years using radar chart
    """
    # Filter data
    data = analysis_long[
        (analysis_long['course_code'] == course_code) &
        (analysis_long['academic_year'].isin([year_a, year_b]))
    ]

    if len(data) == 0:
        print("No data found")
        return

    # Get top aspects overall for these years
    top_aspects = data['aspect'].value_counts().head(top_n).index.tolist()

    # Calculate values for each year
    values_a = []
    values_b = []

    total_reviews_a = len(data[data['academic_year'] == year_a]['review'].unique())
    total_reviews_b = len(data[data['academic_year'] == year_b]['review'].unique())

    for aspect in top_aspects:
        if metric == 'count':
            # Raw mention count
            val_a = len(data[(data['academic_year'] == year_a) & (data['aspect'] == aspect)])
            val_b = len(data[(data['academic_year'] == year_b) & (data['aspect'] == aspect)])

        elif metric == 'percentage':
            # Percentage of reviews mentioning this aspect
            val_a = len(data[(data['academic_year'] == year_a) & (data['aspect'] == aspect)]) / total_reviews_a * 100
            val_b = len(data[(data['academic_year'] == year_b) & (data['aspect'] == aspect)]) / total_reviews_b * 100

        elif metric == 'sentiment':
            # Average sentiment score
            sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}
            aspect_data_a = data[(data['academic_year'] == year_a) & (data['aspect'] == aspect)]
            aspect_data_b = data[(data['academic_year'] == year_b) & (data['aspect'] == aspect)]

            if len(aspect_data_a) > 0:
                val_a = aspect_data_a['aspect_sentiment'].map(sentiment_map).mean()
            else:
                val_a = 0

            if len(aspect_data_b) > 0:
                val_b = aspect_data_b['aspect_sentiment'].map(sentiment_map).mean()
            else:
                val_b = 0

    values_a.append(values_a[0])  # Close the loop
    values_b.append(values_b[0])  # Close the loop

    # Create chart
    title = f"{course_code} - {year_a} vs {year_b}\nAspect {metric.capitalize()} Comparison"
    fig, ax = create_radar_chart(
        data, top_aspects, values_a, values_b,
        str(year_a), str(year_b), title
    )

    return fig, ax


def compare_emotions_radar(master_df, course_code, group1, group2, group1_name, group2_name, group_type='year'):
    """
    Compare emotion distributions using radar chart
    """
    if group_type == 'year':
        data1 = master_df[(master_df['course_code'] == course_code) & (master_df['academic_year'] == group1)]
        data2 = master_df[(master_df['course_code'] == course_code) & (master_df['academic_year'] == group2)]
    else:  # section
        data1 = master_df[(master_df['course_code'] == course_code) & (master_df['section'] == group1)]
        data2 = master_df[(master_df['course_code'] == course_code) & (master_df['section'] == group2)]

    # Get top emotions
    all_emotions = pd.concat([data1['dominant_emotion'], data2['dominant_emotion']]).value_counts().head(8).index.tolist()

    # Calculate percentages
    values1 = [(data1['dominant_emotion'] == emotion).mean() * 100 for emotion in all_emotions]
    values2 = [(data2['dominant_emotion'] == emotion).mean() * 100 for emotion in all_emotions]

    # Create chart
    title = f"{course_code} - Emotion Distribution Comparison\n{group1_name} vs {group2_name}"
    fig, ax = create_radar_chart(
        None, all_emotions, values1, values2,
        group1_name, group2_name, title
    )
    ax.set_ylim(0, max(max(values1), max(values2)) * 1.1)

    return fig, ax


# ============================================
# EXAMPLE USAGE
# ============================================

# 1. Compare two sections (by aspect mention percentage)
fig1, _ = compare_sections_radar(
    analysis_long,
    course_code='EECS 2021',
    year=2022,
    section_a='A',
    section_b='B',
    metric='percentage',
    top_n=8
)
plt.show()

# 2. Compare two years (by aspect sentiment)
fig2, _ = compare_years_radar(
    analysis_long,
    course_code='EECS 2021',
    year_a=2022,
    year_b=2023,
    metric='sentiment',
    top_n=8
)
plt.show()

# 3. Compare emotions between two sections
fig3, _ = compare_emotions_radar(
    master_df,
    course_code='EECS 2021',
    group1='A',
    group2='B',
    group1_name='Section A',
    group2_name='Section B',
    group_type='section'
)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_course_comparison(analysis_long, course_code, years=None):
    """
    Create visualization comparing course across years
    """
    if years:
        data = analysis_long[
            (analysis_long['course_code'] == course_code) &
            (analysis_long['academic_year'].isin(years))
        ]
    else:
        data = analysis_long[analysis_long['course_code'] == course_code]

    # Aspect frequency by year
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Sentiment distribution by year
    sentiment_by_year = pd.crosstab(data['academic_year'], data['Sentiment_Label'])
    sentiment_by_year.plot(kind='bar', ax=axes[0,0], title='Sentiment Distribution by Year')
    axes[0,0].set_xlabel('Academic Year')
    axes[0,0].set_ylabel('Count')

    # 2. Top aspects overall
    top_aspects = data['aspect'].value_counts().head(10)
    top_aspects.plot(kind='barh', ax=axes[0,1], title='Top 10 Aspects Mentioned')
    axes[0,1].set_xlabel('Count')

    # 3. Aspect sentiment heatmap by year
    aspect_year_sentiment = data.groupby(['academic_year', 'aspect', 'aspect_sentiment']).size().unstack().fillna(0)
    if len(aspect_year_sentiment) > 0:
        sns.heatmap(aspect_year_sentiment.head(20), ax=axes[1,0], annot=True, fmt='g', cmap='Blues')
        axes[1,0].set_title('Aspect Sentiment by Year')

    # 4. Emotion distribution by year
    emotion_by_year = pd.crosstab(data['academic_year'], data['dominant_emotion'])
    emotion_by_year.plot(kind='bar', ax=axes[1,1], title='Emotion Distribution by Year', legend=False)
    axes[1,1].set_xlabel('Academic Year')

    plt.tight_layout()
    plt.show()

#Example
plot_course_comparison(analysis_long, 'EECS 2021', [2022, 2023, 2024])

In [None]:
def plot_section_comparison(analysis_long, course_code, year):
    """
    Create visualization comparing different sections of the same course in a specific year
    """
    # Filter data for specific course and year
    data = analysis_long[
        (analysis_long['course_code'] == course_code) &
        (analysis_long['academic_year'] == year)
    ]

    if len(data) == 0:
        print(f"No data found for {course_code} in {year}")
        return

    sections = sorted(data['section'].unique())
    print(f"Comparing sections: {sections}")

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(f'{course_code} ({year}) - Section Comparison', fontsize=16)

    # 1. Sentiment distribution by section
    sentiment_by_section = pd.crosstab(data['section'], data['Sentiment_Label'])
    sentiment_by_section.plot(kind='bar', ax=axes[0,0], title='Sentiment Distribution by Section', color=['#ff9999', '#66b3ff', '#99ff99'])
    axes[0,0].set_xlabel('Section')
    axes[0,0].set_ylabel('Count')
    axes[0,0].legend(title='Sentiment')

    # 2. Top aspects overall (across all sections)
    top_aspects = data['aspect'].value_counts().head(8)
    top_aspects.plot(kind='barh', ax=axes[0,1], title='Top 8 Aspects Mentioned (All Sections)', color='skyblue')
    axes[0,1].set_xlabel('Count')

    # 3. Aspect frequency by section (heatmap)
    aspect_by_section = pd.crosstab(data['section'], data['aspect'])
    if len(aspect_by_section.columns) > 0:
        # Show top aspects only for readability
        top_aspect_names = data['aspect'].value_counts().head(8).index.tolist()
        aspect_by_section_top = aspect_by_section[top_aspect_names]

        sns.heatmap(aspect_by_section_top, ax=axes[1,0], annot=True, fmt='g', cmap='YlOrRd')
        axes[1,0].set_title('Top Aspect Mentions by Section')
        axes[1,0].set_xlabel('Aspect')
        axes[1,0].set_ylabel('Section')

    # 4. Emotion distribution by section
    emotion_by_section = pd.crosstab(data['section'], data['dominant_emotion'])
    emotion_by_section.plot(kind='bar', ax=axes[1,1], title='Emotion Distribution by Section', stacked=True, colormap='viridis')
    axes[1,1].set_xlabel('Section')
    axes[1,1].set_ylabel('Count')
    axes[1,1].legend(title='Emotion', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\n" + "="*60)
    print(f"SUMMARY STATISTICS BY SECTION - {course_code} ({year})")
    print("="*60)

    for section in sections:
        section_data = data[data['section'] == section]
        print(f"\n📌 Section {section}:")
        print(f"   Total reviews: {len(section_data)}")
        print(f"   Sentiment: {section_data['Sentiment_Label'].value_counts().to_dict()}")
        print(f"   Top 3 emotions: {section_data['dominant_emotion'].value_counts().head(3).to_dict()}")
        print(f"   Top 3 aspects: {section_data['aspect'].value_counts().head(3).to_dict()}")


def plot_section_sentiment_breakdown(analysis_long, course_code, year):
    """
    Additional detailed visualization showing aspect sentiment breakdown by section
    """
    data = analysis_long[
        (analysis_long['course_code'] == course_code) &
        (analysis_long['academic_year'] == year)
    ]

    if len(data) == 0:
        print(f"No data found for {course_code} in {year}")
        return

    sections = data['section'].unique()

    # Create a figure with subplots for each section
    fig, axes = plt.subplots(1, len(sections), figsize=(6*len(sections), 6))
    if len(sections) == 1:
        axes = [axes]

    fig.suptitle(f'{course_code} ({year}) - Aspect Sentiment Breakdown by Section', fontsize=14)

    for idx, section in enumerate(sorted(sections)):
        section_data = data[data['section'] == section]

        # Get top aspects for this section
        top_aspects = section_data['aspect'].value_counts().head(5).index.tolist()
        aspect_sentiment = section_data[section_data['aspect'].isin(top_aspects)]

        # Create grouped bar chart
        sentiment_counts = pd.crosstab(
            aspect_sentiment['aspect'],
            aspect_sentiment['aspect_sentiment']
        )

        # Reorder to have all sentiment columns
        for sentiment in ['positive', 'neutral', 'negative']:
            if sentiment not in sentiment_counts.columns:
                sentiment_counts[sentiment] = 0

        sentiment_counts = sentiment_counts[['positive', 'neutral', 'negative']]

        sentiment_counts.plot(
            kind='bar',
            ax=axes[idx],
            title=f'Section {section} (n={len(section_data)})',
            color=['#99ff99', '#66b3ff', '#ff9999']
        )
        axes[idx].set_xlabel('Aspect')
        axes[idx].set_ylabel('Count')
        axes[idx].tick_params(axis='x', rotation=45)

        if idx < len(sections) - 1:
            axes[idx].legend().remove()

    plt.tight_layout()
    plt.show()


# Example usage
# Compare sections within same year
plot_section_comparison(analysis_long, 'EECS 2021', 2022)

# Detailed aspect sentiment breakdown by section
plot_section_sentiment_breakdown(analysis_long, 'EECS 2021', 2022)

In [None]:
def compare_two_sections(analysis_long, course_code, year, section_a, section_b):
    """
    Direct comparison between two specific sections
    """
    data = analysis_long[
        (analysis_long['course_code'] == course_code) &
        (analysis_long['academic_year'] == year) &
        (analysis_long['section'].isin([section_a, section_b]))
    ]

    if len(data) == 0:
        print(f"No data found for {course_code} in {year}")
        return

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f'{course_code} ({year}) - Section {section_a} vs Section {section_b}', fontsize=14)

    # 1. Sentiment comparison
    sentiment_compare = pd.crosstab(data['section'], data['Sentiment_Label'])
    sentiment_compare.plot(kind='bar', ax=axes[0,0], title='Sentiment Comparison', color=['#ff9999', '#66b3ff', '#99ff99'])
    axes[0,0].set_xlabel('Section')
    axes[0,0].set_ylabel('Count')

    # 2. Top aspects comparison
    for idx, section in enumerate([section_a, section_b]):
        section_data = data[data['section'] == section]
        top_aspects = section_data['aspect'].value_counts().head(5)

        axes[1, idx].barh(top_aspects.index, top_aspects.values, color='skyblue')
        axes[1, idx].set_title(f'Section {section} - Top Aspects')
        axes[1, idx].set_xlabel('Count')

    # 3. Emotion comparison
    emotion_compare = pd.crosstab(data['section'], data['dominant_emotion'])
    emotion_compare.plot(kind='bar', ax=axes[0,1], title='Emotion Comparison', stacked=True, colormap='viridis')
    axes[0,1].set_xlabel('Section')
    axes[0,1].set_ylabel('Count')
    axes[0,1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

    # Print comparison stats
    print("\n" + "="*60)
    print(f"DIRECT COMPARISON: Section {section_a} vs Section {section_b}")
    print("="*60)

    for section in [section_a, section_b]:
        section_data = data[data['section'] == section]
        print(f"\n📌 Section {section}:")
        print(f"   Reviews: {len(section_data)}")
        print(f"   Positive: {section_data['Sentiment_Label'].value_counts().get('positive', 0)}")
        print(f"   Negative: {section_data['Sentiment_Label'].value_counts().get('negative', 0)}")
        print(f"   Most common emotion: {section_data['dominant_emotion'].mode()[0] if len(section_data) > 0 else 'N/A'}")
        print(f"   Most discussed aspect: {section_data['aspect'].mode()[0] if len(section_data) > 0 else 'N/A'}")


# Example usage
compare_two_sections(analysis_long, 'EECS 2021', 2022, 'M', 'B')

In [None]:
# For section comparison within a specific year
def plot_aspect_sentiment_by_section(data, course_code, year):
    """
    Create heatmap showing aspect sentiment by section
    """
    # Filter data for specific course and year
    section_data = data[
        (data['course_code'] == course_code) &
        (data['academic_year'] == year)
    ]

    if len(section_data) == 0:
        print(f"No data found for {course_code} in {year}")
        return

    # Create aspect sentiment heatmap by section
    aspect_section_sentiment = section_data.groupby(
        ['section', 'aspect', 'aspect_sentiment']
    ).size().unstack().fillna(0)

    if len(aspect_section_sentiment) > 0:
        plt.figure(figsize=(12, 8))
        sns.heatmap(
            aspect_section_sentiment.head(20),
            annot=True,
            fmt='g',
            cmap='YlOrRd',
            cbar_kws={'label': 'Count'}
        )
        plt.title(f'Aspect Sentiment by Section - {course_code} ({year})', fontsize=14)
        plt.xlabel('Sentiment')
        plt.ylabel('Section - Aspect')
        plt.tight_layout()
        plt.show()

        # Also show the data
        print("\nAspect-Sentiment Counts by Section:")
        print(aspect_section_sentiment.head(15))

# For comparing across years (your original)
def plot_aspect_sentiment_by_year(data, course_code):
    """
    Create heatmap showing aspect sentiment by year
    """
    year_data = data[data['course_code'] == course_code]

    aspect_year_sentiment = year_data.groupby(
        ['academic_year', 'aspect', 'aspect_sentiment']
    ).size().unstack().fillna(0)

    if len(aspect_year_sentiment) > 0:
        plt.figure(figsize=(12, 8))
        sns.heatmap(
            aspect_year_sentiment.head(20),
            annot=True,
            fmt='g',
            cmap='Blues',
            cbar_kws={'label': 'Count'}
        )
        plt.title(f'Aspect Sentiment by Year - {course_code}', fontsize=14)
        plt.xlabel('Sentiment')
        plt.ylabel('Year - Aspect')
        plt.tight_layout()
        plt.show()

# Example usage:
plot_aspect_sentiment_by_section(analysis_long, 'EECS 2021', 2022)
# plot_aspect_sentiment_by_year(analysis_long, 'EECS 2021')

!pip install -q streamlit
%%writefile app.py

import streamlit as st
import pandas as pd
import plotly.express as px  # More interactive than matplotlib for Streamlit


usecase_df = master_df.copy()

# Sidebar controls
st.sidebar.title("Course Evaluation Analyzer")
course = st.sidebar.selectbox("Select Course", df['course_code'].unique())
analysis_type = st.sidebar.radio("Compare", ["Across Years", "Within Year (Sections)"])

if analysis_type == "Across Years":
    years = st.sidebar.multiselect("Select Years",
                                   sorted(df[df['course_code']==course]['academic_year'].unique()))
    # Generate year comparison charts
else:
    year = st.sidebar.selectbox("Select Year",
                                sorted(df[df['course_code']==course]['academic_year'].unique()))
    sections = st.sidebar.multiselect("Select Sections",
                                      df[(df['course_code']==course) &
                                         (df['academic_year']==year)]['section'].unique())
    # Generate section comparison charts
    

In [None]:
%%capture
# NUCLEAR OPTION - Reset everything
!pip uninstall transformers unsloth unsloth_zoo -y
!pip install --upgrade transformers
!pip install accelerate bitsandbytes
!pip install -U bitsandbytes transformers accelerate

In [None]:
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('HuggingFaceToken')
login(token=hf_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os
import bitsandbytes as bnb # Ensure bitsandbytes is imported

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    token=hf_token
)