<a href="https://colab.research.google.com/github/viltet/thesis/blob/main/scripts/asba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install necessary libraries if they are not already in the Colab environment
!pip install pandas tqdm torch transformers scikit-learn spacy -q
!python -m spacy download en_core_web_sm -q

# Mount Google Drive to access your files
from google.colab import drive
drive.mount('/content/drive')

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/12.8 MB[0m [31m184.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.8/12.8 MB[0m [31m237.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.8/12.8 MB[0m [31m237.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m108.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option

In [5]:
#@title 1. Loading libraries

import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm # Progress bar for loops
import torch # PyTorch for deep learning models
import re # Regular expressions for text processing
import gc # Garbage collector for memory management
import spacy # For sentence splitting
from transformers import AutoTokenizer, AutoModelForSequenceClassification # Hugging Face Transformers
import os # For directory creation

# Let DataFrame cells show up to 200 characters so review sentences are readable in previews
pd.options.display.max_colwidth = 200
print("Libraries imported.")

Libraries imported.


In [10]:
#@title 2. Configuration Parameters & Path Setup

# --- Main Configuration ---
ASSISTANT_NAMES = ["alexa", "google"]  # one "<name>_with_topics.csv" input is looked up per entry
MODEL_DEBERTA_ABSA = 'yangheng/deberta-v3-base-absa-v1.1'  # Hugging Face id of the ABSA classifier
MAX_SEQ_LENGTH = 512  # token limit handed to the tokenizer (longer inputs are truncated)
PROCESS_ALL_REVIEWS = True  # False -> only the first NUM_REVIEWS_TO_PROCESS_DEBUG reviews are used
NUM_REVIEWS_TO_PROCESS_DEBUG = 100  # subset size for quick debugging runs

# --- Google Drive Path Setup ---
# Root folder of the thesis project on the mounted Drive. It must contain a
# 'results' subfolder holding the input CSVs (e.g. alexa_with_topics.csv).
THESIS_ROOT_DRIVE = Path("/content/drive/MyDrive/MyThesisProject/")

# --- Derived Paths ---
# Inputs are read from <root>/results; ABSA outputs go to a dedicated subfolder
# of that same directory so Colab runs are kept apart from other results.
input_dir = THESIS_ROOT_DRIVE / "results"
output_dir = input_dir / "absa_full_results_colab"
output_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

print(f"THESIS_ROOT on Drive set to: {THESIS_ROOT_DRIVE.resolve()}")
print(f"Expecting input CSVs in: {input_dir.resolve()}")
print(f"Output directory for ABSA results: {output_dir.resolve()}")

# --- Device Setup (GPU/CPU) ---
# DEVICE is the torch device the model and inputs are moved to.
# DEVICE_NAME_INFO is a human-readable device name; it is bound on both branches
# so later code can read it without a NameError on CPU-only runtimes.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    DEVICE_NAME_INFO = torch.cuda.get_device_name(0)
    print(f"Using GPU: {DEVICE_NAME_INFO}")
else:
    DEVICE = torch.device("cpu")
    DEVICE_NAME_INFO = "CPU"
    print("Using CPU. Processing will be slower. Ensure GPU is enabled in Colab Runtime settings.")

# --- Aspect Taxonomy ---
# Maps each aspect category to the keywords whose presence in a sentence signals it.
# Each keyword is listed at most once per category.
# NOTE(review): several keywords are deliberately or accidentally shared between
# categories (e.g. "command", "answer", "smart", "learn", "personal"). The
# keyword -> category map built from this dict keeps only the LAST category that
# lists a keyword, so the order of categories here decides where shared keywords land.
taxonomy = {
    "Functionality & Performance": [
        "command", "task", "function", "request", "execute", "perform", "play", "control",
        "music", "timer", "alarm", "respond", "slow", "fast", "quick", "accurate", "ability",
        "capability", "feature", "work", "operation", "answer", "weather", "news", "skill",
        "search", "query", "song", "playlist", "speed", "performance", "reliable", "inconsistent",
        "consistent", "accomplish", "smart", "intelligence", "stupid", "dumb", "basic"
    ],
    "Voice Recognition": [
        "hear", "listen", "recognize", "understanding", "mic", "voice", "accent", "speech",
        "microphone", "wake", "alexa", "hey google", "ok google", "command", "activation",
        "trigger", "phrase", "call", "name", "hear me", "misheard", "mishear", "understand",
        "detection", "sensitivity", "pronunciation", "dialect", "language", "recognition"
    ],
    "Knowledge Base": [
        "answer", "knowledge", "info", "response", "fact", "question", "data", "correct",
        "wrong", "information", "knowing", "research", "source", "accurate", "inaccurate",
        "encyclopedia", "intelligence", "smart", "learn", "education", "informed", "wisdom",
        "trivia", "facts", "content", "query", "request", "respond"
    ],
    "Integration & Ecosystem": [
        "integrate", "connect", "compatible", "device", "home", "nest", "smart home", "ecosystem",
        "philips", "hue", "lights", "thermostat", "tv", "television", "speaker", "app", "phone",
        "smartphone", "skill", "third-party", "partner", "service", "platform", "sync",
        "connection", "pair", "bluetooth", "wifi", "wireless", "smart", "bulb", "plug", "switch",
        "camera", "doorbell", "lock", "appliance", "interoperability", "echo", "home mini"
    ],
    "Usability & Interface": [
        "setup", "interface", "easy", "use", "design", "confusing", "intuitive", "simple",
        "complicated", "difficult", "user-friendly", "accessibility", "accessible", "learn",
        "instructions", "guide", "tutorial", "help", "clear", "straightforward", "configuration",
        "settings", "customize", "personalize", "navigate", "interaction", "command structure"
    ],
    "Privacy & Security": [
        "privacy", "data", "listening", "security", "surveillance", "record", "spy", "collect",
        "tracking", "concern", "worry", "safe", "unsafe", "breach", "leak", "consent", "permission",
        "trust", "trustworthy", "creepy", "scary", "suspicious", "watching", "monitoring", "gdpr",
        "policy", "terms", "agreement", "encryption", "protected", "vulnerable", "hack", "risk",
        "danger", "paranoid", "microphone", "camera", "recording", "personal", "information", "location"
    ],
    "Updates & Evolution": [
        "update", "version", "bug", "feature", "release", "patch", "upgrade", "improve",
        "improvement", "fix", "issue", "problem", "solved", "downgrade", "regression", "change",
        "changed", "new", "added", "removed", "missing", "development", "roadmap", "progress",
        "evolve", "evolution", "grow", "maturity", "mature", "immature", "beta", "alpha", "stable"
    ],
    "Support & Service": [
        "support", "help", "service", "issue", "resolution", "customer", "contact", "call",
        "phone", "email", "chat", "representative", "agent", "ticket", "case", "response",
        "warranty", "replacement", "refund", "return", "satisfaction", "dissatisfaction",
        "frustrated", "complaint", "feedback", "solve", "solution", "troubleshoot", "repair"
    ],
    "Social & Emotional Aspects": [
        "personality", "character", "funny", "humor", "joke", "laugh", "fun", "entertaining",
        "companion", "friend", "relationship", "emotion", "emotional", "human-like", "humanlike",
        "personal", "personable", "warm", "cold", "robotic", "mechanical", "natural", "unnatural",
        "conversation", "conversational", "chat", "talk", "dialogue", "interaction", "interactive",
        "respond", "response", "reply", "engaging", "engage", "connection", "connect", "relate"
    ],
    "Personalization & Intelligence": [
        "personalize", "customize", "preference", "learn", "adapt", "suggest", "recommendation",
        "profile", "account", "user", "individual", "specific", "tailored", "custom", "habit",
        "routine", "pattern", "predict", "predictive", "anticipate", "remember", "memory",
        "context", "contextual", "awareness", "recognize", "familiar", "personal", "special",
        "unique", "adjust", "adaptation", "history", "previous", "past", "experience"
    ]
}
print("\nConfiguration and paths set.")


THESIS_ROOT on Drive set to: /content/drive/MyDrive/MyThesisProject
Expecting input CSVs in: /content/drive/MyDrive/MyThesisProject/results
Output directory for ABSA results: /content/drive/MyDrive/MyThesisProject/results/absa_full_results_colab
Using GPU: Tesla T4

Configuration and paths set.


In [11]:
#@title 3. Load Models & Prepare Keyword Matching Tools

# --- Load spaCy for sentence splitting ---
print("Loading spaCy model (en_core_web_sm)...")
try:
    nlp_spacy = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the pipeline package is not installed;
    # report it and re-raise because nothing downstream can run without it.
    print("spaCy model 'en_core_web_sm' not found. Ensure it was installed in Cell 0.")
    raise
else:
    # Raise nlp_spacy.max_length here (e.g. to 1500000) if reviews are extremely long.
    print("spaCy model loaded.")

# --- Prepare Keyword Matching Regex ---
# Lowercased keyword -> aspect category.
# NOTE(review): a keyword listed under several categories is assigned to the LAST
# category that lists it (later assignments overwrite earlier ones). The mapping
# stays str -> str because the processing loop looks categories up that way.
keyword_to_aspect_map = {}
for aspect_category, keywords in taxonomy.items():
    for keyword in keywords:
        keyword_to_aspect_map[keyword.lower()] = aspect_category

# Python's regex alternation is ordered: at a given position the first alternative
# that matches wins. Listing longer keywords first lets multi-word keywords such as
# "smart home", "home mini", "hear me" and "command structure" match instead of being
# shadowed by their one-word prefixes ("smart", "home", "hear", "command").
# Iterating the dict also yields each keyword exactly once, so no duplicate alternatives.
keywords_longest_first = sorted(keyword_to_aspect_map, key=len, reverse=True)
# Word boundaries restrict matches to whole words/phrases.
all_keywords_patterns = [r'\b' + re.escape(kw) + r'\b' for kw in keywords_longest_first]
keyword_regex = re.compile('|'.join(all_keywords_patterns), re.IGNORECASE)  # case-insensitive matching
print("Keyword regex compiled.")

# --- Load DeBERTa ABSA Model and Tokenizer ---
# Notebook-level handles used by the prediction helper; they stay None if loading fails.
absa_tokenizer, absa_model = None, None
print(f"Loading DeBERTa ABSA model: {MODEL_DEBERTA_ABSA}...")
try:
    absa_tokenizer = AutoTokenizer.from_pretrained(MODEL_DEBERTA_ABSA)
    absa_model = AutoModelForSequenceClassification.from_pretrained(MODEL_DEBERTA_ABSA).to(DEVICE)
    absa_model.eval()  # inference mode: disables dropout and similar training-only behaviour
except Exception as load_error:
    # Without the model the rest of the notebook cannot run, so report and re-raise.
    print(f"Error loading DeBERTa ABSA model: {load_error}")
    raise
else:
    print(f"DeBERTa ABSA model loaded successfully and moved to device: {DEVICE}")

Loading spaCy model (en_core_web_sm)...
spaCy model loaded.
Keyword regex compiled.
Loading DeBERTa ABSA model: yangheng/deberta-v3-base-absa-v1.1...
DeBERTa ABSA model loaded successfully and moved to device: cuda


In [12]:
#@title 4. Helper Functions (Memory Release & Sentiment Prediction)

def release_memory(model_to_del=None, tokenizer_to_del=None, custom_message=""):
    """
    Run garbage collection and clear PyTorch's CUDA cache.

    The two object arguments are accepted for convenience, but this function can
    only drop ITS OWN references to them. Memory is actually reclaimed only once
    the caller has also removed every other reference (e.g. ``del absa_model`` in
    the notebook namespace) before or after calling this helper.

    Args:
        model_to_del: Optional object whose local reference is dropped.
        tokenizer_to_del: Optional object whose local reference is dropped.
        custom_message (str): If non-empty, printed after the cleanup.

    Returns:
        None
    """
    # Unconditional `del` avoids evaluating the truthiness of arbitrary objects,
    # which may call __len__/__bool__ (and raises for multi-element tensors).
    del model_to_del, tokenizer_to_del
    gc.collect()  # Force garbage collection
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Return cached, unused CUDA blocks to the driver
    if custom_message:
        print(custom_message)


def predict_aspect_sentiment_batch(sentence_aspect_pairs, tokenizer, model, device_to_use, max_len, batch_size=32):
    """
    Predict a sentiment label for each (sentence_text, aspect_category) pair.

    The pairs are sent through the model in chunks of at most ``batch_size``.
    Each chunk is padded only to its own longest sequence (and truncated to
    ``max_len`` tokens), not to ``max_len``, to avoid computing on padding.

    Args:
        sentence_aspect_pairs: Sequence of (sentence_text, aspect_category) tuples.
        tokenizer: Callable tokenizer accepting two lists of strings (text, text pair)
            and returning an encoding with a ``.to(device)`` method.
        model: Sequence-classification model exposing ``.config.id2label`` and
            returning an object with ``.logits``.
        device_to_use: Device the encoded inputs are moved to.
        max_len (int): Maximum sequence length for truncation.
        batch_size (int): Maximum number of pairs per forward pass. Values that are
            not positive integers cause all pairs to be processed in one pass.

    Returns:
        list[str]: One label per input pair, in input order. Labels are 'Positive',
        'Negative' or 'Neutral' ('Neutral' is also the fallback for any other model
        label). Every pair of a chunk whose tokenization or inference raised is
        reported as 'Error'.
    """
    predictions = []
    if not sentence_aspect_pairs:
        return predictions

    pairs = list(sentence_aspect_pairs)
    if isinstance(batch_size, int) and batch_size > 0:
        chunk_size = batch_size
    else:
        chunk_size = len(pairs)

    expected_labels = ('Positive', 'Negative', 'Neutral')

    for start in range(0, len(pairs), chunk_size):
        chunk = pairs[start:start + chunk_size]
        # Unzip the pairs into parallel sequences of sentences and aspects
        sentences, aspects = zip(*chunk)

        try:
            inputs = tokenizer(
                list(sentences),  # tokenizer expects lists, not tuples
                list(aspects),
                truncation=True,
                padding='longest',  # pad to the longest sequence in this chunk only
                max_length=max_len,
                return_tensors='pt',
                add_special_tokens=True
            ).to(device_to_use)

            with torch.no_grad():  # no gradients needed for inference
                logits = model(**inputs).logits

            # Move to CPU before converting to plain Python ints
            predicted_class_ids = torch.argmax(logits, dim=-1).cpu().tolist()

            chunk_predictions = []
            for class_id in predicted_class_ids:
                pred_sentiment = model.config.id2label[class_id].capitalize()
                if pred_sentiment in expected_labels:
                    chunk_predictions.append(pred_sentiment)
                else:
                    chunk_predictions.append('Neutral')  # Fallback for unexpected labels
        except Exception as e:
            # Best effort: keep going, but make the failure visible instead of silent.
            print(
                f"Error during batched sentiment prediction ({type(e).__name__}: {e}). "
                f"Returning 'Error' for {len(chunk)} pair(s)."
            )
            chunk_predictions = ["Error"] * len(chunk)

        predictions.extend(chunk_predictions)

    return predictions

print("Helper functions defined.")

Helper functions defined.


In [None]:
#@title 5. Main Processing Loop (Full Scale ABSA)


PREDICTION_PROCESSING_BATCH_SIZE = 64 if DEVICE.type == 'cuda' else 8  # pairs per model call
MIN_SENTENCE_CHARS = 10  # sentences shorter than this (after stripping) are skipped


def _extract_aspect_mentions(review_id, review_text, review_timestamp):
    """
    Find the aspect categories mentioned in each sentence of one review.

    Uses the notebook-level ``nlp_spacy``, ``keyword_regex`` and
    ``keyword_to_aspect_map``. Each (sentence, aspect category) combination is
    reported once, with the first keyword that triggered it.

    Returns:
        list[dict]: one dict per mention with keys 'reviewId', 'sentence_text',
        'identified_aspect', 'matched_keyword' and 'timestamp'.
    """
    mentions = []
    # Slicing is a no-op for texts within spaCy's length limit.
    doc_spacy = nlp_spacy(review_text[:nlp_spacy.max_length])
    for sent in doc_spacy.sents:
        sentence_text = sent.text.strip()
        if len(sentence_text) < MIN_SENTENCE_CHARS:
            continue

        aspects_seen_in_sentence = set()
        for keyword in keyword_regex.findall(sentence_text):
            keyword_lower = keyword.lower()
            aspect_category = keyword_to_aspect_map.get(keyword_lower)
            if aspect_category is None or aspect_category in aspects_seen_in_sentence:
                continue
            aspects_seen_in_sentence.add(aspect_category)
            mentions.append({
                'reviewId': review_id,
                'sentence_text': sentence_text,
                'identified_aspect': aspect_category,
                'matched_keyword': keyword_lower,
                'timestamp': review_timestamp
            })
    return mentions


def _predict_and_store(pending_items, results_sink):
    """
    Predict the sentiment for a batch of mention dicts and append them to results_sink.

    The (sentence, aspect) pairs are derived from the mention dicts themselves, so
    predictions and metadata cannot get out of step.
    """
    if not pending_items:
        return
    pairs = [(item['sentence_text'], item['identified_aspect']) for item in pending_items]
    sentiments = predict_aspect_sentiment_batch(
        pairs,
        absa_tokenizer,
        absa_model,
        DEVICE,
        MAX_SEQ_LENGTH,
        batch_size=PREDICTION_PROCESSING_BATCH_SIZE
    )
    if len(sentiments) != len(pending_items):
        raise ValueError(
            f"Expected {len(pending_items)} predictions but received {len(sentiments)}."
        )
    for item, sentiment in zip(pending_items, sentiments):
        item['aspect_sentiment'] = sentiment
        results_sink.append(item)


for assistant in ASSISTANT_NAMES:
    print("\n=================================================================")
    print(f"Processing full dataset for: {assistant.upper()}")
    print("=================================================================")

    input_file = input_dir / f"{assistant}_with_topics.csv"
    output_file_path = output_dir / f"{assistant}_full_absa_sentiments_colab.csv"

    if not input_file.exists():
        print(f"Input file not found for {assistant}: {input_file}. Skipping.")
        continue

    try:
        df_reviews_full = pd.read_csv(input_file)
        print(f"Loaded {len(df_reviews_full)} reviews for {assistant}.")
    except Exception as e:
        print(f"Error loading {input_file}: {e}. Skipping.")
        continue

    # Validate required columns
    required_cols = ['reviewId', 'clean_content', 'at']
    if not all(col in df_reviews_full.columns for col in required_cols):
        print(f"One or more required columns ({', '.join(required_cols)}) missing in {input_file}. Skipping.")
        continue

    # Drop rows lacking any required value and make sure the text column holds strings.
    df_reviews_full = (
        df_reviews_full
        .dropna(subset=required_cols)
        .assign(clean_content=lambda frame: frame['clean_content'].astype(str))
    )

    # --- Subsetting Logic for Debugging or Full Run ---
    if PROCESS_ALL_REVIEWS:
        df_to_process = df_reviews_full.copy()
        print(f"Processing all {len(df_to_process)} reviews for {assistant}.")
    elif len(df_reviews_full) > NUM_REVIEWS_TO_PROCESS_DEBUG:
        df_to_process = df_reviews_full.head(NUM_REVIEWS_TO_PROCESS_DEBUG).copy()
        print(f"DEBUG MODE: Processing a subset of {len(df_to_process)} reviews for {assistant}.")
    else:
        df_to_process = df_reviews_full.copy()
        print(f"DEBUG MODE: Dataset for {assistant} has {len(df_to_process)} reviews (less than debug limit). Processing all available.")
    # --- End of Subsetting Logic ---

    all_aspect_sentiments_data = []  # finished mention dicts (with 'aspect_sentiment') for the final DataFrame
    pending_mentions = []            # mention dicts waiting for sentiment prediction

    print(f"Identifying aspects and preparing for sentiment prediction for {assistant}...")
    # Tuples follow the order of required_cols: (reviewId, clean_content, at).
    review_rows = df_to_process[required_cols].itertuples(index=False, name=None)
    for review_id_val, review_text, review_timestamp in tqdm(
        review_rows, total=len(df_to_process), desc=f"Reviews ({assistant})"
    ):
        if not review_text.strip():
            continue

        try:
            review_mentions = _extract_aspect_mentions(review_id_val, review_text, review_timestamp)
        except Exception as e:
            # Only this review is skipped; mentions already queued from earlier
            # reviews are valid and must not be discarded.
            print(f"Major error processing review ID {review_id_val}. Details: {e}. Skipping this review.")
            continue

        pending_mentions.extend(review_mentions)

        # Predict in full batches; any remainder stays queued for the next review.
        while len(pending_mentions) >= PREDICTION_PROCESSING_BATCH_SIZE:
            _predict_and_store(
                pending_mentions[:PREDICTION_PROCESSING_BATCH_SIZE],
                all_aspect_sentiments_data
            )
            del pending_mentions[:PREDICTION_PROCESSING_BATCH_SIZE]

    # Process any remaining items in the last (partial) batch
    _predict_and_store(pending_mentions, all_aspect_sentiments_data)
    pending_mentions = []

    # --- Saving Results ---
    df_results = None
    if all_aspect_sentiments_data:
        df_results = pd.DataFrame(all_aspect_sentiments_data)
        print(f"\nSaving {len(df_results)} aspect-sentiment pairs for {assistant} to {output_file_path}...")
        try:
            df_results.to_csv(output_file_path, index=False, encoding='utf-8')
            print(f"Saved successfully for {assistant}.")
            print("\nSample of results:")
            print(df_results.head())
        except Exception as e:
            print(f"Error saving results for {assistant} to CSV: {e}")
    else:
        print(f"No aspect-sentiment pairs found or generated for {assistant}.")

    # Free the per-assistant data before the next (possibly large) dataset is loaded.
    del df_results, df_to_process, df_reviews_full
    all_aspect_sentiments_data = []
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


print("\n--- Full-scale ABSA processing complete for all specified assistants. ---")


Processing full dataset for: ALEXA
Loaded 141067 reviews for alexa.
Processing all 141067 reviews for alexa.
Identifying aspects and preparing for sentiment prediction for alexa...


Reviews (alexa):   0%|          | 0/141067 [00:00<?, ?it/s]


Saving 277313 aspect-sentiment pairs for alexa to /content/drive/MyDrive/MyThesisProject/results/absa_full_results_colab/alexa_full_absa_sentiments_colab.csv...
Saved successfully for alexa.

Sample of results:
                               reviewId  \
0  ec6d4fdc-a343-4482-a68f-bb3640701560   
1  edec9bb6-a3dd-4d84-8712-a987124e5836   
2  e541aebf-83f4-4f14-828a-71d9643e15b8   
3  e541aebf-83f4-4f14-828a-71d9643e15b8   
4  33459b25-5c2e-41c2-8574-9ae74dae7e72   

                                                                                                                                                                                  sentence_text  \
0                                                                                                           edit item shopping list ridiculously hard allow delete section lot easy delete word   
1                                                                                                                                          

Reviews (google):   0%|          | 0/70252 [00:00<?, ?it/s]