In [1]:
##########################################################
# p2_polaritySubjectivity.py
##########################################################

import json
from textblob import TextBlob

# Input dataset: the JSON file named here is loaded below, annotated with
# sentiment labels, and then overwritten in place with the annotated records.
'''REPLACE THE FILENAME WITH THE FILE YOU WANT TO FILTER'''
# filename = f'filteredOverallRecords.json'  # Replace with your actual file name
filename = r'evaluationDataset.json'  # Replace with your actual file name

def get_sentiment(text):
    """Label a text's polarity and subjectivity using TextBlob.

    Args:
        text (str): The text to analyse.

    Returns:
        tuple[str, str]: ``(sentiment, subjectivity)``. ``sentiment`` is
        "positive", "negative" or "neutral" according to the sign of the
        TextBlob polarity score. ``subjectivity`` is "opinionated" when the
        subjectivity score is above 0.5 and "neutral" otherwise.
    """
    scores = TextBlob(text).sentiment
    polarity = scores.polarity

    # Only the sign of the polarity matters; a score of exactly 0 is "neutral".
    if polarity > 0:
        sentiment = "positive"
    else:
        sentiment = "negative" if polarity < 0 else "neutral"

    subjectivity = "opinionated" if scores.subjectivity > 0.5 else "neutral"
    return sentiment, subjectivity


# Read the whole dataset into memory; `data` is reused by later cells.
with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

# `data` is expected to be a list of post dictionaries, each with a "text" field.
for index, post in enumerate(data):
    # Report progress once every 100 posts.
    if not index % 100:
        print(f"{index}/{len(data)}: {index/len(data)*100:.2f}%")
    # Attach the TextBlob-derived labels to the post in place.
    polarity_label, subjectivity_label = get_sentiment(post["text"])
    post["polarity"] = polarity_label
    post["subjectivity"] = subjectivity_label

# Write the annotated posts back over the original JSON file.
with open(filename, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)
print('Done')


0/1000: 0.00%
100/1000: 10.00%
200/1000: 20.00%
300/1000: 30.00%
400/1000: 40.00%
500/1000: 50.00%
600/1000: 60.00%
700/1000: 70.00%
800/1000: 80.00%
900/1000: 90.00%
Done


In [2]:
# prompt: iterate through data and count the source

from collections import Counter

# Source of every post; posts without a "source" key are counted as 'Unknown'.
sources = [item.get('source', 'Unknown') for item in data]
source_counts = Counter(sources)

# Print a heading followed by one "source: count" line per distinct source.
# Nothing is printed when there are no posts.
if source_counts:
    report = ["Source Counts:"]
    report.extend(f"{source}: {count}" for source, count in source_counts.items())
    print("\n".join(report))


Source Counts:
reddit: 726
twitter: 141
linkedin: 133


## Aspect Extraction

### Option 1: Noun Chunking (Basic)

In [18]:
import spacy
# spaCy "en_core_web_sm" pipeline bound to the module-level name `nlp`, which the
# extraction helpers call. NOTE: later cells rebind `nlp` to "en_core_web_trf",
# so which model a helper uses depends on the order in which cells were run.
nlp = spacy.load("en_core_web_sm")

def do_aspect_extraction(text):
    """Return the text of every spaCy noun chunk found in ``text``.

    Args:
        text (str): Text to parse with the module-level ``nlp`` pipeline.

    Returns:
        list[str]: Noun-chunk strings, in the order spaCy yields them.
    """
    aspects = []
    for chunk in nlp(text).noun_chunks:
        aspects.append(chunk.text)
    return aspects


### Option 2: Enhanced Extraction with Modifiers

In [19]:
def do_aspect_extraction_v2(text):
    """Extract each noun together with its adjectival/compound modifiers.

    For every token tagged ``NOUN``, the texts of its direct children whose
    dependency label is ``amod`` or ``compound`` are placed in front of the
    noun and the words are joined with single spaces.

    Args:
        text (str): Text to parse with the module-level ``nlp`` pipeline.

    Returns:
        list[str]: One phrase per noun token, in document order.
    """
    modifier_deps = ("amod", "compound")

    def phrase_for(noun):
        # Modifiers first (in the order spaCy lists the children), then the noun.
        words = [child.text for child in noun.children if child.dep_ in modifier_deps]
        words.append(noun.text)
        return " ".join(words)

    return [phrase_for(token) for token in nlp(text) if token.pos_ == "NOUN"]


### Combined Option 1 & 2

In [5]:
def do_aspect_extraction_combined(text):
    """Extract aspects using both noun chunks and noun+modifier phrases.

    Two sources are merged:
      1. every spaCy noun chunk, and
      2. every ``NOUN`` token prefixed by its ``amod`` / ``compound`` children.

    Each candidate is stripped and lowercased, and kept only if it has at most
    four whitespace-separated words.

    Args:
        text (str): Text to parse with the module-level ``nlp`` pipeline.

    Returns:
        list[str]: Unique phrases in first-seen order (noun chunks first, then
        noun+modifier phrases). The order is deterministic, unlike a list built
        from a ``set`` of strings, whose order can change between interpreter
        runs.
    """
    max_words = 4  # longer candidates are discarded
    doc = nlp(text)

    # A dict is used as an insertion-ordered set so results are reproducible.
    seen = {}

    def keep(phrase):
        cleaned = phrase.strip().lower()
        if len(cleaned.split()) <= max_words:
            seen.setdefault(cleaned, None)

    # Approach 1: noun_chunks
    for chunk in doc.noun_chunks:
        keep(chunk.text)

    # Approach 2: NOUN + (amod / compound modifiers)
    for token in doc:
        if token.pos_ == "NOUN":
            modifiers = [child.text for child in token.children if child.dep_ in ("amod", "compound")]
            keep(" ".join(modifiers + [token.text]))

    return list(seen)


### Option 3

In [22]:
import spacy
import re

# Load spaCy's transformer-based model. This rebinds the module-level `nlp`
# (loaded earlier as "en_core_web_sm"), so every helper that calls `nlp` uses
# this pipeline once the cell has run.
nlp = spacy.load("en_core_web_trf")

def extract_aspects(text, min_len=3, domain_keywords=None):
    """
    Extracts aspect phrases (spaCy noun chunks) from a piece of text.

    Noun chunks are stripped and lowercased. A chunk is discarded entirely when
    any of its tokens is a stop word (the stop word is not merely removed from
    the chunk). The survivors are validated, de-duplicated, and phrases that
    occur inside another kept phrase are dropped.

    Args:
        text (str): The input text (e.g., Reddit comment or Tweet).
        min_len (int): Minimum length, in characters, of a phrase.
        domain_keywords (set or list): Optional keywords; when given, only
            phrases containing at least one keyword are kept. Matching is
            case-insensitive, because the phrases themselves are lowercased.

    Returns:
        List[str]: Unique aspect phrases in order of first occurrence.
    """

    # Process text with spaCy
    doc = nlp(text)

    # Chunks are lowercased below, so compare against lowercased keywords;
    # otherwise a keyword such as "AI" could never match.
    keywords = [kw.lower() for kw in domain_keywords] if domain_keywords else []

    # Step 1: Collect raw noun chunks, skipping any chunk that contains a stop word
    raw_chunks = [
        chunk.text.strip().lower()
        for chunk in doc.noun_chunks
        if not any(token.is_stop for token in chunk)
    ]

    # Step 2: Validate each chunk
    def is_valid(chunk):
        # Remove too short chunks
        if len(chunk) < min_len:
            return False
        # Remove chunks with only symbols or numbers
        if re.fullmatch(r"[^a-zA-Z]+", chunk):
            return False
        # Optional domain filter
        if keywords and not any(kw in chunk for kw in keywords):
            return False
        return True

    # Step 3: Remove redundant subphrases (like 'grander' if 'grander problems' exists).
    # Containment is a plain substring test, so it works on characters, not words.
    def remove_redundant(chunks):
        return [
            chunk for chunk in chunks
            if not any(chunk != other and chunk in other for other in chunks)
        ]

    # De-duplicate while keeping first-occurrence order. A set would make the
    # order depend on string hashing, which can differ between interpreter runs.
    filtered_chunks = list(dict.fromkeys(filter(is_valid, raw_chunks)))
    final_chunks = remove_redundant(filtered_chunks)

    return final_chunks


In [16]:
import spacy
import re

# Load spaCy's transformer-based model and bind it to the module-level `nlp`
# that extract_aspects_v2 calls. This repeats the load done for extract_aspects.
nlp = spacy.load("en_core_web_trf")

def extract_aspects_v2(text, min_len=3, domain_keywords=None):
    """
    Extracts aspect phrases (spaCy noun chunks) from a piece of text.

    Every noun chunk is stripped, lowercased and cleaned of punctuation
    (hashtags and hyphens are preserved). The cleaned phrases are validated,
    de-duplicated, and phrases that occur inside another kept phrase are
    dropped. No stop-word filtering is applied.

    Args:
        text (str): The input text (e.g., Reddit comment or Tweet).
        min_len (int): Minimum length, in characters, of a cleaned phrase.
        domain_keywords (set or list): Optional keywords; when given, only
            phrases containing at least one keyword are kept. Matching is
            case-insensitive, because the phrases themselves are lowercased.

    Returns:
        List[str]: Unique aspect phrases in order of first occurrence.
    """

    doc = nlp(text)

    # Chunks are lowercased below, so compare against lowercased keywords;
    # otherwise a keyword such as "AI" could never match.
    keywords = [kw.lower() for kw in domain_keywords] if domain_keywords else []

    # Step 1: Collect all noun chunks (no stopword filtering here)
    raw_chunks = [chunk.text.strip().lower() for chunk in doc.noun_chunks]

    # Step 2: Clean and normalize chunks
    def clean_chunk(chunk):
        # Drop every character that is not a word character, whitespace, '#' or '-'
        chunk = re.sub(r'[^\w\s#-]', '', chunk)
        # Collapse whitespace runs left behind by the removal
        chunk = re.sub(r'\s+', ' ', chunk).strip()
        return chunk

    cleaned_chunks = [clean_chunk(chunk) for chunk in raw_chunks]

    # Step 3: Filter valid chunks
    def is_valid(chunk):
        if len(chunk) < min_len:
            return False
        # Must contain at least one alpha character
        if not re.search(r"[a-zA-Z]", chunk):
            return False
        # Optional domain filter
        if keywords and not any(kw in chunk for kw in keywords):
            return False
        return True

    # Step 4: Remove redundant subphrases.
    # Containment is a plain substring test, so it works on characters, not words.
    def remove_redundant(chunks):
        return [
            chunk for chunk in chunks
            if not any(chunk != other and chunk in other for other in chunks)
        ]

    # De-duplicate while keeping first-occurrence order. A set would make the
    # order depend on string hashing, which can differ between interpreter runs.
    filtered_chunks = list(dict.fromkeys(filter(is_valid, cleaned_chunks)))
    final_chunks = remove_redundant(filtered_chunks)

    return final_chunks


### Option 4

In [6]:
from keybert import KeyBERT

# Load KeyBERT model: a module-level KeyBERT instance configured with the
# "all-mpnet-base-v2" embedding model; extract_aspects_keybert reads this global.
kw_model = KeyBERT(model="all-mpnet-base-v2")  # Uses transformer embeddings

def extract_aspects_keybert(text, num_aspects=5):
    """Extract aspect phrases from ``text`` with the module-level KeyBERT model.

    Args:
        text (str): Text to extract keyphrases from.
        num_aspects (int): Passed to KeyBERT as ``top_n``.

    Returns:
        list: The first element of every entry KeyBERT returns, i.e. the
        extracted phrase without whatever else accompanies it.
    """
    ranked = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=num_aspects,
    )
    phrases = []
    for entry in ranked:
        phrases.append(entry[0])
    return phrases

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Aspect-Based Sentiment Scoring


### Option 1: Pre-trained ABSA Model (transformers)

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch

# Hugging Face checkpoint used for aspect-based sentiment classification.
# `tokenizer` and `model` are module-level globals read by get_absa_sentiment.
model_name = "yangheng/deberta-v3-base-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_absa_sentiment(text, aspect):
    """Classify the sentiment expressed towards ``aspect`` within ``text``.

    The text and aspect are combined into a single prompt string, run through
    the module-level ``tokenizer`` and ``model``, and the highest-probability
    class is mapped to a label.

    Args:
        text (str): The full sentence or post.
        aspect (str): The aspect phrase to score.

    Returns:
        str: "Negative", "Neutral" or "Positive".
    """
    # NOTE(review): the "[CLS]"/"[SEP]" markers are written into the string by
    # hand; the tokenizer presumably adds its own special tokens as well.
    # Confirm this prompt format against the model card.
    input_text = f"[CLS] {text} [ASP] {aspect} [SEP]"
    inputs = tokenizer(input_text, return_tensors="pt")

    # Inference only: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    probs = softmax(outputs.logits, dim=1)
    # Assumes class indices 0/1/2 mean Negative/Neutral/Positive.
    # TODO confirm against model.config.id2label.
    labels = ["Negative", "Neutral", "Positive"]
    return labels[int(torch.argmax(probs))]


tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

## Combined ABSA Pipeline

In [8]:
def absa_pipeline(text):
    """Extract aspects with KeyBERT and score the sentiment of each one.

    Args:
        text (str): The sentence or post to analyse.

    Returns:
        dict: Maps each extracted aspect to the label returned by
        ``get_absa_sentiment``, or to "Unknown" when scoring that aspect
        raised an error.
    """
    aspects = extract_aspects_keybert(text)
    aspect_sentiments = {}
    for aspect in aspects:
        try:
            aspect_sentiments[aspect] = get_absa_sentiment(text, aspect)
        except Exception:
            # Best effort: one failing aspect must not abort the whole post.
            # Only Exception is caught, so KeyboardInterrupt / SystemExit still
            # stop a long-running cell.
            aspect_sentiments[aspect] = "Unknown"
    return aspect_sentiments


## Evaluation Table Example

In [9]:
len(data)

1000

In [21]:
# prompt: generate 10 random number from 0 to 999, no repeats

import random

def generate_unique_random_numbers(count, min_val, max_val):
    """Draw ``count`` distinct integers from the inclusive range ``[min_val, max_val]``.

    Args:
        count (int): How many numbers to draw.
        min_val (int): Smallest value that may be drawn.
        max_val (int): Largest value that may be drawn.

    Returns:
        list[int]: ``count`` unique values, in the order ``random.sample``
        produced them.

    Raises:
        ValueError: If ``count`` exceeds the number of integers in the range.
    """
    available = max_val - min_val + 1
    if available < count:
        raise ValueError("Cannot generate unique numbers within the specified range.")

    return random.sample(range(min_val, max_val + 1), count)

# Draw 10 distinct indices from 0..999. No random seed is set, so re-running
# this cell yields a different sample each time.
random_numbers = generate_unique_random_numbers(10, 0, 999)
# Fixed index lists that can be swapped in instead of a fresh draw
# (presumably samples kept from earlier runs; confirm):
# random_numbers = [478, 805, 22, 572, 728, 554, 645, 129, 100, 960]
# random_numbers = [620, 189, 100, 405, 676, 422, 176, 295, 673, 652]
random_numbers


[377, 68, 305, 71, 255, 147, 72, 660, 306, 203]

In [9]:
# Hand-written examples used to compare the baseline labels with the ABSA output.
sample_tweets = [
    "AI tools are great, but the job situation is scary.",
    "Automation is replacing jobs and it’s terrifying.",
    "Excited about GPT-4 but not sure how it affects hiring."
]

for tweet in sample_tweets:
    baseline = get_sentiment(tweet)
    absa = absa_pipeline(tweet)
    # One report per tweet, closed by a 60-character rule.
    print(
        f"TWEET: {tweet}",
        f"Baseline Sentiment: {baseline}",
        f"ABSA Results: {absa}",
        "-" * 60,
        sep="\n",
    )


TWEET: AI tools are great, but the job situation is scary.
Baseline Sentiment: ('positive', 'opinionated')
ABSA Results: {'AI tools': 'Positive', 'job': 'Negative', 'job situation': 'Negative'}
------------------------------------------------------------
TWEET: Automation is replacing jobs and it’s terrifying.
Baseline Sentiment: ('negative', 'opinionated')
ABSA Results: {'Automation': 'Negative', 'jobs': 'Negative'}
------------------------------------------------------------
TWEET: Excited about GPT-4 but not sure how it affects hiring.
Baseline Sentiment: ('positive', 'opinionated')
ABSA Results: {}
------------------------------------------------------------


In [11]:
# Compare baseline and ABSA output on the randomly selected dataset records.
for sample_index in random_numbers:
    text = data[sample_index]["text"]
    baseline = get_sentiment(text)
    absa = absa_pipeline(text)
    # One report per record, closed by a 60-character rule.
    print(
        f"TWEET: {text}",
        f"Baseline Sentiment: {baseline}",
        f"ABSA Results: {absa}",
        "-" * 60,
        sep="\n",
    )

TWEET: Master Your Skills with AI-Powered Learning Learn to evolve in the rapidly-changing job market.SmartCareer AI helps coaches uncover new opportunities for their clients and stay abreast of industry trends.#Skilldevelopment #AIAdvancement #CareerSuccess #AIInsights
Baseline Sentiment: ('positive', 'neutral')
ABSA Results: {'smartcareer ai': 'Positive', 'skills ai': 'Positive', 'skilldevelopment aiadvancement': 'Positive', 'ai helps': 'Positive', 'smartcareer': 'Positive'}
------------------------------------------------------------
TWEET: True, AI and automation are legitimate threats to the working class if things don't change. They don't have to be though. Technology developed for the purpose of making life better for everyone is good. Technology developed for the purpose of maximizing profits for shareholders is bad.
Baseline Sentiment: ('positive', 'opinionated')
ABSA Results: {'automation legitimate': 'Negative', 'automation': 'Negative', 'ai automation': 'Negative', 'ai': 'P

## ABSA Analysis (GPU + Batching)

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "yangheng/deberta-v3-base-absa-v1.1"
# Rebinds the module-level `tokenizer` and `model` from the earlier ABSA cell;
# the model is moved to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

def get_absa_sentiment(text, aspect):
    """Classify the sentiment expressed towards ``aspect`` within ``text``.

    The prompt is tokenized, moved to the module-level ``device`` and run
    through the module-level ``model`` with gradients disabled.

    Args:
        text (str): The full sentence or post.
        aspect (str): The aspect phrase to score.

    Returns:
        str: "Negative", "Neutral" or "Positive".
    """
    # Assumes class indices 0/1/2 mean Negative/Neutral/Positive.
    # TODO confirm against model.config.id2label.
    labels = ["Negative", "Neutral", "Positive"]

    encoded = tokenizer(f"[CLS] {text} [ASP] {aspect} [SEP]", return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = softmax(logits, dim=1)
    return labels[torch.argmax(probabilities)]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
def batch_absa_sentiment(text, aspects, batch_size=4):
    """
    Performs ABSA sentiment analysis in batches.

    Args:
        text (str): The sentence or post all aspects belong to.
        aspects (list[str]): Aspect phrases to score against ``text``.
        batch_size (int): Number of aspects sent through the model at once
            (default 4).

    Returns:
        dict: Maps each aspect to "Negative", "Neutral" or "Positive". A
        repeated aspect keeps the label from its last occurrence.
    """
    # Assumes class indices 0/1/2 mean Negative/Neutral/Positive.
    # TODO confirm against model.config.id2label.
    labels = ["Negative", "Neutral", "Positive"]
    all_sentiments = {}

    for start in range(0, len(aspects), batch_size):
        batch_aspects = aspects[start : start + batch_size]
        prompts = [f"[CLS] {text} [ASP] {aspect} [SEP]" for aspect in batch_aspects]
        # NOTE(review): truncation applies to the whole prompt string, so for
        # very long texts the trailing "[ASP] {aspect}" part is presumably what
        # gets cut off. Confirm.
        encoded = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            logits = model(**encoded).logits

        preds = torch.argmax(softmax(logits, dim=1), dim=1).cpu().tolist()

        # Record the label predicted for every aspect of this batch.
        for aspect, pred in zip(batch_aspects, preds):
            all_sentiments[aspect] = labels[pred]

    return all_sentiments

In [6]:
from collections import Counter

def get_overall_sentiment(sentiments_dict):
    """Return the most frequent sentiment label among the aspect labels.

    Args:
        sentiments_dict (dict): Maps aspect phrases to sentiment labels.

    Returns:
        str: The label that occurs most often. On a tie, the label encountered
        first wins. "Neutral" is returned when the mapping is empty.
    """
    if sentiments_dict:
        tally = Counter(sentiments_dict.values())
        # max() returns the first maximal key, i.e. the earliest-seen label on ties.
        return max(tally, key=tally.get)
    return "Neutral"  # Default if no aspects are found


In [12]:
def absa_pipeline(text):
    """Run aspect extraction, per-aspect scoring and aggregation for one text.

    Args:
        text (str): The sentence or post to analyse.

    Returns:
        dict: ``{"aspects": {aspect: label, ...}, "overall": label}``. When no
        aspects are extracted, "aspects" is empty and "overall" is "Neutral".
    """
    aspects = extract_aspects_v2(text)
    # Alternative extractor: aspects = do_aspect_extraction_v2(text)

    if aspects:
        aspect_sentiments = batch_absa_sentiment(text, aspects)
        overall_sentiment = get_overall_sentiment(aspect_sentiments)
        return {"aspects": aspect_sentiments, "overall": overall_sentiment}

    # Nothing to score.
    return {"aspects": {}, "overall": "Neutral"}

In [20]:
# Compare baseline and batched ABSA output on the randomly selected records.
for sample_index in random_numbers:
    text = data[sample_index]["text"]
    baseline = get_sentiment(text)
    absa = absa_pipeline(text)
    # One report per record, closed by a 60-character rule.
    print(
        f"TWEET: {text}",
        f"Baseline Sentiment: {baseline}",
        f"ABSA Aspects: {absa['aspects']}",
        f"ABSA Overall: {absa['overall']}",
        "-" * 60,
        sep="\n",
    )

TWEET: Master Your Skills with AI-Powered Learning Learn to evolve in the rapidly-changing job market.SmartCareer AI helps coaches uncover new opportunities for their clients and stay abreast of industry trends.#Skilldevelopment #AIAdvancement #CareerSuccess #AIInsights
Baseline Sentiment: ('positive', 'neutral')
ABSA Aspects: {'job': 'Positive', 'changing job market': 'Positive', 'coaches': 'Positive', 'new opportunities': 'Positive', 'clients': 'Positive', 'industry': 'Positive', 'trends.#Skilldevelopment': 'Positive', 'AIAdvancement': 'Positive', 'AIAdvancement #': 'Positive', 'CareerSuccess AIInsights': 'Positive'}
ABSA Overall: Positive
------------------------------------------------------------
TWEET: True, AI and automation are legitimate threats to the working class if things don't change. They don't have to be though. Technology developed for the purpose of making life better for everyone is good. Technology developed for the purpose of maximizing profits for shareholders is 

In [22]:
def _md_cell(value):
    """Return ``value`` as text that cannot break out of a Markdown table cell."""
    cell = str(value)
    # A raw "|" would start a new column and a raw line break would end the row.
    cell = cell.replace("|", "\\|")
    return cell.replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")


# Emit a Markdown table comparing baseline and ABSA results for the sampled records.
print("| Tweet | Baseline Sentiment | ABSA Aspects | ABSA Overall |")
print("|---|---|---|---|")  # Delimiter row between header and body

for sample_index in random_numbers:
    text = data[sample_index]["text"]
    baseline = get_sentiment(text)
    absa = absa_pipeline(text)

    # One "'aspect': 'sentiment'" entry per aspect, each followed by an HTML
    # line break so the entries stack inside a single table cell.
    absa_aspects_str = "".join(
        f"'{_md_cell(aspect)}': '{_md_cell(sentiment)}'<br>"
        for aspect, sentiment in absa['aspects'].items()
    )

    print(f"| {_md_cell(text)} | {_md_cell(baseline)} | {absa_aspects_str} | {_md_cell(absa['overall'])} |")

| Tweet | Baseline Sentiment | ABSA Aspects | ABSA Overall |
|---|---|---|---|
| >It is nuanced, AI arguably eliminated the job of switch board operator decades ago, that form of automation was able to usurp the task 100%.I worked @ Bell Labs, so I get it! It's not even "arguably"; however I would think the term automation vs. AI makes more sense there. This sort of thing has been going on since the industrial revolution; however even I will admit that AGI is unique in terms of potential for societal upheaval. >So there migth be a "last mile" problem with a lot of these fields when it comes to fully relying on AI. It might not go away soon, for example, AI's tendency to "hallucinate", it caused it to make legal argument with case law that didn't exist. We assume OpenAI will somehow smooth out these kinks soon, but I don't think that's a certainty.Not sure if you have been following my post history, but I got access to OpenAI's secret AGI model last year for a bit. I didn't observe any 