In [None]:
##########################################################
# p2_polaritySubjectivity.py
##########################################################

import json
from textblob import TextBlob

# Path of the JSON dataset that gets loaded, annotated and written back.
# Replace it with the file you want to process,
# e.g. 'filteredOverallRecords.json'.
filename = 'evaluationDataset.json'

def get_sentiment(text):
    """Label ``text`` using TextBlob's sentiment scores.

    Returns a ``(sentiment, subjectivity)`` tuple. ``sentiment`` is
    "positive" when the polarity score is above zero, "negative" when it is
    below zero and "neutral" when it is exactly zero. ``subjectivity`` is
    "opinionated" when the subjectivity score is above 0.5, otherwise
    "neutral".
    """
    scores = TextBlob(text).sentiment
    polarity = scores.polarity

    if polarity > 0:
        sentiment = "positive"
    elif polarity < 0:
        sentiment = "negative"
    else:
        sentiment = "neutral"

    subjectivity = "opinionated" if scores.subjectivity > 0.5 else "neutral"
    return sentiment, subjectivity


# Read the whole dataset into memory.
with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The data is assumed to be a list of post dictionaries, each carrying a "text" entry.
total = len(data)
for index, post in enumerate(data):
    # Report progress on every 100th post.
    if index % 100 == 0:
        print(f"{index}/{total}: {index / total * 100:.2f}%")
    # Store both labels returned by get_sentiment on the post itself.
    sentiment_label, subjectivity_label = get_sentiment(post["text"])
    post.update(polarity=sentiment_label, subjectivity=subjectivity_label)


# Overwrite the input file with the annotated posts.
with open(filename, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)
print('Done')


0/1000: 0.00%
100/1000: 10.00%
200/1000: 20.00%
300/1000: 30.00%
400/1000: 40.00%
500/1000: 50.00%
600/1000: 60.00%
700/1000: 70.00%
800/1000: 80.00%
900/1000: 90.00%
Done


## Aspect Extraction

### Option 1: Noun Chunking (Basic)

In [None]:
import spacy
# Load the spaCy pipeline once at module level; the aspect-extraction helpers
# in this notebook call `nlp` on every text they receive.
nlp = spacy.load("en_core_web_sm")

def do_aspect_extraction(text):
    """Return the text of every noun chunk spaCy finds in ``text``, in document order."""
    parsed = nlp(text)
    chunk_texts = []
    for span in parsed.noun_chunks:
        chunk_texts.append(span.text)
    return chunk_texts


### Option 2: Enhanced Extraction with Modifiers

In [None]:
def do_aspect_extraction_v2(text):
    """Extract noun aspects together with their direct modifiers.

    For every token tagged ``NOUN``, the texts of its children whose
    dependency label is ``amod`` or ``compound`` are collected (in child
    order) and joined with the noun itself by single spaces. One string is
    returned per noun token, in document order.
    """
    modifier_deps = ("amod", "compound")
    found = []
    for tok in nlp(text):
        if tok.pos_ != "NOUN":
            continue
        words = [kid.text for kid in tok.children if kid.dep_ in modifier_deps]
        words.append(tok.text)
        found.append(" ".join(words))
    return found


## Aspect-Based Sentiment Scoring


### Option 1: Pre-trained ABSA Model (transformers)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch

# Checkpoint identifier handed to both from_pretrained calls below.
model_name = "yangheng/deberta-v3-base-absa-v1.1"
# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_absa_sentiment(text, aspect):
    """Classify the sentiment that ``text`` expresses towards ``aspect``.

    Args:
        text: The full sentence or post.
        aspect: The aspect term to score within ``text``.

    Returns:
        One of "Negative", "Neutral" or "Positive": the label whose
        probability is highest for the single encoded input.
    """
    # NOTE(review): the special-token markers are written into the string by
    # hand; confirm this template matches the input format the checkpoint
    # was trained with.
    input_text = f"[CLS] {text} [ASP] {aspect} [SEP]"
    inputs = tokenizer(input_text, return_tensors="pt")
    # Inference only: skip autograd bookkeeping so no gradient graph is kept
    # alive for every scored aspect.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1)
    # NOTE(review): label order is assumed to match the checkpoint's class
    # indices -- TODO confirm against model.config.id2label.
    labels = ["Negative", "Neutral", "Positive"]
    # .item() turns the 0-dim index tensor into a plain int before indexing.
    return labels[torch.argmax(probs).item()]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

## Combined ABSA Pipeline

In [None]:
def absa_pipeline(text):
    """Extract aspects from ``text`` and score the sentiment towards each one.

    Args:
        text: The sentence or post to analyse.

    Returns:
        A dict mapping each extracted aspect string to the label returned by
        ``get_absa_sentiment``, or to "Unknown" when scoring that aspect
        raised an error. Repeated aspects collapse into a single key.
    """
    aspect_sentiments = {}
    for aspect in do_aspect_extraction_v2(text):
        try:
            aspect_sentiments[aspect] = get_absa_sentiment(text, aspect)
        except Exception:
            # Best-effort: one failing aspect must not abort the others.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so the run can still be stopped.
            aspect_sentiments[aspect] = "Unknown"
    return aspect_sentiments


## Evaluation Table Example

In [None]:
# Notebook cell: display how many records `data` holds.
len(data)

1000

In [None]:
# prompt: generate 10 random number from 0 to 999, no repeats

import random

def generate_unique_random_numbers(count, min_val, max_val):
    """Draw ``count`` distinct integers from the inclusive range ``[min_val, max_val]``.

    Raises:
        ValueError: If the range holds fewer than ``count`` integers.
    """
    span = max_val - min_val + 1
    if count > span:
        raise ValueError("Cannot generate unique numbers within the specified range.")

    return random.sample(range(min_val, max_val + 1), count)

# Pick 10 distinct indices in 0..999; the bare name on the last line makes the
# notebook display them as the cell output.
# NOTE(review): no seed is set in this cell, so the sample presumably differs
# between runs -- confirm whether reproducibility matters here.
random_numbers = generate_unique_random_numbers(10, 0, 999)
random_numbers


[478, 805, 22, 572, 728, 554, 645, 129, 100, 960]

In [None]:
# Hand-written examples used to compare the baseline labels with the ABSA output.
sample_tweets = [
    "AI tools are great, but the job situation is scary.",
    "Automation is replacing jobs and it’s terrifying.",
    "Excited about GPT-4 but not sure how it affects hiring.",
]

for tweet in sample_tweets:
    baseline, absa = get_sentiment(tweet), absa_pipeline(tweet)
    # One print call, one line per field, followed by a 60-dash separator.
    print(
        f"TWEET: {tweet}",
        f"Baseline Sentiment: {baseline}",
        f"ABSA Results: {absa}",
        "-" * 60,
        sep="\n",
    )


TWEET: AI tools are great, but the job situation is scary.
Baseline Sentiment: ('positive', 'opinionated')
ABSA Results: {'AI tools': 'Positive', 'job': 'Negative', 'job situation': 'Negative'}
------------------------------------------------------------
TWEET: Automation is replacing jobs and it’s terrifying.
Baseline Sentiment: ('negative', 'opinionated')
ABSA Results: {'Automation': 'Negative', 'jobs': 'Negative'}
------------------------------------------------------------
TWEET: Excited about GPT-4 but not sure how it affects hiring.
Baseline Sentiment: ('positive', 'opinionated')
ABSA Results: {}
------------------------------------------------------------


In [None]:
# Compare baseline and ABSA output on the randomly chosen records.
for sample_index in random_numbers:
    text = data[sample_index]["text"]
    baseline, absa = get_sentiment(text), absa_pipeline(text)
    print(
        f"TWEET: {text}",
        f"Baseline Sentiment: {baseline}",
        f"ABSA Results: {absa}",
        "-" * 60,
        sep="\n",
    )

TWEET: Master Your Skills with AI-Powered Learning Learn to evolve in the rapidly-changing job market.SmartCareer AI helps coaches uncover new opportunities for their clients and stay abreast of industry trends.#Skilldevelopment #AIAdvancement #CareerSuccess #AIInsights
Baseline Sentiment: ('positive', 'neutral')
ABSA Results: {'job': 'Positive', 'changing job market': 'Positive', 'coaches': 'Positive', 'new opportunities': 'Positive', 'clients': 'Positive', 'industry': 'Positive', 'trends.#Skilldevelopment': 'Positive', 'AIAdvancement': 'Positive', 'AIAdvancement #': 'Positive', 'CareerSuccess AIInsights': 'Positive'}
------------------------------------------------------------
TWEET: True, AI and automation are legitimate threats to the working class if things don't change. They don't have to be though. Technology developed for the purpose of making life better for everyone is good. Technology developed for the purpose of maximizing profits for shareholders is bad.
Baseline Sentiment

## ABSA Analysis (GPU + Batching)

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint again and move it to the selected device; this rebinds
# the module-level name `model`.
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

def get_absa_sentiment(text, aspect):
    """Return the sentiment label the model assigns to ``aspect`` within ``text``.

    The encoded input is moved to ``device`` and the forward pass runs with
    gradient tracking disabled. The result is "Negative", "Neutral" or
    "Positive", whichever has the highest probability.
    """
    prompt = f"[CLS] {text} [ASP] {aspect} [SEP]"
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = softmax(logits, dim=1)
    best = torch.argmax(class_probs)
    return ["Negative", "Neutral", "Positive"][best]


In [None]:
def batch_absa_sentiment(text, aspects, batch_size=4): # Reduced batch size to 4
    """
    Performs ABSA sentiment analysis in batches.

    Args:
        text: The sentence or post every aspect is scored against.
        aspects: Sequence of aspect strings extracted from ``text``.
        batch_size: Number of aspects encoded and scored per forward pass.

    Returns:
        A dict mapping each aspect to "Negative", "Neutral" or "Positive".
        An empty ``aspects`` sequence yields an empty dict. Repeated aspects
        collapse into one key (the last prediction wins).

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative step would
            otherwise make the loop run zero times and silently drop every
            aspect.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Loop-invariant: index -> label lookup shared by every batch.
    labels = ["Negative", "Neutral", "Positive"]
    all_sentiments = {}
    for i in range(0, len(aspects), batch_size):
        batch_aspects = aspects[i : i + batch_size]
        inputs = [f"[CLS] {text} [ASP] {aspect} [SEP]" for aspect in batch_aspects]
        # NOTE(review): truncation is requested without max_length; the
        # notebook run logged that this defaults to no truncation. Set an
        # explicit max_length if long posts really need to be shortened.
        encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt").to(device)

        # Inference only: no gradient tracking needed.
        with torch.no_grad():
            outputs = model(**encoded)

        probs = softmax(outputs.logits, dim=1)
        # Move the per-row winning indices to the CPU as plain ints.
        preds = torch.argmax(probs, dim=1).cpu().tolist()

        # Update all_sentiments with results from this batch
        all_sentiments.update({aspect: labels[pred] for aspect, pred in zip(batch_aspects, preds)})

    return all_sentiments

In [None]:
def absa_pipeline(text):
    """Extract aspects from ``text`` and score them in batches.

    Returns an empty dict when no aspect is found; otherwise returns the
    aspect-to-label mapping produced by ``batch_absa_sentiment``.
    """
    found = do_aspect_extraction_v2(text)
    return batch_absa_sentiment(text, found) if found else {}


In [None]:
# Re-run the comparison on the same randomly chosen records.
for sample_index in random_numbers:
    text = data[sample_index]["text"]
    baseline, absa = get_sentiment(text), absa_pipeline(text)
    print(
        f"TWEET: {text}",
        f"Baseline Sentiment: {baseline}",
        f"ABSA Results: {absa}",
        "-" * 60,
        sep="\n",
    )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


TWEET: Master Your Skills with AI-Powered Learning Learn to evolve in the rapidly-changing job market.SmartCareer AI helps coaches uncover new opportunities for their clients and stay abreast of industry trends.#Skilldevelopment #AIAdvancement #CareerSuccess #AIInsights
Baseline Sentiment: ('positive', 'neutral')
ABSA Results: {'job': 'Positive', 'changing job market': 'Positive', 'coaches': 'Positive', 'new opportunities': 'Positive', 'clients': 'Positive', 'industry': 'Positive', 'trends.#Skilldevelopment': 'Positive', 'AIAdvancement': 'Positive', 'AIAdvancement #': 'Positive', 'CareerSuccess AIInsights': 'Positive'}
------------------------------------------------------------
TWEET: True, AI and automation are legitimate threats to the working class if things don't change. They don't have to be though. Technology developed for the purpose of making life better for everyone is good. Technology developed for the purpose of maximizing profits for shareholders is bad.
Baseline Sentiment