In [None]:
import json
import re
import time

In [None]:
# !pip install transformers torch

In [None]:
##############################################
# 1) SARCASM DETECTION
##############################################
# Use a pre-trained Hugging Face model specialized in detecting irony/sarcasm.
#   Model card: https://huggingface.co/cardiffnlp/twitter-roberta-base-irony
# Install: pip install transformers torch
# This model is trained on Twitter data

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Label strings returned by detect_sarcasm().
SARCASTIC_LABEL = "sarcastic"
NOT_SARCASTIC_LABEL = "not_sarcastic"

# Checkpoint name of the irony classifier. The tokenizer and the
# sequence-classification model are loaded once at module level so that
# detect_sarcasm() can reuse them on every call.
model_name = "cardiffnlp/twitter-roberta-base-irony"
sarcasm_tokenizer = AutoTokenizer.from_pretrained(model_name)
sarcasm_model = AutoModelForSequenceClassification.from_pretrained(model_name)

def detect_sarcasm(text: str) -> str:
    """
    Classify `text` as sarcastic or not with the module-level irony model.

    The text is tokenized (truncated to at most 512 tokens), run through
    `sarcasm_model` without gradient tracking, and the logits are turned into
    probabilities with a softmax.

    Returns:
        SARCASTIC_LABEL when the probability at index 1 is above 0.5,
        otherwise NOT_SARCASTIC_LABEL.
    """
    encoded = sarcasm_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = sarcasm_model(**encoded).logits
    # Label 0 = non-irony, Label 1 = irony
    irony_probability = torch.softmax(logits, dim=1).squeeze()[1].item()
    return SARCASTIC_LABEL if irony_probability > 0.5 else NOT_SARCASTIC_LABEL

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## part 2

In [None]:
# !python -m spacy download en_core_web_trf

In [None]:
##############################################
# 2) NAMED ENTITY RECOGNITION (NER)
##############################################
# We'll use spaCy's English model
# Install: pip install spacy
# Then: python -m spacy download en_core_web_sm
# Then: python -m spacy download en_core_web_trf

import spacy

# 1) Load the transformer-based spaCy pipeline.
#    There is deliberately NO fallback to en_core_web_sm (the smaller model gives
#    worse results), so a missing model must stop the notebook here with a clear
#    message instead of leaving `nlp` undefined for the cells below.
try:
    nlp = spacy.load("en_core_web_trf")
    print("Using en_core_web_trf for advanced NER.")
except OSError as exc:
    # spacy.load raises OSError when the requested model package cannot be found.
    raise RuntimeError(
        "spaCy model 'en_core_web_trf' is not installed. "
        "Run `python -m spacy download en_core_web_trf` and restart the runtime."
    ) from exc


def do_ner(text: str):
    """
    Run the spaCy pipeline on `text` and collect its named entities.

    Returns a list with one dict per entity span, holding the span text under
    "entity" and the entity label under "label".
    Example output: [{"entity": "AI", "label": "ORG"}, ...]
    """
    return [{"entity": span.text, "label": span.label_} for span in nlp(text).ents]

Using en_core_web_trf for advanced NER.


## part 3

In [None]:
! pip install rapidfuzz


Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [None]:
##############################################
# 3) CONCEPT EXTRACTION
##############################################
# Concept extraction can be domain-specific.
# Here we use a simple dictionary-based approach that looks for keywords/phrases
# in the text and groups them under certain "concepts".
# In real practice, you might use a more sophisticated approach (e.g., ontology mapping).

# Maps each concept category (the label that ends up in a record's "concepts"
# list) to the keywords/phrases that signal it.
#   - Multi-word entries (containing whitespace) are registered with the spaCy
#     PhraseMatcher built from this dictionary.
#   - Single-word entries are lower-cased and compared against the lemmatized
#     tokens of the text in do_concept_extraction_v3.
# A keyword may be listed under several categories (e.g. "upskilling",
# "automation"); every category that lists it can then be reported.
concept_dictionary = {
    "jobs_and_careers": [
        "job", "jobs", "career", "hire", "hiring", "position", "occupation", "employment",
        "recruitment", "headhunting", "resumé", "job search", "job application", "interview",
        "promotion", "layoff", "downsizing", "resignation", "fired", "retrenched", "workforce",
        "unemployment", "job board", "career change", "gig economy", "contract work",
        "freelance", "upskilling", "reskilling", "talent shortage", "skills gap",
        "career transition", "remote job", "flexible work", "hybrid work"
    ],

    "ai_tech": [
        "ai", "artificial intelligence", "ml", "machine learning", "deep learning",
        "neural network", "natural language processing", "NLP", "computer vision",
        "generative ai", "chatgpt", "gpt-4", "llm", "transformer model", "reinforcement learning",
        "automation", "autonomous system", "robot", "ai-powered", "ai-driven",
        "ai algorithm", "ai model", "ai tool", "ai system", "ai platform", "ai assistant",
        "predictive modeling", "intelligent agent", "intelligent automation", "digital worker",
        "AI ethics", "AI governance", "AI safety", "AI alignment", "AI regulation",
        "OpenAI", "Hugging Face", "LangChain", "Chatbot", "BERT", "GPT", "GAN",
        "AI job disruption", "AI replacing jobs", "AI impact", "AI revolution"
    ],

    "job_market_trends": [
        "job market", "labor market", "workforce trends", "employment trends",
        "market shift", "skills demand", "future of work", "career outlook",
        "talent acquisition", "workforce transformation", "job automation",
        "digital transformation", "economic uncertainty", "AI economy", "skills of the future",
        "AI in hiring", "job displacement", "employment shift", "workplace change"
    ],

    "finance": [
        "bank", "investment", "finance", "trading", "fintech", "income", "earnings",
        "financial security", "salary", "paycheck", "minimum wage", "wealth gap",
        "cost of living", "job loss compensation", "recession", "economic downturn",
        "furlough", "financial hardship", "severance", "retirement fund"
    ],

    "mental_health": [
        "stress", "burnout", "anxiety", "job stress", "mental health", "wellbeing",
        "work-life balance", "job insecurity", "career anxiety", "AI anxiety"
    ],

    "education_training": [
        "upskilling", "reskilling", "online course", "certification", "MOOC", "bootcamp",
        "lifelong learning", "career coaching", "learning path", "digital skills",
        "AI literacy", "tech training", "coursework", "workforce development"
    ],

     "automation_and_displacement": [
        "automation", "job automation", "automated process", "workflow automation",
        "displaced workers", "displacement", "automated job loss", "automated replacement",
        "robotic process automation", "RPA", "bots replacing humans", "AI takeover",
        "human redundancy", "task automation"
    ],

    "policy_and_governance": [
        "universal basic income", "UBI", "AI regulation", "labor law", "AI tax",
        "tech policy", "future legislation", "AI governance", "worker protection",
        "job guarantee", "government retraining", "economic policy", "union response",
        "digital rights", "job security policy"
    ],

    "recruitment_technology": [
        "AI recruitment", "AI hiring", "talent filter", "algorithmic hiring",
        "resume screening", "candidate ranking", "job matching platform", "talent intelligence",
        "hiring automation", "virtual interview", "digital HR", "ATS", "HR tech"
    ],

    "remote_and_gig_work": [
        "gig economy", "freelancing", "remote job", "remote-first", "digital nomad",
        "platform work", "side hustle", "Uberization", "creator economy", "independent worker",
        "contractor", "online work", "flex work", "microtask", "Upwork", "Fiverr"
    ],

    "public_sentiment_discourse": [
        "boomer", "doomer", "tech bro", "jobpocalypse", "decel", "quiet quitting",
        "great resignation", "job hopping", "layoff wave", "hustle culture", "AI hype",
        "future fear", "doomscrolling", "LinkedIn post", "upskilling frenzy", "prompt engineering"
    ],

    "tech_company_trends": [
        "OpenAI", "Google DeepMind", "Meta AI", "Anthropic", "Stability AI", "Amazon layoffs",
        "tech layoffs", "hiring freeze", "big tech", "FAANG", "tech exodus", "startup layoffs",
        "VC funding freeze", "AI startup", "early retirement", "LinkedIn hiring trends"
    ]
}






In [None]:
import re
from spacy.matcher import PhraseMatcher
# For fuzzy matching (preferred over fuzzywuzzy)
# pip install rapidfuzz
from rapidfuzz import fuzz
from rapidfuzz import process as fuzz_process

# Build the PhraseMatcher once: every multi-word keyword is registered under
# its category name, and matching is done on the LOWER token attribute.
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
for cat, keywords in concept_dictionary.items():
    patterns = []
    for kw in keywords:
        # Single-word keywords are handled separately by the token-level pass.
        if len(kw.split()) > 1:
            patterns.append(nlp.make_doc(kw))
    if patterns:
        phrase_matcher.add(cat, patterns)

def lemmatized_tokens(doc):
    """Return the lower-cased lemma of every alphabetic token in `doc`, in order."""
    lemmas = []
    for tok in doc:
        if tok.is_alpha:
            lemmas.append(tok.lemma_.lower())
    return lemmas

def do_concept_extraction_v3(text: str, fuzzy_threshold=88, min_fuzzy_len=4):
    """
    Return the categories of `concept_dictionary` that `text` mentions.

    Two passes are made over the parsed text:
      1. Multi-word keywords are located with the module-level `phrase_matcher`.
      2. Single-word keywords are compared with the lower-cased lemmas of the
         alphabetic tokens. An exact match always counts. A fuzzy match
         (whole-string `fuzz.ratio`) is only attempted when both the keyword
         and the token have at least `min_fuzzy_len` characters.

    Substring-style scoring (`fuzz.partial_ratio`) is intentionally not used:
    it gives a one-letter token such as "a" a score of 100 against every
    keyword containing that letter, and a short keyword such as "ai" a score
    of 100 against every token containing it, which made nearly every
    category match nearly every text.

    Args:
        text: Raw text to analyse.
        fuzzy_threshold: Minimum similarity score (0-100) for a fuzzy match.
        min_fuzzy_len: Strings shorter than this are matched exactly only.

    Returns:
        Alphabetically sorted list of matched category names (sorted so the
        output is reproducible between runs).
    """
    doc = nlp(text)
    matched_concepts = set()

    # Pass 1: phrase matching (multi-word keywords only).
    for match_id, _start, _end in phrase_matcher(doc):
        matched_concepts.add(nlp.vocab.strings[match_id])

    # Pass 2: single-word keywords against the unique lemmas of the text.
    tokens = set(lemmatized_tokens(doc))
    fuzzy_tokens = [tok for tok in tokens if len(tok) >= min_fuzzy_len]

    for cat, keywords in concept_dictionary.items():
        if cat in matched_concepts:
            continue  # already found by the phrase matcher
        for kw in keywords:
            if len(kw.split()) != 1:
                continue
            kw_lower = kw.lower()
            is_match = kw_lower in tokens
            if not is_match and len(kw_lower) >= min_fuzzy_len:
                is_match = any(
                    fuzz.ratio(tok, kw_lower) >= fuzzy_threshold
                    for tok in fuzzy_tokens
                )
            if is_match:
                matched_concepts.add(cat)
                break  # one hit is enough; skip the category's remaining keywords

    return sorted(matched_concepts)


## part 4

In [None]:
##############################################
# 4) ASPECT EXTRACTION
##############################################

In [None]:
'''
Richer aspect extraction using dependency patterns.
Instead of returning every noun_chunk, we can use dependency patterns to identify
aspects that have “descriptive context” (e.g., adjectives or prepositional modifiers).

We can attempt to collect the head noun plus its modifiers (adjectival, compound)
to form a more coherent aspect phrase.
'''
def do_aspect_extraction_v2(text: str):
    """
    Extract aspect phrases from `text`.

    A token is treated as an aspect head when
      - its part of speech is NOUN,
      - its lemma is not one of the filler words "thing", "something", "someone",
      - its dependency label is one of nsubj, dobj, pobj, ROOT, attr, conj.
    Each head is turned into a phrase by `_expand_aspect`, which prepends its
    left-hand modifiers (e.g. "powerful AI model").

    Returns:
        List of unique aspect phrases (strings) in order of first appearance.
    """
    filler_lemmas = ["thing", "something", "someone"]
    head_dependencies = ["nsubj", "dobj", "pobj", "ROOT", "attr", "conj"]

    seen_phrases = set()
    ordered_phrases = []

    for tok in nlp(text):
        # Skip everything that is not a meaningful noun in a head-like role.
        if tok.pos_ != "NOUN" or tok.lemma_ in filler_lemmas:
            continue
        if tok.dep_ not in head_dependencies:
            continue

        phrase = _expand_aspect(tok)
        # Keep only the first occurrence of each phrase.
        if phrase in seen_phrases:
            continue
        seen_phrases.add(phrase)
        ordered_phrases.append(phrase)

    return ordered_phrases

def _expand_aspect(head_token):
    """
    Build the aspect phrase for a head noun by prepending its left modifiers.

    The head's left children are walked from the nearest to the farthest.
    A child extends the phrase when its dependency label is amod, compound,
    nmod or nn, or when its part of speech is ADJ or NOUN; the walk stops at
    the first child that satisfies neither condition.

    Example:
      Head: "model", left children: "powerful", "AI"
      => "powerful AI model"

    Returns:
        The text of the document slice from the leftmost accepted modifier up
        to and including the head token.
    """
    modifier_deps = ("amod", "compound", "nmod", "nn")
    modifier_pos = ("ADJ", "NOUN")

    start = head_token.i
    for modifier in reversed(list(head_token.lefts)):
        if modifier.dep_ not in modifier_deps and modifier.pos_ not in modifier_pos:
            # This child does not describe the noun; stop expanding.
            break
        start = min(start, modifier.i)

    return head_token.doc[start : head_token.i + 1].text

## part 5

In [None]:
##############################################
# PROCESS JSON DATA
##############################################

if __name__ == "__main__":
    # Input and output JSON paths. Replace them with your own file names
    # before running, for example:
    #   /content/drive/MyDrive/x_data/data.json
    #   /content/drive/MyDrive/x_data/data_final.json
    filename = r"/content/drive/MyDrive/x_data/evaluationDataset.json"
    output_filename = r"/content/drive/MyDrive/x_data/evaluationDataset_final.json"

    # Read the complete dataset into memory.
    with open(filename, "r", encoding="utf-8") as infile:
        data = json.load(infile)

    # Annotate each record in place with the four analysis results.
    for i, record in enumerate(data):

        # Every 100 records report progress; from the second report onwards
        # also show how long the previous batch took, then restart the timer.
        if i % 100 == 0:
            progress = f"Processing record {i}/{len(data)}"
            if i != 0:
                progress += f" in {time.time()-start_time}sec"
            print(progress)
            start_time = time.time()

        # Records without a "text" field are analysed as an empty string.
        text = record.get("text", "")

        # 1) Sarcasm detection: "sarcastic" or "not_sarcastic"
        record["sarcasm_label"] = detect_sarcasm(text)

        # 2) Named Entity Recognition: list of {"entity", "label"} dicts
        record["named_entities"] = do_ner(text)

        # 3) Concept Extraction: list of matched category names
        record["concepts"] = do_concept_extraction_v3(text)

        # 4) Aspect Extraction: list of aspect phrases
        record["aspects"] = do_aspect_extraction_v2(text)

    # Write the annotated records to the output file.
    with open(output_filename, "w", encoding="utf-8") as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    print("All done! Check your output file for updated JSON data.")
    print(f"Output saved to {output_filename}")
    print(f"Original data count: {len(data)}")


Processing record 0/1000
Processing record 100/1000 in 138.61700344085693sec
Processing record 200/1000 in 125.30375742912292sec
Processing record 300/1000 in 131.96119809150696sec
Processing record 400/1000 in 138.91060256958008sec
Processing record 500/1000 in 127.3475775718689sec
Processing record 600/1000 in 129.9900472164154sec
Processing record 700/1000 in 120.318106174469sec
Processing record 800/1000 in 165.3344211578369sec
Processing record 900/1000 in 156.30491018295288sec
All done! Check your output file for updated JSON data.
Output saved to /content/drive/MyDrive/x_data/evaluationDataset_final.json
Original data count: 1000


In [None]:
import pprint
  # Process and print results
# Pretty-print the analysis results of the first records for a quick visual check.
# Every field is read with .get() so that a record missing a key (including
# "text", which the processing loop also treats as optional) is still printed.
for i, record in enumerate(data[:10]):  # You can remove [:10] to print all
    print("=" * 100)
    print(f"Record #{i+1}")
    print("-" * 100)
    print("📌 Text:\n")
    pprint.pprint(str(record.get("text", "")))

    print("=" * 10)
    print(f"🌀 Sarcasm Detection: {record.get('sarcasm_label', 'N/A')}")

    print("=" * 10)
    print("\n🔍 Named Entities (spaCy):")
    for ent in record.get("named_entities", []):
        print(f"  - {ent['entity']} ({ent['label']})")

    print("=" * 10)
    print("\n🧠 Concepts:")
    for concept in record.get("concepts", []):
        print(f"  - {concept}")

    print("=" * 10)
    print("\n🔧 Aspects:")
    for aspect in record.get("aspects", []):
        print(f"  - {aspect}")



Record #1
----------------------------------------------------------------------------------------------------
📌 Text:

('Yes. AI is going to replace bullshit jobs because AI can give bulshit '
 'answers just as convincingly as a human. A nutritionist.')
🌀 Sarcasm Detection: sarcastic

🔍 Named  (spaCy):

🧠 Concepts:
  - public_sentiment_discourse
  - education_training
  - automation_and_displacement
  - mental_health
  - recruitment_technology
  - finance
  - tech_company_trends
  - ai_tech
  - remote_and_gig_work
  - jobs_and_careers

🔧 Aspects:
  - bullshit jobs
  - bulshit answers
  - human
  - nutritionist
Record #2
----------------------------------------------------------------------------------------------------
📌 Text:

'What is your job and do you think AI will replace you in the next 10 years?'
🌀 Sarcasm Detection: not_sarcastic

🔍 Named  (spaCy):
  - the next 10 years (DATE)

🧠 Concepts:
  - public_sentiment_discourse
  - education_training
  - mental_health
  - finance
  -