In [None]:
#!pip install -r requirements.txt -q

In [6]:
# imports & device configuration
import os
import re
import html
import itertools as it
import pickle
import hashlib
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.autograd import Function
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from scipy.stats import ks_2samp, binomtest
from tqdm.auto import tqdm

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
tqdm.pandas(disable=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

Using cpu


In [7]:
# Path definitions
dict_path = "dataset/FullDictionaries.csv"
amazon_appliances_path = "dataset/amazon_Appliances_5_jiadian.json"
amazon_fashion_path    = "dataset/AMAZON_FASHION_5_shishang.json"
amazon_beauty_path     = "dataset/All_Beauty_5_meizhuang.json"
amazon_pet_path        = "dataset/Pet_Supplies_5_sampled_2.json"
movie_path             = "dataset/Movie Reviews_train.tsv"
twitter1_path          = "dataset/train-twitter.tsv"
twitter2_path          = "dataset/test-twitter.tsv"

DRIVE_PATH = "output"
os.makedirs(DRIVE_PATH, exist_ok=True)

In [8]:
# Load SCM lexicon
lookup_key_col = 'preprocessed word 4 (minus one trailing s)'
cols = ['Sociability dictionary','Sociability direction',
        'Morality dictionary','Morality direction',
        'Ability dictionary','Ability direction',
        'Agency dictionary','Agency direction']

df_scm = pd.read_csv(dict_path)
df_scm.dropna(subset=[lookup_key_col], inplace=True)
df_scm[lookup_key_col] = df_scm[lookup_key_col].astype(str)
lookup_scm = pd.Series(list(zip(*[df_scm[c] for c in cols])),
                       index=df_scm[lookup_key_col]).to_dict()
print(f"SCM lexicon size: {len(lookup_scm)}")

SCM lexicon size: 13930


In [9]:
# Dataset loaders & text normalisation

# preprocess_tweet: remove URLs, mentions, hashtags, punctuation
def preprocess_tweet(t: str) -> str:
    if pd.isna(t): return ""
    t = html.unescape(t).lower()
    t = re.sub(r"http\S+", "", t)
    t = re.sub(r"@\w+",  "", t)
    t = re.sub(r"rt\s+", "", t)
    t = t.replace("#", "")
    t = re.sub(r"&amp;", "and", t)
    t = re.sub(r"[^\w\s]", "", t)
    return re.sub(r"\s+", " ", t).strip()

# preprocess_text: generic lowercase, punctuation removal
def preprocess_text(t: str) -> str:
    if not isinstance(t, str):
        return ""
    t = html.unescape(t).lower()
    t = re.sub(r"http\S+", "", t)
    t = re.sub(r"[^\w\s]", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

# Combined loader: Amazon 4‑domain, Movie, Twitter
def load_datasets():
    # Amazon
    dfs = [
        (amazon_appliances_path,'Appliances'),
        (amazon_fashion_path,'Fashion'),
        (amazon_beauty_path,'Beauty'),
        (amazon_pet_path,'Pet'),
    ]
    df_amz = pd.concat(
        [pd.read_json(p, lines=True).assign(source=s) for p,s in dfs],
        ignore_index=True
    )
    df_amz['label_5class'] = df_amz['overall'].astype(int)
    df_amz.rename(columns={'review':'reviewText'}, inplace=True)
    df_amz['review_id'] = 'amazon_' + df_amz.index.astype(str)
    df_amz["clean_text"] = df_amz["reviewText"].dropna().apply(preprocess_text)

    # Movie
    df_mov = pd.read_csv(movie_path, sep='\t')
    df_mov.rename(columns={
        'Phrase':'review_text_movie',
        'Sentiment':'original_sentiment',
        'SentenceId':'review_id'
    }, inplace=True)
    df_mov['review_id'] = (
        'movie_' + df_mov['review_id'].astype(str)
        + '_' + df_mov['PhraseId'].astype(str)
    )
    df_mov['label_5class'] = df_mov['original_sentiment'] + 1
    df_mov["clean_text"] = df_mov["review_text_movie"].dropna().apply(preprocess_text)

    # Twitter
    df_tw = pd.concat([
        pd.read_csv(twitter1_path, sep='\t', header=None, quoting=3,
                    engine='python', on_bad_lines='skip'),
        pd.read_csv(twitter2_path, sep='\t', header=None, quoting=3,
                    engine='python', on_bad_lines='skip')
    ], ignore_index=True)
    df_tw.columns = ['tweet_id','user','original_label','text']
    df_tw['clean_text'] = (
        df_tw['text']
        .replace("Not Available", np.nan)
        .dropna()
        .apply(preprocess_tweet)
    )
    label_map = {-2:1, -1:2, 0:3, 1:4, 2:5}
    df_tw['label_5class'] = df_tw['original_label'].map(label_map).astype(int)
    df_tw['review_id'] = 'tweet_' + df_tw['tweet_id'].astype(str)

    return df_amz, df_mov, df_tw

df_amazon, df_movie, df_twitter = load_datasets()


In [10]:
# Compute SCM Warmth & Competence scores

# Functions to calculate Warmth and Competence based on SCM lexicon
def compute_scm_scores(text: str):
    tokens = re.findall(r'[a-zA-Z]+', str(text).lower())
    sum_s, cnt_s = 0.0, 0
    sum_m, cnt_m = 0.0, 0
    sum_a, cnt_a = 0.0, 0
    sum_ag, cnt_ag = 0.0, 0
    matched_tokens_count = 0

    # 2. Iterate tokens and lookup in lexicon
    for t in tokens:
        if t not in lookup_scm:
            continue
        matched_tokens_count += 1
        cs, ds, cm, dm, ca, da, cag, dag = lookup_scm[t]
        try:
            # Sociability: if cs flag==1 and ds exists
            if pd.notna(cs) and int(cs) == 1 and pd.notna(ds):
                sum_s += float(ds); cnt_s += 1
            # Morality: if cm flag==1 and dm exists
            if pd.notna(cm) and int(cm) == 1 and pd.notna(dm):
                sum_m += float(dm); cnt_m += 1
            # Ability: if ca flag==1 and da exists
            if pd.notna(ca) and int(ca) == 1 and pd.notna(da):
                sum_a += float(da); cnt_a += 1
            # Agency: if cag flag==1 and dag exists
            if pd.notna(cag) and int(cag) == 1 and pd.notna(dag):
                sum_ag += float(dag); cnt_ag += 1
        except:
            pass

    # 3. Compute averages: avoid division by zero
    warmth = (sum_s + sum_m) / (cnt_s + cnt_m) if (cnt_s + cnt_m) else 0.0
    comp   = (sum_a + sum_ag) / (cnt_a + cnt_ag) if (cnt_a + cnt_ag) else 0.0
    return warmth, comp, matched_tokens_count


# Apply SCM scoring to an entire DataFrame
def apply_scm(df, text_col, name):
    total = len(df)
    print(f"\nCalculating SCM scores for '{name}' ({total} rows)…")
    results = df[text_col].progress_apply(compute_scm_scores)

    df['warmth_scm'], df['competence_scm'], df['matched_token_count'] = zip(*results)

    any_match = (df['matched_token_count'] > 0).sum()
    non_w     = (df['warmth_scm'] != 0).sum()
    non_c     = (df['competence_scm'] != 0).sum()

    print(f"  Rows with any matched tokens: {any_match}/{total} ({any_match/total*100:.2f}%)")
    print(f"  Rows with non-zero Warmth:     {non_w}/{total} ({non_w/total*100:.2f}%)")
    print(f"  Rows with non-zero Competence: {non_c}/{total} ({non_c/total*100:.2f}%)")
    return df

# returns annotated dataframes with SCM scores
df_amazon  = apply_scm(df_amazon,  'clean_text',        'Amazon')
df_movie   = apply_scm(df_movie,   'clean_text', 'Movie')
df_twitter = apply_scm(df_twitter, 'clean_text',        'Twitter')

# Bundle into dict for later use
datasets = {
    "Amazon":  {"df": df_amazon,  "text_col": "clean_text"},
    "Movie":   {"df": df_movie,   "text_col": "clean_text"},
    "Twitter": {"df": df_twitter, "text_col": "clean_text"}
}

out_path = os.path.join(DRIVE_PATH, "datasets_prepared.pkl")
with open(out_path, "wb") as f:
    pickle.dump(datasets, f, protocol=pickle.HIGHEST_PROTOCOL)


Calculating SCM scores for 'Amazon' (176404 rows)…


  0%|          | 0/176404 [00:00<?, ?it/s]

  Rows with any matched tokens: 171915/176404 (97.46%)
  Rows with non-zero Warmth:     121279/176404 (68.75%)
  Rows with non-zero Competence: 118880/176404 (67.39%)

Calculating SCM scores for 'Movie' (156060 rows)…


  0%|          | 0/156060 [00:00<?, ?it/s]

  Rows with any matched tokens: 110058/156060 (70.52%)
  Rows with non-zero Warmth:     47070/156060 (30.16%)
  Rows with non-zero Competence: 36314/156060 (23.27%)

Calculating SCM scores for 'Twitter' (25889 rows)…


  0%|          | 0/25889 [00:00<?, ?it/s]

  Rows with any matched tokens: 21099/25889 (81.50%)
  Rows with non-zero Warmth:     10940/25889 (42.26%)
  Rows with non-zero Competence: 9720/25889 (37.54%)
