In [5]:
# imports & device configuration
import os
import re
import html
import itertools as it
import pickle
import hashlib
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.autograd import Function
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from scipy.stats import ks_2samp, binomtest
from tqdm.auto import tqdm

# Debugging aid: with CUDA_LAUNCH_BLOCKING=1 kernel launches run synchronously,
# so a CUDA error is reported at the call that caused it.
# NOTE(review): this also slows GPU execution — confirm it is still wanted for full runs.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

# Switch on tqdm's pandas integration with progress bars enabled.
tqdm.pandas(disable=False)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device}")

Using cpu


In [6]:
# Path definitions
# All input locations are relative paths, so they resolve against the
# notebook's current working directory.
# NOTE(review): none of the *_path variables below is read by the cells shown
# here; presumably they are consumed by the data-preparation step — confirm.
dict_path = "dataset/FullDictionaries.csv"
# Amazon review files (the file names suggest one file per product category).
amazon_appliances_path = "dataset/amazon_Appliances_5_jiadian.json"
amazon_fashion_path    = "dataset/AMAZON_FASHION_5_shishang.json"
amazon_beauty_path     = "dataset/All_Beauty_5_meizhuang.json"
amazon_pet_path        = "dataset/Pet_Supplies_5_sampled_2.json"
# Movie-review and Twitter files (tab-separated, judging by the extension).
movie_path             = "dataset/Movie Reviews_train.tsv"
twitter1_path          = "dataset/train-twitter.tsv"
twitter2_path          = "dataset/test-twitter.tsv"

# Directory that receives every artefact written by this notebook (prepared
# datasets, Trainer checkpoints, split metadata). exist_ok=True keeps the
# cell safe to re-run.
DRIVE_PATH = "output"
os.makedirs(DRIVE_PATH, exist_ok=True)

In [7]:
# Load Prepared Datasets
# Restores the `datasets` mapping from DRIVE_PATH/datasets_prepared.pkl.
# Later cells read it as datasets[name]['df'] (a DataFrame with a
# 'label_5class' column) and datasets[name]['text_col'].
# The pickle is presumably written by an earlier preparation step — confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that this project produced itself.
in_path = os.path.join(DRIVE_PATH, "datasets_prepared.pkl")
with open(in_path, "rb") as f:
    datasets = pickle.load(f)

In [None]:
# PyTorch dataset / tokenizer

# Dataset wrapper for HuggingFace Trainer
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs pre-tokenized encodings with labels.

    `enc` is a mapping from field name (e.g. 'input_ids') to a per-example
    sequence; `labels` is any iterable of per-example labels and is
    materialised into a list. Each item is a dict of tensors containing every
    field of `enc` plus a 'labels' entry.
    """

    def __init__(self, enc, labels):
        self.enc = enc
        self.labels = list(labels)

    def __len__(self):
        # The label list defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.enc.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Tokenize a list of raw texts in batches to prevent out-of-memory errors from processing all texts at once
def batch_tokenize(tok, texts, max_len, name, batch=1024):
    """Tokenize `texts` chunk by chunk and concatenate the results.

    Args:
        tok: Callable tokenizer; invoked once per chunk with truncation and
            padding to `max_len`, and expected to return a mapping with
            'input_ids' and 'attention_mask' lists.
        texts: Sliceable sequence of raw texts. Entries that are not `str`
            are replaced by the empty string.
        max_len: Sequence length passed to the tokenizer as `max_length`.
        name: Label shown in the progress-bar description.
        batch: Number of texts handed to the tokenizer per call.

    Returns:
        Dict with keys 'input_ids' and 'attention_mask', each a list with one
        entry per input text, in input order.
    """
    input_ids, attention_mask = [], []
    starts = tqdm(range(0, len(texts), batch),
                  desc=f"Tokenizing {name}", leave=False)

    for start in starts:
        # Normalise the chunk: keep strings, blank out everything else.
        chunk = []
        for raw in texts[start:start + batch]:
            chunk.append(str(raw) if isinstance(raw, str) else "")

        encoded = tok(chunk,
                      truncation=True,
                      padding='max_length',
                      max_length=max_len,
                      return_tensors=None)
        input_ids.extend(encoded['input_ids'])
        attention_mask.extend(encoded['attention_mask'])

    return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Define an evaluation metrics function
def compute_metrics(pred):
    """Turn a prediction object into accuracy and macro-averaged P/R/F1.

    `pred` must expose `label_ids` (gold labels) and `predictions` (per-class
    scores); the predicted class is the arg-max over the last axis. Classes
    with no predicted or true samples contribute 0 instead of raising a
    warning (zero_division=0).

    Returns a dict with the keys 'accuracy', 'f1_macro', 'precision_macro'
    and 'recall_macro'.
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)

    macro_scores = precision_recall_fscore_support(
        gold, guessed, average='macro', zero_division=0)
    precision, recall, f1 = macro_scores[:3]

    metrics = {'accuracy': accuracy_score(gold, guessed)}
    metrics['f1_macro'] = f1
    metrics['precision_macro'] = precision
    metrics['recall_macro'] = recall
    return metrics

In [None]:
# train & evaluate

# Global hyperparameters/constants for training and evaluation:
# Checkpoint name handed to from_pretrained() for both tokenizer and model.
MODEL_NAME       = 'bert-base-uncased'
# Token length every text is truncated/padded to during tokenization.
MAX_LEN          = 256
# Per-device batch sizes passed to TrainingArguments.
TRAIN_BATCH_SIZE = 128
EVAL_BATCH_SIZE  = 256

# Perform train/val/test split, fine-tuning, and evaluation on a given DataFrame
def train_and_eval(df, text_col, dataset_name, epochs):
    """Fine-tune MODEL_NAME on one dataset and score it on held-out splits.

    The frame is split 80/10/10 into train/validation/test (stratified by
    label, random_state=42), every split is tokenized to MAX_LEN, a fresh
    5-class sequence-classification model is loaded and trained for `epochs`
    epochs with the HuggingFace Trainer, and the final model is evaluated on
    the validation and the test split. The test summary is printed.

    Args:
        df: DataFrame with a unique index, a text column `text_col` and a
            'label_5class' column holding the ratings 1-5. The caller's frame
            is not modified; a copy is used.
        text_col: Name of the column containing the raw text.
        dataset_name: Used in progress-bar captions, in the printed summary
            and in the checkpoint directory
            f'{DRIVE_PATH}/results_{dataset_name}'.
        epochs: Number of training epochs (num_train_epochs).

    Returns:
        Dict with the split indices ('train_idx', 'val_idx', 'test_idx', as
        index labels of `df`), the train/val datasets ('ds_train', 'ds_val'),
        the 'trainer' and 'tokenizer', the validation macro-F1
        ('val_f1_macro') and the test metrics ('accuracy', 'f1_macro').

    Raises:
        ValueError: If the index of `df` is not unique, or if 'label_5class'
            contains anything other than the values 1-5 (including NaN).
    """
    df = df.copy()

    # Splits are expressed as index labels and materialised with df.loc; a
    # duplicated label would pull the same rows into several splits.
    if not df.index.is_unique:
        raise ValueError(
            f"{dataset_name}: DataFrame index must be unique to build "
            f"disjoint train/val/test splits (call reset_index(drop=True) first)")

    # Validate the ratings up front: a NaN, fractional or out-of-range label
    # would otherwise only fail deep inside the loss computation.
    ratings = pd.to_numeric(df['label_5class'], errors='coerce')
    invalid = ~ratings.isin([1, 2, 3, 4, 5])
    if invalid.any():
        raise ValueError(
            f"{dataset_name}: label_5class must contain only the values 1-5; "
            f"found {int(invalid.sum())} invalid value(s)")

    # Shift 1–5 labels to 0–4 for model training; force an integer dtype so
    # the label tensors are integer class ids.
    df['model_label'] = ratings.astype('int64') - 1

    # split into train/val/test with 80/10/10 proportions
    idx_all = df.index.to_numpy()
    train_idx, temp_idx = train_test_split(
        idx_all, test_size=0.2, random_state=42,
        stratify=df['model_label'])
    val_idx, test_idx = train_test_split(
        temp_idx, test_size=0.5, random_state=42,
        stratify=df.loc[temp_idx, 'model_label'])

    # Load the tokenizer that matches the checkpoint
    tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    # Tokenize train, validation, and test texts in batches
    enc_train = batch_tokenize(tok, df.loc[train_idx, text_col].tolist(),
                               MAX_LEN, dataset_name+" Train")
    enc_val   = batch_tokenize(tok, df.loc[val_idx,   text_col].tolist(),
                               MAX_LEN, dataset_name+" Val")
    enc_test  = batch_tokenize(tok, df.loc[test_idx,  text_col].tolist(),
                               MAX_LEN, dataset_name+" Test")

    # Construct PyTorch Datasets
    ds_train = SentimentDataset(enc_train, df.loc[train_idx, 'model_label'])
    ds_val   = SentimentDataset(enc_val,   df.loc[val_idx,   'model_label'])
    ds_test  = SentimentDataset(enc_test,  df.loc[test_idx,  'model_label'])

    # A fresh model per call: every invocation is an independent fine-tuning run.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=5).to(device)

    # Configure TrainingArguments for the Trainer.
    # Checkpoints are written once per epoch and only the latest one is kept.
    args = TrainingArguments(output_dir=f'{DRIVE_PATH}/results_{dataset_name}',
                             num_train_epochs=epochs,
                             per_device_train_batch_size=TRAIN_BATCH_SIZE,
                             per_device_eval_batch_size=EVAL_BATCH_SIZE,
                             logging_steps=100,
                             fp16=torch.cuda.is_available(),
                             save_strategy="epoch",
                             save_total_limit=1,
                             report_to="none")
    # NOTE(review): recent transformers releases deprecate `tokenizer=` in
    # favour of `processing_class=` — confirm against the installed version.
    trainer = Trainer(model=model, args=args,
                      train_dataset=ds_train,
                      eval_dataset=ds_val,
                      tokenizer=tok,
                      compute_metrics=compute_metrics)
    trainer.train()

    # Metrics come back prefixed with 'eval_' for both splits.
    val_res  = trainer.evaluate(ds_val)
    test_res = trainer.evaluate(ds_test)
    print(f"[TEST] {dataset_name}  acc={test_res['eval_accuracy']:.4f}  f1={test_res['eval_f1_macro']:.4f}")

    return {
        'train_idx':    train_idx,
        'val_idx':      val_idx,
        'ds_train':    ds_train,
        'ds_val':      ds_val,
        'test_idx':    test_idx,
        'trainer':     trainer,
        'tokenizer':   tok,
        'val_f1_macro': val_res['eval_f1_macro'],
        'accuracy':     test_res['eval_accuracy'],
        'f1_macro':     test_res['eval_f1_macro'],
    }

In [None]:
# epoch grid per dataset

# Specify the list of epochs to try for each dataset
# Keys must match the keys of `datasets`; the loop below looks each dataset
# name up here and raises KeyError for a name without an entry.
EPOCH_GRID_DS = {
    "Amazon" : [2,3,4],
     #"Amazon" : [2],
    "Movie"  : [3,4,5],
     #"Movie" : [3],
    "Twitter": [4,5,6]
     #"Twitter": [5]
}

# Threshold for performance drop
# An epoch count qualifies when its validation macro-F1 is at most TH below
# the best validation macro-F1 seen for that dataset.
TH = 0.005

# Per dataset: the result dict of the selected run plus 'chosen_epoch'.
all_results = {}

# Loop over each dataset to perform epoch grid search
for ds_name, info in datasets.items():
    epoch_list  = EPOCH_GRID_DS[ds_name]
    grid_stats  = {}
    best_f1     = -1

    # For each candidate epoch, call train_and_eval
    # Every call is a full, independent fine-tuning run (train_and_eval loads
    # a fresh model each time).
    # NOTE(review): grid_stats keeps each candidate's complete result,
    # including its Trainer, until the next dataset starts — several models
    # may be held in memory at once; confirm this is acceptable.
    for ep in epoch_list:
        print(f"\n[GRID] {ds_name}  epoch={ep}")
        res = train_and_eval(info['df'], info['text_col'],
                             f"{ds_name}_ep{ep}", ep)
        grid_stats[ep] = res
        if res['val_f1_macro'] > best_f1:
            best_f1 = res['val_f1_macro']

    # Prefer the cheapest setting: the smallest epoch count whose validation
    # macro-F1 lies within TH of the best one. Selection uses validation
    # scores only.
    chosen = min(e for e in epoch_list
                 if grid_stats[e]['val_f1_macro'] >= best_f1 - TH)
    print(f"[SELECT] {ds_name}: choose epoch={chosen}  (best F1={best_f1:.4f})")
    all_results[ds_name] = {**grid_stats[chosen], 'chosen_epoch': chosen}

    # save to CSV
    # The 'dev_f1' column holds the validation macro-F1 per epoch count.
    # NOTE(review): the file is written to the current working directory, not
    # to DRIVE_PATH like the other artefacts — confirm this is intended.
    pd.DataFrame({'epoch':epoch_list,
                  'dev_f1':[grid_stats[e]['val_f1_macro'] for e in epoch_list]}
                ).to_csv(f"{ds_name}_epoch_curve.csv", index=False)




[GRID] Amazon  epoch=2


Tokenizing Amazon_ep2 Train:   0%|          | 0/138 [00:00<?, ?it/s]

Tokenizing Amazon_ep2 Val:   0%|          | 0/18 [00:00<?, ?it/s]

Tokenizing Amazon_ep2 Test:   0%|          | 0/18 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.1962
200,0.9817
300,0.9568
400,0.9334
500,0.92
600,0.8929
700,0.8726
800,0.8828
900,0.8656
1000,0.8626


[TEST] Amazon_ep2  acc=0.6489  f1=0.6017

[GRID] Amazon  epoch=3


Tokenizing Amazon_ep3 Train:   0%|          | 0/138 [00:00<?, ?it/s]

Tokenizing Amazon_ep3 Val:   0%|          | 0/18 [00:00<?, ?it/s]

Tokenizing Amazon_ep3 Test:   0%|          | 0/18 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.1776
200,0.9821
300,0.9542
400,0.9325
500,0.9185
600,0.894
700,0.8755
800,0.8841
900,0.8684
1000,0.8625


[TEST] Amazon_ep3  acc=0.6491  f1=0.6074

[GRID] Amazon  epoch=4


Tokenizing Amazon_ep4 Train:   0%|          | 0/138 [00:00<?, ?it/s]

Tokenizing Amazon_ep4 Val:   0%|          | 0/18 [00:00<?, ?it/s]

Tokenizing Amazon_ep4 Test:   0%|          | 0/18 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.1658
200,0.9825
300,0.9552
400,0.9343
500,0.9187
600,0.8934
700,0.8779
800,0.8823
900,0.8659
1000,0.8652


[TEST] Amazon_ep4  acc=0.6424  f1=0.6004
[SELECT] Amazon: choose epoch=2  (best F1=0.6076)

[GRID] Movie  epoch=3


Tokenizing Movie_ep3 Train:   0%|          | 0/122 [00:00<?, ?it/s]

Tokenizing Movie_ep3 Val:   0%|          | 0/16 [00:00<?, ?it/s]

Tokenizing Movie_ep3 Test:   0%|          | 0/16 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.0138
200,0.842
300,0.8151
400,0.7865
500,0.7845
600,0.7809
700,0.7491
800,0.7534
900,0.7426
1000,0.7213


[TEST] Movie_ep3  acc=0.7004  f1=0.6238

[GRID] Movie  epoch=4


Tokenizing Movie_ep4 Train:   0%|          | 0/122 [00:00<?, ?it/s]

Tokenizing Movie_ep4 Val:   0%|          | 0/16 [00:00<?, ?it/s]

Tokenizing Movie_ep4 Test:   0%|          | 0/16 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.0011
200,0.8368
300,0.8124
400,0.7881
500,0.7843
600,0.7814
700,0.7534
800,0.7522
900,0.7431
1000,0.7212


[TEST] Movie_ep4  acc=0.6932  f1=0.6152

[GRID] Movie  epoch=5


Tokenizing Movie_ep5 Train:   0%|          | 0/122 [00:00<?, ?it/s]

Tokenizing Movie_ep5 Val:   0%|          | 0/16 [00:00<?, ?it/s]

Tokenizing Movie_ep5 Test:   0%|          | 0/16 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.0221
200,0.8436
300,0.8154
400,0.7889
500,0.7859
600,0.7836
700,0.7507
800,0.7561
900,0.7463
1000,0.7255


[TEST] Movie_ep5  acc=0.6875  f1=0.6129
[SELECT] Movie: choose epoch=3  (best F1=0.6181)

[GRID] Twitter  epoch=4


Tokenizing Twitter_ep4 Train:   0%|          | 0/21 [00:00<?, ?it/s]

Tokenizing Twitter_ep4 Val:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing Twitter_ep4 Test:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.0435
200,0.8411
300,0.7544
400,0.6318
500,0.5633
600,0.4557


[TEST] Twitter_ep4  acc=0.6014  f1=0.4402

[GRID] Twitter  epoch=5


Tokenizing Twitter_ep5 Train:   0%|          | 0/21 [00:00<?, ?it/s]

Tokenizing Twitter_ep5 Val:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing Twitter_ep5 Test:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.0262
200,0.8393
300,0.7566
400,0.627
500,0.5581
600,0.4385
700,0.3997
800,0.3575


[TEST] Twitter_ep5  acc=0.6064  f1=0.4740

[GRID] Twitter  epoch=6


Tokenizing Twitter_ep6 Train:   0%|          | 0/21 [00:00<?, ?it/s]

Tokenizing Twitter_ep6 Val:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing Twitter_ep6 Test:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Step,Training Loss
100,1.0399
200,0.8365
300,0.7491
400,0.6117
500,0.5377
600,0.4115
700,0.3791
800,0.3362
900,0.294


[TEST] Twitter_ep6  acc=0.6029  f1=0.4663
[SELECT] Twitter: choose epoch=5  (best F1=0.4605)


In [8]:
# save metadata
# Recomputes the train/val/test index split for every dataset and stores it
# in a pickle under DRIVE_PATH, independently of the training cells.

META_PATH  = os.path.join(DRIVE_PATH, "all_results_meta.pkl")
# These values reproduce the split settings hard-coded in train_and_eval
# (20% hold-out, halved into val/test, random_state=42); keep both in sync.
RAND_SEED  = 42
# Proportions of the dataset to hold out for testing and validation
TEST_PROP  = 0.10
VAL_PROP   = 0.10

# Load existing metadata if it exists, otherwise start with an empty dict
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a file that this project wrote itself.
if os.path.exists(META_PATH):
    with open(META_PATH, "rb") as f:
        meta = pickle.load(f)
else:
    meta = {}

# For each dataset (Amazon, Movie, Twitter), split indices into train/val/test and save the splits into the metadata dictionary
for ds in ["Amazon", "Movie", "Twitter"]:
    # Work on a copy so the helper column is not added to the shared frame.
    df = datasets[ds]["df"].copy()
    idx_all = df.index.to_numpy()

    # Stratification label: ratings 1-5 shifted to 0-4. An existing
    # 'model_label' column is reused as is.
    if "model_label" not in df.columns:
        df["model_label"] = df["label_5class"] - 1

    # First cut: hold out the combined validation + test share.
    train_idx, temp_idx = train_test_split(
        idx_all,
        test_size=TEST_PROP + VAL_PROP,
        random_state=RAND_SEED,
        stratify=df["model_label"]
    )

    # Second cut: divide the hold-out into validation and test.
    val_idx, test_idx = train_test_split(
        temp_idx,
        test_size=TEST_PROP / (TEST_PROP + VAL_PROP),
        random_state=RAND_SEED,
        stratify=df.loc[temp_idx, "model_label"]
    )

    print(f"{ds:<8s} | train={len(train_idx):5d}  val={len(val_idx):5d}  "
          f"test={len(test_idx):5d}")

    # Keep any other entries already stored for this dataset; only the three
    # index lists are (over)written. Indices are saved as plain Python lists.
    meta.setdefault(ds, {})
    meta[ds]["train_idx"] = train_idx.tolist()
    meta[ds]["val_idx"]   = val_idx.tolist()
    meta[ds]["test_idx"]  = test_idx.tolist()

# save meta
with open(META_PATH, "wb") as f:
    pickle.dump(meta, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f"\n saved to {META_PATH}")




Amazon   | train=141123  val=17640  test=17641
Movie    | train=124848  val=15606  test=15606
Twitter  | train=20711  val= 2589  test= 2589

 saved to output\all_results_meta.pkl
