# Setup Environment

In [None]:
# Std libraries
import os
import re
import gc
import html
import time
import io
import hashlib
import tarfile
import json
from collections import namedtuple

# 3rd party libraries
import joblib
import torch
import pandas as pd
import numpy as np
import sklearn.metrics as skm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm.auto import tqdm
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, DataCollatorWithPadding
from torch.amp import GradScaler, autocast
import torch.optim as optim
from torch.optim.lr_scheduler import LinearLR
import matplotlib.gridspec as gridspec

In [None]:
# Define dataframe columns from source
CAMEL_NOTE_ID = "noteId"
RATER_PARTICIPANT_ID = "raterParticipantId"
HELPFULNESS_LEVEL_KEY = "helpfulnessLevel"
HELPFUL_NUM_KEY = "helpfulNum"
CORE_NOTE_INTERCEPT = "coreNoteIntercept"
EXPANSION_NOTE_INTERCEPT = "expansionNoteIntercept"
EXPANSION_PLUS_NOTE_INTERCEPT = "expansionPlusNoteIntercept"
CORE_NOTE_FACTOR = "coreNoteFactor1"
EXPANSION_NOTE_FACTOR = "expansionNoteFactor1"
EXPANSION_PLUS_NOTE_FACTOR = "expansionPlusNoteFactor1"
EXPANSION_RATER_INTERCEPT = "expansionRaterIntercept"
EXPANSION_RATER_FACTOR = "expansionRaterFactor1"
CURRENT_LABEL = "currentStatus"
AUTHOR_CLASSIFICATION = "author_classification"

In [None]:
# Define dataframe columns for H and NH tags
H_TAGS = [
  "helpfulOther",
  "helpfulClear",
  "helpfulGoodSources",
  "helpfulAddressesClaim",
  "helpfulImportantContext",
  "helpfulUnbiasedLanguage"
]
NH_TAGS = [
  "notHelpfulOther",
  "notHelpfulIncorrect",
  "notHelpfulSourcesMissingOrUnreliable",
  "notHelpfulMissingKeyPoints",
  "notHelpfulHardToUnderstand",
  "notHelpfulArgumentativeOrBiased",
  "notHelpfulSpamHarassmentOrAbuse",
  "notHelpfulIrrelevantSources",
  "notHelpfulOpinionSpeculation",
  "notHelpfulNoteNotNeeded"
]

In [None]:
# Define dataframe values from source
HELPFUL_VALUE_TSV = "HELPFUL"
NOT_HELPFUL_VALUE_TSV = "NOT_HELPFUL"
SOMEWHAT_HELPFUL_VALUE_TSV = "SOMEWHAT_HELPFUL"
MISINFORMED_OR_POTENTIALLY_MISLEADING = "MISINFORMED_OR_POTENTIALLY_MISLEADING"
CURRENTLY_RATED_HELPFUL = "CURRENTLY_RATED_HELPFUL"
NEEDS_MORE_RATINGS = "NEEDS_MORE_RATINGS"
CURRENTLY_RATED_NOT_HELPFUL = "CURRENTLY_RATED_NOT_HELPFUL"
NOT_MISLEADING = "NOT_MISLEADING"

In [None]:
# Define dataframe notebook columns related to text and language
NOTE_ID = "note_id"
TWEET_ID = "tweet_id"
NOTE_TEXT = "note_text"
TWEET_TEXT = "tweet_text"
TWEET_SHORTEN_URLS = "tweet_shorten_urls"
TWEET_EXPANDED_URLS = "tweet_expanded_urls"
NOTE_TEXT_UNESCAPED = "note_text_unescaped"
TWEET_TEXT_UNESCAPED = "tweet_text_unescaped"
NOTE_TEXT_FINAL = "note_text_final"
TWEET_TEXT_FINAL = "tweet_text_final"
NOTE_TEXT_NO_URLS = "note_text_no_urls"
TWEET_TEXT_NO_URLS = "tweet_text_no_urls"
NOTE_LANG = "note_lang"
TWEET_LANG = "tweet_lang"
NOTE_LANG_CONFIDENCE = "note_lang_confidence"
TWEET_LANG_CONFIDENCE = "tweet_lang_confidence"
NOTE_LANG_INFERRED = "note_lang_inferred"
TWEET_LANG_INFERRED = "tweet_lang_inferred"
CLUSTER_ID = "cluster_id"
CLUSTER_TEXT = "cluster_text"

In [None]:
# Define dataframe notebook columns related to labeling
RATING_WEIGHT = "rating_weight"
TOTAL_SIGNAL_WEIGHT = "total_signal_weight"
TOTAL_RATINGS = "total_ratings"
PREDICTED_HELPFULNESS = "predicted_helpfulness"
RELEVANCE = "relevance"
CLASSIFICATION = "classification"
INTERCEPT = "intercept"
FACTOR = "factor"
CRH = "crh"
NMR = "nmr"
CRNH = "crnh"

In [None]:
# Additional constants
CUDA = "cuda"
CPU = "cpu"
ROOT = os.path.expanduser("~/workspace")
HF_ROOT = os.path.join(ROOT, "huggingface")
MODEL_ROOT = os.path.join(HF_ROOT, "models")
MODEL_DIR = "model"
TOKENIZER_DIR = "tokenizer"
LANGUAGE_DETECTION_MODEL = "xlm-roberta-base-language-detection"
DISTILROBERTA_BASE_MODEL = "distilroberta-base"
ALL_MINILM_L6_V2 = "all-MiniLM-L6-v2"
ALL_MPNET_BASE_V2 = "all-mpnet-base-v2"
DATA_ROOT = os.path.join(ROOT, "datasets/helpfulness")
EXPANSION_GLOBAL_BIAS = 0.17178
SEED = 42

In [None]:
# Configure environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Define helper for monitoring GPU memory usage
def get_gpu_stats():
  """Return a DataFrame of per-GPU stats as reported by `nvidia-smi`.

  The command is run through `subprocess` rather than IPython's `!` syntax so
  the helper also works outside a notebook. nvidia-smi separates CSV fields
  with ", ", so every cell (including the header) is stripped of surrounding
  whitespace.

  Returns:
    pd.DataFrame with one row per GPU, or an empty DataFrame when nvidia-smi
    is unavailable or exits with an error.
  """
  import subprocess

  cmd = [
    "nvidia-smi",
    "--query-gpu=index,name,memory.used,memory.total,utilization.gpu",
    "--format=csv",
  ]
  try:
    completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
  except (OSError, subprocess.CalledProcessError):
    # Monitoring is best-effort: no driver / no binary should not break the notebook.
    return pd.DataFrame()
  rows = [
    [cell.strip() for cell in line.split(",")]
    for line in completed.stdout.splitlines()
    if line.strip()
  ]
  if not rows:
    return pd.DataFrame()
  # First line is the CSV header, remaining lines are one GPU each.
  return pd.DataFrame(rows[1:], columns=rows[0])

get_gpu_stats()

# Load and Prepare Dataset

## Load and Prune Data

In [None]:
# Load scoring inputs
ratings = pd.read_parquet(os.path.join(DATA_ROOT, "ratings.parquet"))
notes = pd.read_parquet(os.path.join(DATA_ROOT, "notes.parquet"))
nsh = pd.read_parquet(os.path.join(DATA_ROOT, "note_status_history.parquet"))                                   

In [None]:
# Load scoring outputs
scoredRaters = pd.read_parquet(os.path.join(DATA_ROOT, "scored_raters.parquet"))
scoredNotes = pd.read_parquet(os.path.join(DATA_ROOT, "scored_notes.parquet"))

In [None]:
# Load posts
posts = pd.read_parquet(os.path.join(DATA_ROOT, "posts.parquet"))
print(len(posts))

In [None]:
# Load clusters
clusters = pd.read_parquet(os.path.join(DATA_ROOT, "clusters_1753217043.parquet"))
with open(os.path.join(DATA_ROOT, "top_words.json"), "r") as handle:
  topWords = json.load(handle)

In [None]:
# Standardize types
ratings[RATER_PARTICIPANT_ID] = ratings[RATER_PARTICIPANT_ID].astype(np.int64)
scoredRaters[RATER_PARTICIPANT_ID] = scoredRaters[RATER_PARTICIPANT_ID].astype(np.int64)

In [None]:
# Standardize note_id and classification column names
ratings = ratings.rename(columns={CAMEL_NOTE_ID: NOTE_ID})
notes = notes.rename(columns={CAMEL_NOTE_ID: NOTE_ID, "classification": AUTHOR_CLASSIFICATION})
nsh = nsh.rename(columns={CAMEL_NOTE_ID: NOTE_ID})
scoredNotes = scoredNotes.rename(columns={CAMEL_NOTE_ID: NOTE_ID})

In [None]:
# Prune columns
ratings = ratings[[NOTE_ID, RATER_PARTICIPANT_ID, HELPFULNESS_LEVEL_KEY] + H_TAGS + NH_TAGS]
posts = posts[[NOTE_ID, TWEET_ID, NOTE_TEXT, TWEET_TEXT, TWEET_SHORTEN_URLS, TWEET_EXPANDED_URLS]]

## Compute Weighted Tag Ratios

In [None]:
# Augment ratings with standardized helpfulness level and scoring results for notes and raters
def add_level_and_scoring_results(ratings, scoredRaters, scoredNotes):
  """Attach a numeric helpfulness level and rater/note scoring parameters to ratings.

  Args:
    ratings: DataFrame containing NOTE_ID, RATER_PARTICIPANT_ID and
      HELPFULNESS_LEVEL_KEY columns. The caller's frame is not modified.
    scoredRaters: DataFrame with the expansion rater factor and intercept.
    scoredNotes: DataFrame with the expansion note factor and intercept.

  Returns:
    A new DataFrame restricted to ratings that have a recognized helpfulness
    level and non-null rater and note parameters. HELPFULNESS_LEVEL_KEY is
    replaced by HELPFUL_NUM_KEY (1.0 / 0.5 / 0.0).
  """
  levelToNum = {
    HELPFUL_VALUE_TSV: 1.0,
    SOMEWHAT_HELPFUL_VALUE_TSV: 0.5,
    NOT_HELPFUL_VALUE_TSV: 0.0,
  }
  print(f"Original ratings: {len(ratings)}")
  # assign() builds a new frame, so the input DataFrame is left untouched.
  # Levels outside the mapping become NaN and are dropped on the next line.
  helpfulNum = ratings[HELPFULNESS_LEVEL_KEY].map(levelToNum).astype(np.float64)
  ratings = ratings.assign(**{HELPFUL_NUM_KEY: helpfulNum})
  ratings = ratings[ratings[HELPFUL_NUM_KEY].notna()].drop(columns=HELPFULNESS_LEVEL_KEY)
  print(f"Ratings with helpfulNum: {len(ratings)}")
  # Augment with scoring results. Join keys are explicit so an unrelated
  # shared column name can never silently become part of the join.
  raterParams = scoredRaters[[RATER_PARTICIPANT_ID, EXPANSION_RATER_FACTOR, EXPANSION_RATER_INTERCEPT]].dropna()
  noteParams = scoredNotes[[NOTE_ID, EXPANSION_NOTE_FACTOR, EXPANSION_NOTE_INTERCEPT]].dropna()
  ratings = ratings.merge(raterParams, on=RATER_PARTICIPANT_ID)
  ratings = ratings.merge(noteParams, on=NOTE_ID)
  print(f"Ratings with scoring results: {len(ratings)}")
  assert ratings.isna().sum().sum() == 0
  return ratings

ratings = add_level_and_scoring_results(ratings.copy(), scoredRaters, scoredNotes)

In [None]:
# Generate a prediction of how we expect a rater to rate a note based on the learned viewpoint
# representation for the note and rater, as well as bias terms.
# Notice that we add the mean of the note intercept to shift predictions appropriately without
# actually incorporating quality signal specific to the note.
def add_prediction(ratings):
  """Add a PREDICTED_HELPFULNESS column to `ratings` in place and return it.

  The prediction is rater factor * note factor + rater intercept + the mean
  note intercept over all rows + EXPANSION_GLOBAL_BIAS. Only the mean note
  intercept is used, so no per-note intercept enters the prediction.
  """
  viewpointTerm = ratings[EXPANSION_RATER_FACTOR] * ratings[EXPANSION_NOTE_FACTOR]
  meanNoteIntercept = ratings[EXPANSION_NOTE_INTERCEPT].mean()
  prediction = viewpointTerm + ratings[EXPANSION_RATER_INTERCEPT]
  prediction = prediction + meanNoteIntercept
  prediction = prediction + EXPANSION_GLOBAL_BIAS
  ratings[PREDICTED_HELPFULNESS] = prediction
  return ratings

ratings = add_prediction(ratings)

In [None]:
# Profile learned representation
def profile_params(scoredRaters, scoredNotes):
  """Plot 50-bin histograms of rater/note expansion factors and intercepts side by side."""
  fig, axes = plt.subplots(1, 4)
  fig.set_figwidth(30)
  fig.set_figheight(5)
  # (source frame, column, panel title) for each of the four panels, left to right
  panels = [
    (scoredRaters, EXPANSION_RATER_FACTOR, "Rater Factors"),
    (scoredRaters, EXPANSION_RATER_INTERCEPT, "Rater Intercepts"),
    (scoredNotes, EXPANSION_NOTE_FACTOR, "Note Factors"),
    (scoredNotes, EXPANSION_NOTE_INTERCEPT, "Note Intercepts"),
  ]
  for axis, (frame, column, title) in zip(axes, panels):
    frame[column].plot.hist(bins=50, ax=axis, title=title)

profile_params(scoredRaters, scoredNotes)

In [None]:
# View distribution of predictions
def plot_predictions(predRatings):
  """Plot histograms of predicted helpfulness: all ratings, then one panel per helpfulness level."""
  fig, axes = plt.subplots(1, 4, sharex=True)
  fig.set_figwidth(30)
  fig.set_figheight(5)
  # (panel title, helpfulNum value to filter on); None means no filtering
  panels = [
    ("All", None),
    ("Helpful", 1.0),
    ("Somewhat Helpful", 0.5),
    ("Not Helpful", 0.0),
  ]
  for axis, (title, level) in zip(axes, panels):
    if level is None:
      subset = predRatings
    else:
      subset = predRatings[predRatings[HELPFUL_NUM_KEY] == level]
    subset[PREDICTED_HELPFULNESS].plot.hist(bins=50, ax=axis)
    axis.set_title(title)

plot_predictions(ratings)

In [None]:
# Plot predictions passed through sigmoid for weighting
def plot_pred_sigmoid(predRatings):
  """Plot the sigmoid-transformed predictions for several candidate scale multipliers."""
  fig, axes = plt.subplots(1, 4, sharex=True)
  fig.set_figwidth(30)
  fig.set_figheight(5)
  spread = predRatings[PREDICTED_HELPFULNESS].std()
  for axis, multiplier in zip(axes, [1, 1.5, 2, 3]):
    scale = multiplier / spread
    # Predictions are centered at .5 (the MF treats Helpful as 1 and Not Helpful as 0),
    # scaled by multiplier / std to control how sharply ratings are weighted,
    # then passed through a sigmoid.
    centered = predRatings[PREDICTED_HELPFULNESS] - .5
    weights = (1 + np.exp(-1 * scale * centered)) ** -1
    weights.plot.hist(bins=50, title=f"factor={multiplier}", ax=axis)

plot_pred_sigmoid(ratings)

In [None]:
# Determine rating weights with multiplier=1
def get_weighted_tag_ratios(ratings, multiplier):
  """Aggregate per-note weighted tag totals and ratios.

  Each rating gets a weight w = sigmoid(multiplier / std * (prediction - .5)).
  Not-helpful tags are multiplied by w and helpful tags by (1 - w), then all
  values are summed per note.

  Args:
    ratings: DataFrame with NOTE_ID, PREDICTED_HELPFULNESS and the tag columns.
      It is copied, not modified.
    multiplier: scale applied to the standardized prediction before the sigmoid.

  Returns:
    DataFrame with one row per note: NOTE_ID, TOTAL_SIGNAL_WEIGHT (sum of w),
    the weighted tag totals, and a "<tag>_ratio" column per tag.
  """
  weighted = ratings.copy()
  scale = multiplier / weighted[PREDICTED_HELPFULNESS].std()
  weighted[RATING_WEIGHT] = ((1 + np.exp(-1 * scale * (weighted[PREDICTED_HELPFULNESS] - .5))) ** -1)
  tagGroups = [
    (NH_TAGS, weighted[RATING_WEIGHT]),
    (H_TAGS, 1 - weighted[RATING_WEIGHT]),
  ]
  for tags, weight in tagGroups:
    for tag in tags:
      weighted[tag] = weighted[tag] * weight
  allTags = NH_TAGS + H_TAGS
  scores = weighted[[NOTE_ID, RATING_WEIGHT] + allTags].groupby(NOTE_ID).sum()
  scores = scores.reset_index(drop=False).rename(columns={RATING_WEIGHT: TOTAL_SIGNAL_WEIGHT})
  # NOTE(review): helpful tags are weighted by (1 - w) but their ratio is still
  # divided by the sum of w, the same denominator as not-helpful tags --
  # confirm this is intended.
  for tag in allTags:
    scores[f"{tag}_ratio"] = scores[tag] / scores[TOTAL_SIGNAL_WEIGHT]
  print(f"Total notes: {len(scores)}")
  return scores

weightedTagRatios = get_weighted_tag_ratios(ratings, 1)
weightedTagRatios

## Assemble Dataset

In [None]:
# Coalesce note factors
def get_note_factor(scoredNotes):
  """Coalesce the note factor with precedence core > expansion > expansionPlus.

  Args:
    scoredNotes: DataFrame with NOTE_ID and the three per-model factor columns.

  Returns:
    DataFrame with columns [NOTE_ID, FACTOR] sharing the index of `scoredNotes`.
    FACTOR is the first non-null value in precedence order, or NaN when all
    three are null.
  """
  core = scoredNotes[CORE_NOTE_FACTOR].to_numpy()
  expansion = scoredNotes[EXPANSION_NOTE_FACTOR].to_numpy()
  expansionPlus = scoredNotes[EXPANSION_PLUS_NOTE_FACTOR].to_numpy()
  # Vectorized coalesce on raw arrays: avoids a Python-level loop over every
  # note and any index alignment between the three columns.
  fallback = np.where(pd.notna(expansion), expansion, expansionPlus)
  result = scoredNotes[[NOTE_ID]].copy()
  result[FACTOR] = np.where(pd.notna(core), core, fallback)
  return result

noteFactors = get_note_factor(scoredNotes)
noteFactors.merge(scoredNotes[[NOTE_ID, CORE_NOTE_FACTOR, EXPANSION_NOTE_FACTOR, EXPANSION_PLUS_NOTE_FACTOR]])

In [None]:
# Coalesce note intercepts
def get_note_intercept(scoredNotes):
  """Coalesce the note intercept with precedence core > expansion > expansionPlus.

  Args:
    scoredNotes: DataFrame with NOTE_ID and the three per-model intercept columns.

  Returns:
    DataFrame with columns [NOTE_ID, INTERCEPT] sharing the index of
    `scoredNotes`. INTERCEPT is the first non-null value in precedence order,
    or NaN when all three are null.
  """
  core = scoredNotes[CORE_NOTE_INTERCEPT].to_numpy()
  expansion = scoredNotes[EXPANSION_NOTE_INTERCEPT].to_numpy()
  expansionPlus = scoredNotes[EXPANSION_PLUS_NOTE_INTERCEPT].to_numpy()
  # Vectorized coalesce on raw arrays: avoids a Python-level loop over every
  # note and any index alignment between the three columns.
  fallback = np.where(pd.notna(expansion), expansion, expansionPlus)
  result = scoredNotes[[NOTE_ID]].copy()
  result[INTERCEPT] = np.where(pd.notna(core), core, fallback)
  return result

noteIntercepts = get_note_intercept(scoredNotes)
noteIntercepts.merge(scoredNotes[[NOTE_ID, CORE_NOTE_INTERCEPT, EXPANSION_NOTE_INTERCEPT, EXPANSION_PLUS_NOTE_INTERCEPT]])

In [None]:
# Combine all signals for a final export dataset
def prepare_dataset(posts, noteFactors, noteIntercepts, nsh, notes, ratings, weightedTagRatios):
  """Inner-join posts with every per-note signal into a single dataset.

  Args:
    posts: DataFrame of note/tweet text keyed by NOTE_ID.
    noteFactors: DataFrame with [NOTE_ID, FACTOR].
    noteIntercepts: DataFrame with [NOTE_ID, INTERCEPT].
    nsh: note status history; NOTE_ID and CURRENT_LABEL are used.
    notes: notes table; NOTE_ID and AUTHOR_CLASSIFICATION are used.
    ratings: ratings table; used only to count ratings per note.
    weightedTagRatios: per-note weighted tag totals and ratios.

  Returns:
    DataFrame containing only the notes present in all inputs. The row count
    before and after the joins is printed.
  """
  # Extract classifications and final note status
  classifications = notes[[NOTE_ID, AUTHOR_CLASSIFICATION]]
  finalStatus = nsh[[NOTE_ID, CURRENT_LABEL]]
  # Name the index and the counts explicitly: the column names value_counts()
  # produces differ between pandas 1.x and 2.x, so renaming "count" is fragile.
  ratingCounts = ratings[NOTE_ID].value_counts().rename_axis(NOTE_ID).reset_index(name=TOTAL_RATINGS)
  # Compose and return dataset
  print(len(posts))
  dataset = posts
  for signal in (classifications, noteIntercepts, noteFactors, finalStatus, ratingCounts, weightedTagRatios):
    dataset = dataset.merge(signal, on=NOTE_ID)
  print(len(dataset))
  return dataset

In [None]:
print(len(posts))
print(len(nsh))
print(len(notes))
print()
dataset = prepare_dataset(posts, noteFactors, noteIntercepts, nsh, notes, ratings, weightedTagRatios)
print(len(dataset))
dataset.dtypes

# Prepare Text

## Inspect Dataset

In [None]:
# Validate that text is always present
dataset[[NOTE_TEXT, TWEET_TEXT]].isna().sum()

In [None]:
# Validate that neither note text nor tweet text is ever the empty string
(dataset[[NOTE_TEXT, TWEET_TEXT]] == "").sum()

## Unescape Text

In [None]:
# Define unescape helper
def unescape(text):
  """Decode HTML entities twice (handles doubly-escaped text); non-strings pass through unchanged."""
  if not isinstance(text, str):
    return text
  onceDecoded = html.unescape(text)
  return html.unescape(onceDecoded)

In [None]:
# Unescape notes and tweets
dataset[NOTE_TEXT_UNESCAPED] = [unescape(text) for text in dataset[NOTE_TEXT]]
dataset[TWEET_TEXT_UNESCAPED] = [unescape(text) for text in dataset[TWEET_TEXT]]

In [None]:
# show text sample
for tmp in dataset[NOTE_TEXT_UNESCAPED].sample(10, random_state=SEED):
  print(tmp)
  print("------------------"*3)

In [None]:
# show text sample
for tmp in dataset[TWEET_TEXT_UNESCAPED].sample(10, random_state=SEED):
  print(tmp)
  print("------------------"*3)

## Prepare URLs

In [None]:
# Define helper to replace URLs with full text versions
def replace_urls(text, shortUrls, fullUrls, maxLength=150):
  """Swap t.co short links for their expanded URLs and drop any that remain.

  Args:
    text: tweet text to patch.
    shortUrls: sequence of short links found in the text, or None.
    fullUrls: expanded URLs, parallel to `shortUrls`.
    maxLength: expanded URLs are truncated to this many characters.

  Returns:
    The text with known short links replaced by (truncated) expanded URLs and
    all other https://t.co/ links removed.

  Raises:
    AssertionError: if `shortUrls` and `fullUrls` differ in length.
  """
  if shortUrls is not None:
    # Validate mapping and replace known links
    assert len(shortUrls) == len(fullUrls)
    for short, full in zip(shortUrls, fullUrls):
      text = text.replace(short, full[:maxLength])
  # Remove any remaining shortlinks. A raw string avoids the invalid "\S"
  # escape warning, and the dot is escaped so only the literal host t.co matches.
  return re.sub(r"https://t\.co/\S+", "", text)

In [None]:
# Generate new dataset column with patched text
dataset[TWEET_TEXT_FINAL] = [
  replace_urls(text, shortUrls, fullUrls)
  for (text, shortUrls, fullUrls)
  in dataset[[TWEET_TEXT_UNESCAPED, TWEET_SHORTEN_URLS, TWEET_EXPANDED_URLS]].values
]

In [None]:
# View a sample of patched values
# Seeded like the earlier text previews so the displayed sample is reproducible
for tmp in dataset[[NOTE_ID, TWEET_TEXT_FINAL]].sample(10, random_state=SEED).values:
  print(tmp)
  print("--------")

In [None]:
# Define helper to truncate URLs in note text
def truncate_urls(noteText, maxLength=150):
  """Shorten every URL in `noteText` to at most `maxLength` characters.

  URLs are matches of http(s)://... or www.... up to the next whitespace,
  angle bracket or double quote. All other text is left unchanged.

  Raises:
    AssertionError: if `maxLength` is negative.
  """
  assert maxLength >= 0
  urlRegex = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
  # Slicing is a no-op for URLs already within the limit.
  return urlRegex.sub(lambda found: found.group(0)[:maxLength], noteText)

print(truncate_urls("This note has no url"))
print(truncate_urls("This note has 1 url http://www.foobar.com/test/path and then more text", maxLength=15))
print(truncate_urls("This note has 1 url http://foobar.com/test/path and then more text", maxLength=15))
print(truncate_urls("This note has 1 url https://www.foobar.com/test/path and then more text", maxLength=15))
print(truncate_urls("This note has 1 url https://foobar.com/test/path and then more text", maxLength=15))
print(truncate_urls("This note has 2 url https://foobar.com/test/path and https://foobarbaz.com/test/path then more text", maxLength=15))
print(truncate_urls("This note has 2 url https://foobar.com/test/path and https://foobarbaz.com/test/path then more text", maxLength=250))

In [None]:
# Apply truncation to note text
dataset[NOTE_TEXT_FINAL] = [truncate_urls(text) for text in dataset[NOTE_TEXT_UNESCAPED]]

In [None]:
# View a sample of patched values
# Seeded like the earlier text previews so the displayed sample is reproducible
for tmp in dataset[NOTE_TEXT_FINAL].sample(10, random_state=SEED):
  print(tmp)
  print("--------")

## Inspect Final Text

In [None]:
# Validate that text is always present
dataset[[NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]].isna().sum()

In [None]:
# Validate that text is always present
(dataset[[NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]] == "").sum()  # non-zero expected because some tweets only contain a media short link

## Generate No-URL Variants

In [None]:
# Define helper to generate no-URL variants for TF-IDF characterization of clusters
def remove_urls(text):
  """Return `text` with every http(s):// or www. URL deleted; surrounding whitespace is kept."""
  urlRegex = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
  return urlRegex.sub("", text)

print(remove_urls("This note has no url"))
print(remove_urls("This note has 1 url http://www.foobar.com/test/path and then more text"))
print(remove_urls("This note has 1 url http://foobar.com/test/path and then more text"))
print(remove_urls("This note has 1 url https://www.foobar.com/test/path and then more text"))
print(remove_urls("This note has 1 url https://foobar.com/test/path and then more text"))
print(remove_urls("This note has 2 url https://foobar.com/test/path and https://foobarbaz.com/test/path then more text"))
print(remove_urls("This note has 2 url https://foobar.com/test/path and https://foobarbaz.com/test/path then more text"))

In [None]:
# Generate no-URL variants
dataset[NOTE_TEXT_NO_URLS] = [remove_urls(text) for text in dataset[NOTE_TEXT_FINAL]]
dataset[TWEET_TEXT_NO_URLS] = [remove_urls(text) for text in dataset[TWEET_TEXT_FINAL]]

# Detect Language

## Load Models and Data

In [None]:
# load model and tokenizer
langDetectionModel = nn.DataParallel(AutoModelForSequenceClassification.from_pretrained(os.path.join(MODEL_ROOT, LANGUAGE_DETECTION_MODEL, MODEL_DIR)).to(CUDA))
langDetectionTokenizer = AutoTokenizer.from_pretrained(os.path.join(MODEL_ROOT, LANGUAGE_DETECTION_MODEL, TOKENIZER_DIR))

In [None]:
get_gpu_stats()

## Apply Model

In [None]:
# View model size
print(f"{sum(tmp.numel() for tmp in langDetectionModel.parameters())//(2**20)}M")

In [None]:
# Define helper to classify a chunk
def classify_chunk(texts, model, tokenizer):
  """Classify one batch of texts and return (label, confidence) per text.

  Args:
    texts: list of strings to classify.
    model: sequence-classification model, either bare or wrapped in
      nn.DataParallel / DistributedDataParallel.
    tokenizer: callable producing the model's keyword inputs for `texts`.

  Returns:
    List of (label, confidence) tuples in input order, where confidence is the
    softmax probability of the top class as a Python float.
  """
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
  # Parallel wrappers keep the real model (and its config) under `.module`.
  isWrapped = isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel))
  core = model.module if isWrapped else model
  if not isWrapped:
    # Wrappers scatter inputs to their devices; a bare model needs them moved.
    device = getattr(core, "device", None)
    if device is not None:
      inputs = inputs.to(device)
  with torch.no_grad():
    logits = model(**inputs).logits
  preds = torch.softmax(logits, dim=-1)
  vals, idxs = torch.max(preds, dim=1)
  # Map raw predictions to languages. tolist() converts each tensor in one
  # transfer instead of one .item() call per element.
  id2lang = core.config.id2label
  return [(id2lang[k], v) for k, v in zip(idxs.tolist(), vals.tolist())]

In [None]:
# Define helper to classify larger lists
def classify_texts(texts, model, tokenizer, batchSize=1024):
  """Classify a list of texts in batches, showing a progress bar.

  Args:
    texts: list of strings to classify.
    model: model passed through to classify_chunk.
    tokenizer: tokenizer passed through to classify_chunk.
    batchSize: number of texts per classify_chunk call.

  Returns:
    A two-element list [labels, confidences], each a tuple aligned with
    `texts`. For empty input both tuples are empty, so callers can still
    unpack the result into two values.
  """
  results = []
  # Iterating the bar (rather than updating it by hand) closes it when done.
  for start in tqdm(range(0, len(texts), batchSize)):
    results.extend(classify_chunk(texts[start:start + batchSize], model, tokenizer))
  if not results:
    return [(), ()]
  return list(zip(*results))

In [None]:
# Compute note languages
noteLangs, noteConfidence = classify_texts(list(dataset[NOTE_TEXT_FINAL]), langDetectionModel, langDetectionTokenizer)

In [None]:
# Compute tweet languages
tweetLangs, tweetConfidence = classify_texts(list(dataset[TWEET_TEXT_FINAL]), langDetectionModel, langDetectionTokenizer)

In [None]:
# Augment dataset
dataset[NOTE_LANG] = noteLangs
dataset[NOTE_LANG_CONFIDENCE] = noteConfidence
dataset[TWEET_LANG] = tweetLangs
dataset[TWEET_LANG_CONFIDENCE] = tweetConfidence

## Inspect Results

In [None]:
# Profile note lang confidence
dataset[NOTE_LANG_CONFIDENCE].plot.hist(bins=50, logy=True)

In [None]:
# Profile tweet lang confidence
dataset[TWEET_LANG_CONFIDENCE].plot.hist(bins=50, logy=True)

In [None]:
# Profile note lang
dataset[NOTE_LANG].value_counts()

In [None]:
# Profile tweet lang
dataset[TWEET_LANG].value_counts()

## Set Inferred Languages

In [None]:
# Define helpers to infer note language and tweet language
def infer_note_lang(noteLang, noteConfidence, tweetLang, tweetConfidence):
  """Pick the note's language, falling back to the tweet's language.

  Returns `noteLang` when its confidence exceeds .5, otherwise `tweetLang`
  when its confidence exceeds .5, otherwise pd.NA.
  """
  threshold = .5
  if noteConfidence > threshold:
    return noteLang
  if tweetConfidence > threshold:
    return tweetLang
  return pd.NA

def infer_tweet_lang(noteLang, noteConfidence, tweetLang, tweetConfidence):
  """Pick the tweet's language, falling back to the note's language.

  Returns `tweetLang` when its confidence exceeds .5, otherwise `noteLang`
  when its confidence exceeds .5, otherwise pd.NA.
  """
  threshold = .5
  if tweetConfidence > threshold:
    return tweetLang
  if noteConfidence > threshold:
    return noteLang
  return pd.NA

In [None]:
# Apply inference
dataset[NOTE_LANG_INFERRED] = [
  infer_note_lang(noteLang, noteConfidence, tweetLang, tweetConfidence)
  for (noteLang, noteConfidence, tweetLang, tweetConfidence)
  in dataset[[NOTE_LANG, NOTE_LANG_CONFIDENCE, TWEET_LANG, TWEET_LANG_CONFIDENCE]].values
]
dataset[TWEET_LANG_INFERRED] = [
  infer_tweet_lang(noteLang, noteConfidence, tweetLang, tweetConfidence)
  for (noteLang, noteConfidence, tweetLang, tweetConfidence)
  in dataset[[NOTE_LANG, NOTE_LANG_CONFIDENCE, TWEET_LANG, TWEET_LANG_CONFIDENCE]].values
]

In [None]:
# Profile note lang
dataset[NOTE_LANG_INFERRED].value_counts()

In [None]:
# Profile inferred tweet lang
dataset[TWEET_LANG_INFERRED].value_counts()

## Prune By Language

In [None]:
# Restrict to EN notes and posts
print(len(dataset))
enDataset = dataset[
  (dataset[NOTE_LANG_INFERRED] == "en")
  & (dataset[TWEET_LANG_INFERRED] == "en")
]
print(len(enDataset))

In [None]:
# Save dataset with augmented text, labeling signals and language
enDataset.to_parquet(os.path.join(DATA_ROOT, "augmented_en_posts_with_signals_and_langs.parquet"))

# Prepare Model Training

## Profile Dataset

In [None]:
tmp = enDataset[TOTAL_RATINGS].clip(0, 200).plot.hist(bins=40, cumulative=True)
tmp.axvline(16, color="red")

In [None]:
# Show rating distribution
tmp = enDataset[[TOTAL_RATINGS, CURRENT_LABEL]].copy()
tmp[CRH] = tmp[CURRENT_LABEL] == CURRENTLY_RATED_HELPFUL
tmp[NMR] = tmp[CURRENT_LABEL] == NEEDS_MORE_RATINGS
tmp[CRNH] = tmp[CURRENT_LABEL] == CURRENTLY_RATED_NOT_HELPFUL
tmp[TOTAL_RATINGS] = [min(count, 200) // 4 for count in tmp[TOTAL_RATINGS]]
tmp = tmp[[TOTAL_RATINGS, CRH, NMR, CRNH]]
tmp.groupby(TOTAL_RATINGS).mean().sort_values(TOTAL_RATINGS)

## Generate Labels & Splits

In [None]:
# Define helper to apply thresholds and generate multitask labels
def _assign_binary_labels(output, col, posRows, negRows):
  """Write 1.0 / 0.0 into output[col] for the given masks and NaN everywhere else."""
  # Plain `bool` is used instead of `np.bool`, which is missing from NumPy 1.24-1.26.
  posMask = np.asarray(posRows, dtype=bool)
  negMask = np.asarray(negRows, dtype=bool)
  # A row can never be both a positive and a negative example.
  assert (posMask & negMask).sum() == 0
  output[col] = np.nan
  output.loc[posMask, col] = 1.0
  output.loc[negMask, col] = 0.0


def make_multitask_dataset(
  dataset,
  # Tag thresholds
  minTotalSignal=3,
  minPosNotHelpfulRatio=.25,
  minPosHelpfulRatio=.5,
  minPosSignal=2.5,
  maxNegRatio=0.1,
  # Thresholds for intercept, factor and status targets
  crnhInterceptThreshold=0,
  crhPosInterceptThreshold=0.4,
  crhNegInterceptThreshold=0.3,
  minStatusRatings=15,
):
  """Apply thresholds to the signal dataset and build multitask training labels.

  Every label column holds 1.0 (positive), 0.0 (negative) or NaN (no label);
  INTERCEPT and FACTOR hold the raw value or NaN.

  Args:
    dataset: DataFrame with text, author classification, intercept, factor,
      status, rating count and weighted tag columns.
    minTotalSignal: minimum TOTAL_SIGNAL_WEIGHT for any tag label.
    minPosNotHelpfulRatio: minimum tag ratio for a positive not-helpful tag.
    minPosHelpfulRatio: minimum tag ratio for a positive helpful tag.
    minPosSignal: minimum weighted tag total for a positive tag.
    maxNegRatio: maximum tag ratio for a negative tag.
    crnhInterceptThreshold: intercept boundary for CRNH and relevance labels.
    crhPosInterceptThreshold: minimum intercept for a positive CRH label.
    crhNegInterceptThreshold: intercepts below this may be negative CRH labels.
    minStatusRatings: minimum rating count for intercept, factor, CRH and
      CRNH labels.

  Returns:
    DataFrame with ids, final text and one column per target: helpful tags,
    not-helpful tags, relevance, classification, intercept, factor, CRNH, CRH.

  Raises:
    AssertionError: if positive and negative masks overlap for any target, or
      if an author classification is neither of the two known values.
  """
  print(f"Initial dataset length: {len(dataset)}")
  dataset = dataset[dataset[AUTHOR_CLASSIFICATION].notna()]
  print(f"Dataset with classification: {len(dataset)}")
  output = dataset[[NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]].copy()
  # Masks shared by several targets
  isMisleading = dataset[AUTHOR_CLASSIFICATION] == MISINFORMED_OR_POTENTIALLY_MISLEADING
  isNotMisleading = dataset[AUTHOR_CLASSIFICATION] == NOT_MISLEADING
  hasStatusRatings = dataset[TOTAL_RATINGS] >= minStatusRatings
  tooFewRatings = dataset[TOTAL_RATINGS] < minStatusRatings
  # Tag labels only apply to notes that claim the post is misleading and that
  # carry enough total signal.
  tagEligible = (dataset[TOTAL_SIGNAL_WEIGHT] >= minTotalSignal) & isMisleading
  # Helpful and not-helpful tags differ only in the positive ratio threshold.
  for tags, minPosRatio in ((H_TAGS, minPosHelpfulRatio), (NH_TAGS, minPosNotHelpfulRatio)):
    for col in tags:
      ratio = dataset[f"{col}_ratio"]
      posRows = tagEligible & (ratio >= minPosRatio) & (dataset[col] >= minPosSignal)
      negRows = tagEligible & (ratio <= maxNegRatio)
      _assign_binary_labels(output, col, posRows, negRows)
  # Prepare relevance labels: real pairs are positives unless the note is
  # NOT_MISLEADING, has a below-threshold intercept, or is rated not helpful.
  output[RELEVANCE] = 1.0
  output.loc[isNotMisleading, RELEVANCE] = np.nan
  output.loc[dataset[INTERCEPT] < crnhInterceptThreshold, RELEVANCE] = np.nan
  output.loc[dataset[CURRENT_LABEL] == CURRENTLY_RATED_NOT_HELPFUL, RELEVANCE] = np.nan
  # Prepare classification labels; every row must carry one of the two values.
  assert (isMisleading | isNotMisleading).sum() == len(dataset)
  _assign_binary_labels(output, CLASSIFICATION, isMisleading, isNotMisleading)
  # Prepare intercept and factor labels: blank out notes with too few ratings.
  for col in (INTERCEPT, FACTOR):
    output[col] = dataset[col]
    output.loc[tooFewRatings, col] = np.nan
  # Prepare CRNH labels
  isCrnh = dataset[CURRENT_LABEL] == CURRENTLY_RATED_NOT_HELPFUL
  posRows = isCrnh & (dataset[INTERCEPT] < crnhInterceptThreshold) & hasStatusRatings
  negRows = ~isCrnh & (dataset[INTERCEPT] > crnhInterceptThreshold) & hasStatusRatings
  _assign_binary_labels(output, CRNH, posRows, negRows)
  # Prepare CRH labels
  isCrh = dataset[CURRENT_LABEL] == CURRENTLY_RATED_HELPFUL
  posRows = isCrh & (dataset[INTERCEPT] >= crhPosInterceptThreshold) & hasStatusRatings
  negRows = ~isCrh & (dataset[INTERCEPT] < crhNegInterceptThreshold) & hasStatusRatings
  _assign_binary_labels(output, CRH, posRows, negRows)
  # Summarize output
  print(f"Final dataset length: {len(output)}")
  return output

In [None]:
# Present labeled dataset
labeledDataset = make_multitask_dataset(enDataset)
labeledDataset.dtypes

In [None]:
# Summarize labels
def count_values_per_column(df):
  """Summarize label availability for each column of `df`.

  For binary label columns the counts of NaN, 0 and 1 values are reported.
  For the INTERCEPT and FACTOR columns, zeroCount is pd.NA and oneCount holds
  the number of non-null values.

  Returns:
    DataFrame with columns [columnName, nanCount, zeroCount, oneCount].
  """
  summary = []
  for col in df.columns:
    if col in [INTERCEPT, FACTOR]:
      # Regression targets: only presence/absence is meaningful
      summary.append({
        "columnName": col,
        "nanCount": df[col].isna().sum(),
        "zeroCount": pd.NA,
        "oneCount": df[col].notna().sum(),
      })
      continue
    # Binary targets: values absent from the column count as 0
    counts = df[col].value_counts(dropna=False)
    summary.append({
      "columnName": col,
      "nanCount": counts.get(np.nan, 0),
      "zeroCount": counts.get(0, 0),
      "oneCount": counts.get(1, 0),
    })
  return pd.DataFrame(summary, columns=["columnName", "nanCount", "zeroCount", "oneCount"])

count_values_per_column(labeledDataset.drop(columns=[NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]))

In [None]:
# Split dataset for training and testing
def split_dataset(dataset, trainFrac=.8, seed=SEED):
  """Split rows into train and test sets by tweet, so no tweet spans both.

  Args:
    dataset: DataFrame with a TWEET_ID column.
    trainFrac: fraction of distinct tweets assigned to the training split.
    seed: random state for the tweet sample, so the split is reproducible
      across notebook runs.

  Returns:
    (train, test) tuple of DataFrames.
  """
  trainTweets = dataset[TWEET_ID].drop_duplicates().sample(frac=trainFrac, random_state=seed)
  inTrain = dataset[TWEET_ID].isin(trainTweets)
  return dataset[inTrain], dataset[~inTrain]

trainSplit, testSplit = split_dataset(labeledDataset)
print(len(trainSplit))
print(len(testSplit))

In [None]:
# Define helper to add synthetic relevance examples
def add_relevance(dataset, negFactor=5, seed=42):
  """Append synthetic negative relevance examples and shuffle the result.

  The notes and the tweets are each repeated `negFactor` times; the tweets are
  shuffled and paired positionally with the notes. The paired rows get
  RELEVANCE = 0.0 and NaN for every other label column.

  Args:
    dataset: labeled DataFrame with note/tweet ids, text and label columns.
    negFactor: number of synthetic rows generated per original row.
    seed: random state for both the tweet shuffle and the final shuffle.

  Returns:
    DataFrame holding the original rows plus the synthetic rows, in shuffled order.
  """
  # NOTE(review): the shuffle can pair a note with its own tweet, which would be
  # labeled as a negative -- presumably rare enough to ignore; confirm.
  repeatedNotes = pd.concat([dataset[[NOTE_ID, NOTE_TEXT_FINAL]]] * negFactor)
  repeatedNotes = repeatedNotes.reset_index(drop=True)
  shuffledTweets = pd.concat([dataset[[TWEET_ID, TWEET_TEXT_FINAL]]] * negFactor)
  shuffledTweets = shuffledTweets.sample(frac=1., random_state=seed).reset_index(drop=True)
  negatives = pd.concat([repeatedNotes, shuffledTweets], axis=1)
  keptColumns = [RELEVANCE, NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]
  for col in dataset:
    if col not in keptColumns:
      negatives[col] = np.nan
  negatives[RELEVANCE] = 0.0
  combined = pd.concat([dataset, negatives], axis=0)
  return combined.sample(frac=1., random_state=seed)

trainDataset = add_relevance(trainSplit)
print(len(trainDataset))
testDataset = add_relevance(testSplit)
print(len(testDataset))

In [None]:
count_values_per_column(trainDataset.drop(columns=[NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]))

In [None]:
count_values_per_column(testDataset.drop(columns=[NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]))

# All Signal Model

## Create Tensors

In [None]:
# Profile the amount of signal for each label
(trainDataset == 1).sum()

In [None]:
# Define which labels to include
allSignalLabels = [
  "intercept",
  "factor",
  "crh",
  "crnh",
  "classification",
  "relevance",
  "notHelpfulNoteNotNeeded",
  "notHelpfulOpinionSpeculation",
  "notHelpfulIrrelevantSources",
  "notHelpfulSpamHarassmentOrAbuse",
  "notHelpfulArgumentativeOrBiased",
  "notHelpfulMissingKeyPoints",
  "notHelpfulSourcesMissingOrUnreliable",
  "notHelpfulIncorrect",
  "helpfulImportantContext",
  "helpfulAddressesClaim",
  "helpfulGoodSources",
  "helpfulClear",
]

In [None]:
# Define helper to tokenize tweet/note pairs and build label tensors
def make_tensors(dataset, includedLabels, batchSize=1024):
  """Tokenize (tweet, note) pairs and build label and loss-mask tensors.

  Args:
    dataset: DataFrame with ids, final text and the label columns.
    includedLabels: list of label column names to emit, in output order.
    batchSize: number of text pairs tokenized per tokenizer call.

  Returns:
    Tuple of (inputIds, attentionMasks, labels, lossMask, noteIds, tweetIds).
    `labels` has missing values replaced by 0.5, and `lossMask` is 1.0 where a
    label is present and 0.0 otherwise.
  """
  # Prune columns
  dataset = dataset[[NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL] + includedLabels].copy()
  # Standardize the regression targets, but only those actually requested, so
  # a label list without intercept/factor no longer raises KeyError.
  # NOTE(review): statistics come from the frame passed in, so train and test
  # splits are each standardized with their own mean/std -- confirm intended.
  for col in (INTERCEPT, FACTOR):
    if col in includedLabels:
      dataset[col] = (dataset[col] - dataset[col].mean()) / dataset[col].std()
  # Prepare tokenizer and inputs
  tokenizer = AutoTokenizer.from_pretrained(os.path.join(MODEL_ROOT, DISTILROBERTA_BASE_MODEL, TOKENIZER_DIR))
  noteTexts = list(dataset[NOTE_TEXT_FINAL].values)
  tweetTexts = list(dataset[TWEET_TEXT_FINAL].values)
  assert len(noteTexts) == len(tweetTexts)
  # Tokenize all texts; iterating the bar closes it once the loop finishes
  inputIds = []
  attentionMasks = []
  for start in tqdm(range(0, len(noteTexts), batchSize)):
    end = start + batchSize
    batch = tokenizer(
      list(zip(tweetTexts[start:end], noteTexts[start:end])),
      max_length=512,
      truncation="longest_first",
      padding="max_length",  # Pad to max length since batches are large enough we effectively do this anyways.
      return_tensors="pt"
    )
    inputIds.append(batch["input_ids"])
    attentionMasks.append(batch["attention_mask"])
  # Generate labels and loss mask; 0.5 is only a placeholder for missing labels
  labels = torch.tensor(dataset[includedLabels].fillna(0.5).values).to(torch.float32)
  lossMask = torch.tensor(dataset[includedLabels].notna().values).to(torch.float32)
  return (
    torch.cat(inputIds, dim=0),
    torch.cat(attentionMasks, dim=0),
    labels,
    lossMask,
    torch.tensor(dataset[NOTE_ID].values),
    torch.tensor(dataset[TWEET_ID].values),
  )

In [None]:
# Obtain tokens and masks
trainTensors = make_tensors(trainDataset, allSignalLabels)
print(tuple(tmp.shape for tmp in trainTensors))
testTensors = make_tensors(testDataset, allSignalLabels)
print(tuple(tmp.shape for tmp in testTensors))

In [None]:
# Save tensors to disk
def save_tensors(tensors, fileName):
  """Write a six-tensor dataset tuple to DATA_ROOT under a timestamp-prefixed name.

  Args:
    tensors: tuple (inputIds, attentionMasks, labels, lossMask, noteIds, tweetIds).
    fileName: base file name; the current Unix time in seconds is prepended.
  """
  # Unpacking enforces that exactly six tensors were supplied
  inputIds, attentionMasks, labels, lossMask, noteIds, tweetIds = tensors
  payload = dict(
    inputIds=inputIds,
    attentionMasks=attentionMasks,
    labels=labels,
    lossMask=lossMask,
    noteIds=noteIds,
    tweetIds=tweetIds,
  )
  stamp = int(time.time())
  path = os.path.join(DATA_ROOT, f"{stamp}_{fileName}")
  print(f"Saving checkpoint to {path}")
  torch.save(payload, path)

# Persist both splits so tokenization does not have to be repeated
save_tensors(trainTensors, "train_tensors.pt")
save_tensors(testTensors, "test_tensors.pt")

In [None]:
# Generate small splits for testing (first 2000 rows of every tensor)
trainTensorsSmall = tuple(tmp[:2000] for tmp in trainTensors)
testTensorsSmall = tuple(tmp[:2000] for tmp in testTensors)

In [None]:
# Generate medium splits for testing (first 20000 rows of every tensor)
trainTensorsMedium = tuple(tmp[:20000] for tmp in trainTensors)
testTensorsMedium = tuple(tmp[:20000] for tmp in testTensors)

## Define Training Helpers

In [None]:
# Define model
class ParallelStack(nn.Module):
  """Two-layer MLP head: Linear -> ReLU -> Dropout -> Linear.

  Args:
    hFactor: multiplier applied to dim to size the hidden layer.
    nHeads: number of output values produced per input row.
    dim: size of the incoming embedding.
    dropout: dropout probability applied after the activation.
  """

  def __init__(self, hFactor, nHeads, dim=768, dropout=0.1):
    super().__init__()
    hiddenDim = int(hFactor * dim)
    self.preclassifier = nn.Linear(dim, hiddenDim)
    self.dropout = nn.Dropout(dropout)
    self.classifier = nn.Linear(hiddenDim, nHeads)

  def forward(self, embedding):
    """Map embeddings of width dim to nHeads raw outputs per row."""
    hidden = torch.relu(self.preclassifier(embedding))
    return self.classifier(self.dropout(hidden))

class MultiHeadMLPAllSignals(nn.Module):
  """DistilRoBERTa encoder feeding eight independent ParallelStack heads.

  Args:
    dim: width of the encoder embedding consumed by every head. It must match
      the hidden size of the loaded encoder.
    dropout: dropout probability used inside every head.

  Both arguments were previously accepted but ignored; they are now forwarded
  to the heads. The defaults reproduce the earlier behaviour exactly.
  """

  def __init__(self, dim=768, dropout=0.1):
    super().__init__()
    self.roberta = AutoModel.from_pretrained(os.path.join(MODEL_ROOT, DISTILROBERTA_BASE_MODEL, MODEL_DIR))
    # Attribute names and creation order are kept so existing checkpoints still load
    self.interceptPredictor = ParallelStack(12, 1, dim=dim, dropout=dropout)
    self.factorPredictor = ParallelStack(12, 1, dim=dim, dropout=dropout)
    self.crhClassifier = ParallelStack(12, 1, dim=dim, dropout=dropout)
    self.crnhClassifier = ParallelStack(12, 1, dim=dim, dropout=dropout)
    self.relevanceClassifier = ParallelStack(12, 1, dim=dim, dropout=dropout)
    self.classificationClassifier = ParallelStack(12, 1, dim=dim, dropout=dropout)
    self.helpfulTagClassifier = ParallelStack(12, 4, dim=dim, dropout=dropout)
    self.notHelpfulTagClassifier = ParallelStack(12, 8, dim=dim, dropout=dropout)

  def forward(self, inputIds, attentionMask):
    """Return one row of 18 raw outputs per input pair.

    Column order: intercept, factor, crh, crnh, classification, relevance,
    8 not-helpful tag outputs, 4 helpful tag outputs.
    """
    # Use the hidden state of the first token as the pair embedding
    embedding = self.roberta(
      input_ids=inputIds,
      attention_mask=attentionMask,
    ).last_hidden_state[:, 0]  # batch, token, dimension
    # NOTE(review): classification precedes relevance and not-helpful tags precede
    # helpful tags here, the reverse of the declaration order above. Presumably this
    # mirrors the label column order used for training; confirm against the label list.
    return torch.concat([
      self.interceptPredictor(embedding),
      self.factorPredictor(embedding),
      self.crhClassifier(embedding),
      self.crnhClassifier(embedding),
      self.classificationClassifier(embedding),
      self.relevanceClassifier(embedding),
      self.notHelpfulTagClassifier(embedding),
      self.helpfulTagClassifier(embedding),
    ], axis=1)

In [None]:
# Define a helper to prepare loss weights
def make_loss_weights(lossMask, objectiveWeights, numBatches):
  """Build a per-prediction weight matrix for the multi-objective loss.

  Args:
    lossMask: (rows, objectives) tensor with 1 where a label exists and 0 elsewhere.
    objectiveWeights: 1-D tensor of per-objective weights that sums to 1.
    numBatches: scale factor applied to the final weights.

  Returns:
    Tensor shaped like lossMask. Before scaling by numBatches, column j sums to
    objectiveWeights[j] and the whole matrix sums to 1.
  """
  numObjectives = lossMask.shape[1]
  assert objectiveWeights.shape[0] == numObjectives
  # Each labelled instance of an objective gets an equal share of that objective
  perInstance = torch.pow(lossMask.sum(axis=0), -1)
  assert objectiveWeights.shape[0] == perInstance.shape[0]
  assert np.abs(objectiveWeights.sum().item() - 1) < 1e-5
  # Spread each objective's weight over its labelled predictions
  perPrediction = lossMask * (perInstance * objectiveWeights)
  assert perPrediction.shape == lossMask.shape
  assert np.abs(perPrediction.sum().item() - 1) < 1e-5
  columnGap = (perPrediction.sum(axis=0) - objectiveWeights).abs()
  assert (columnGap < 1e-5).all().item()
  # Apply batch scaling
  return numBatches * perPrediction

# Sanity check on a toy mask: 3 rows, 4 objectives with 3, 2, 1 and 1 labelled rows
make_loss_weights(
  torch.tensor([
    [1, 1, 1, 1],
    [1, 1, 0, 0],
    [1, 0, 0, 0],
  ], dtype=torch.float32),
  torch.tensor([0.2, 0.2, 0.5, .1], dtype=torch.float32),
  10,
)

In [None]:
# Define helper to compute loss
def multihead_loss(logits, lossWeights, labels, mseMultiplier):
  """Compute the weighted element-wise loss for every head output.

  The first two columns are scored with squared error (scaled by mseMultiplier);
  all remaining columns are scored with binary cross-entropy on the raw logits.

  Args:
    logits: (rows, outputs) raw model outputs.
    lossWeights: weights with the same shape as logits.
    labels: targets with the same shape as logits.
    mseMultiplier: scale applied to the squared-error columns only.

  Returns:
    Tensor shaped like logits holding the unreduced weighted losses.
  """
  # Validate sizes match
  assert logits.shape == lossWeights.shape
  assert logits.shape == labels.shape
  regressionLoss = nn.functional.mse_loss(logits[:, :2], labels[:, :2], reduction="none") * mseMultiplier
  classificationLoss = nn.functional.binary_cross_entropy_with_logits(logits[:, 2:], labels[:, 2:], reduction="none")
  perPrediction = torch.cat([regressionLoss, classificationLoss], dim=1)
  return perPrediction * lossWeights

# Sanity check on a 2x4 toy input.
# Arguments are positional: logits, lossWeights (the 0/1 pattern), labels (all ones), mseMultiplier.
multihead_loss(
  torch.arange(-3, 5, dtype=torch.float32).reshape(2, 4),
  torch.tensor([1, 0, 1, 0, 0, 1, 0, 1], dtype=torch.float32).reshape(2, 4),
  torch.ones(8, dtype=torch.float32).reshape(2, 4),
  10,
)

In [None]:
# Define helper for applying model
def apply_model(model, dataset, device, gpuBatchSize, frac=None):
  """Run the model in inference mode over a dataset, optionally on a random subsample.

  Args:
    model: module taking (inputIds, attentionMask); it must already be in eval mode.
    dataset: tuple (inputIds, attentionMask, labels, lossMask, noteIds, tweetIds).
    device: device string used both for tensor placement and for autocast.
    gpuBatchSize: rows per device per forward pass.
    frac: if given, a fraction in (0, 1] of rows to sample at random without replacement.

  Returns:
    Tuple (labels, preds, lossMask, noteIds, tweetIds), row-aligned with each other.
    preds lives on CPU and keeps whatever dtype the autocast forward pass produced.
  """
  # Configure batching: scale the batch by the number of visible GPUs
  if device == CUDA and torch.cuda.device_count() > 1:
    batchSize = gpuBatchSize * torch.cuda.device_count()
  else:
    batchSize = gpuBatchSize
  # Prepare data
  inputIds, attentionMask, labels, lossMask, noteIds, tweetIds = dataset
  assert inputIds.shape[0] == attentionMask.shape[0] == labels.shape[0] == lossMask.shape[0] == noteIds.shape[0] == tweetIds.shape[0]
  if frac is not None:
    assert 0 < frac <= 1.
    size = int(frac * inputIds.shape[0])
    # One shared permutation keeps every tensor row-aligned after subsampling
    indices = torch.randperm(inputIds.shape[0])[:size]
    inputIds = inputIds[indices]
    attentionMask = attentionMask[indices]
    labels = labels[indices]
    lossMask = lossMask[indices]
    noteIds = noteIds[indices]
    tweetIds = tweetIds[indices]
  # Process chunks
  assert not model.training
  preds = []
  numRows = inputIds.shape[0]
  # The context manager closes the progress bar even if a forward pass fails
  with tqdm(total=int(np.ceil(numRows / batchSize))) as progress:
    for start in range(0, numRows, batchSize):
      end = start + batchSize
      with torch.no_grad():
        with autocast(device_type=device, dtype=torch.bfloat16):
          preds.append(
            model(
              inputIds[start:end].to(device),
              attentionMask[start:end].to(device),
            ).to(CPU).detach()
          )
      progress.update(1)
  preds = torch.concat(preds, axis=0)
  return labels, preds, lossMask, noteIds, tweetIds

In [None]:
# Define helper for incremental eval
def eval_model(model, trainDataset, testDataset, device, gpuBatchSize):
  """Score the model on a 1/9 sample of the training data and on the full test data.

  Returns:
    One tuple per output column:
    (trainMSE, testMSE, trainAuc, testAuc, tprAt1, tprAt5).
    The first two columns are regression targets and fill only the MSE slots.
    The remaining columns fill only the AUC/TPR slots, using -1 placeholders when
    AUC cannot be computed because a label column holds fewer than two classes.
  """
  print("Forward pass on training data:")
  allTrainLabels, allTrainPreds, allTrainMask, _, _ = apply_model(model, trainDataset, device, gpuBatchSize=gpuBatchSize, frac=(1/9))
  print("Forward pass on test data:")
  allTestLabels, allTestPreds, allTestMask, _, _ = apply_model(model, testDataset, device, gpuBatchSize=gpuBatchSize, frac=None)
  # Predictions produced under bfloat16 autocast cannot be converted with .numpy();
  # upcasting to float32 is lossless and keeps the metric code dtype-agnostic.
  allTrainPreds = allTrainPreds.to(torch.float32)
  allTestPreds = allTestPreds.to(torch.float32)

  def lacks_both_classes(columnLabels):
    # roc_auc_score raises when only one class is present (all negative OR all positive)
    return columnLabels.sum().item() == 0 or columnLabels.unique().numel() < 2

  results = []
  for i in range(allTrainLabels.shape[1]):
    trainRows = allTrainMask[:, i] == 1
    testRows = allTestMask[:, i] == 1
    trainLabels, trainPreds = allTrainLabels[trainRows, i], allTrainPreds[trainRows, i]
    testLabels, testPreds = allTestLabels[testRows, i], allTestPreds[testRows, i]
    if i < 2:
      trainMSE = nn.MSELoss()(trainPreds, trainLabels)
      testMSE = nn.MSELoss()(testPreds, testLabels)
      results.append((trainMSE, testMSE, None, None, None, None))
    else:
      if lacks_both_classes(trainLabels) or lacks_both_classes(testLabels):
        results.append((None, None, -1, -1, -1, -1))
        continue
      trainAuc = skm.roc_auc_score(trainLabels.numpy(), trainPreds.numpy())
      testAuc = skm.roc_auc_score(testLabels.numpy(), testPreds.numpy())
      fpr, tpr, _ = skm.roc_curve(testLabels.numpy(), testPreds.numpy())
      # Report the true-positive rate at the ROC points closest to 1% and 5% FPR
      tprAt1 = tpr[np.argmin(np.abs(fpr - .01))]
      tprAt5 = tpr[np.argmin(np.abs(fpr - .05))]
      results.append((None, None, trainAuc, testAuc, tprAt1, tprAt5))
  return results

In [None]:
# Define helpers to save checkpointed state
def save_checkpoint(root, epoch, batch, loss, model, optimizer, scheduler, scaler, stats):
  """Serialize the training state to "<root>/<unix-seconds>.pt".

  Args:
    root: existing directory that receives the checkpoint file.
    epoch, batch, loss, stats: bookkeeping values stored verbatim.
    model: the network, either plain or wrapped (e.g. nn.DataParallel). The
      underlying module's weights are saved so keys carry no "module." prefix.
    optimizer, scheduler, scaler: objects exposing state_dict(), or None.
  """
  # Wrapped models keep the real network under .module; plain modules are saved
  # directly instead of raising AttributeError.
  coreModel = model.module if hasattr(model, "module") else model
  checkpoint = {
    "epoch": epoch,
    "batch": batch,
    "loss": loss,
    "model": coreModel.state_dict(),
    "optimizer": optimizer.state_dict() if optimizer is not None else None,
    "scheduler": scheduler.state_dict() if scheduler is not None else None,
    "scaler": scaler.state_dict() if scaler is not None else None,
    "stats": stats,
  }
  # The file name has one-second resolution, so two saves into the same root
  # within the same second overwrite each other.
  checkpointId = str(int(time.time()))
  path = os.path.join(root, f"{checkpointId}.pt")
  print(f"Saving checkpoint to {path}")
  torch.save(checkpoint, path)

In [None]:
# Define training loop
def train_model(
  model,
  trainDataset,
  testDataset,
  includedLabels,
  numEpochs=3,
  device=CUDA,
  deepLogEvery=1,
  gpuBatchSize=32,
  learningRate=1e-5,
  logEvery=None,
  learningSchedule=True,
  objectiveWeights=None,
  mseMultiplier=.2
):
  """Fine-tune the multi-head model, evaluating and checkpointing after every epoch.

  Args:
    model: unwrapped MultiHeadMLPAllSignals-style module.
    trainDataset, testDataset: tuples
      (inputIds, attentionMask, labels, lossMask, noteIds, tweetIds).
    includedLabels: label names, in the same order as the model's output columns.
    numEpochs: number of passes over the training data.
    device: device string used for placement and autocast.
    deepLogEvery: accepted for backward compatibility; not used by this function.
    gpuBatchSize: rows per GPU per step.
    learningRate: AdamW learning rate shared by all parameter groups.
    logEvery: print the running loss every this many batches; None disables it.
    learningSchedule: if True, decay the learning rate linearly to 0 over all steps.
    objectiveWeights: per-objective weights summing to 1; uniform when None.
    mseMultiplier: scale applied to the two regression losses.

  Side effects:
    Creates a timestamped directory under DATA_ROOT and writes one checkpoint per epoch.
  """
  gc.collect()
  torch.cuda.empty_cache()
  # Set up checkpoint directory
  modelId = str(int(time.time()))
  print(f"Beginning training run for {modelId}")
  modelRoot = os.path.join(DATA_ROOT, modelId)
  os.mkdir(modelRoot)
  # Prepare data and batching
  print("Setting up training")
  model = model.to(device)
  if device == CUDA:
    # Wrap even on a single GPU: the parameter grouping below and save_checkpoint
    # both rely on the "module." wrapper, and DataParallel with one device simply
    # forwards to the wrapped module.
    model = nn.DataParallel(model)
    batchSize = gpuBatchSize * torch.cuda.device_count()
  else:
    batchSize = gpuBatchSize
  inputIds, attentionMask, labels, lossMask, _, _ = trainDataset
  numBatches = int(np.ceil(inputIds.shape[0] / batchSize))
  progress = tqdm(range(numBatches * numEpochs))
  print(f"Training data contains {inputIds.shape[0]} rows to be split into {numBatches} batches")
  # Partition parameters into the three weight-decay groups
  robertaPrefixes = ("module.roberta",)
  singleHeadPrefixes = (
    "module.interceptPredictor",
    "module.factorPredictor",
    "module.crhClassifier",
    "module.crnhClassifier",
    "module.relevanceClassifier",
    "module.classificationClassifier",
  )
  tagPrefixes = (
    "module.helpfulTagClassifier",
    "module.notHelpfulTagClassifier",
  )
  namedParams = list(model.named_parameters())
  knownPrefixes = robertaPrefixes + singleHeadPrefixes + tagPrefixes
  unexpected = [n for n, _ in namedParams if not n.startswith(knownPrefixes)]
  assert not unexpected, (
    f"Parameters outside the known groups (is the model wrapped in DataParallel?): {unexpected[:5]}"
  )
  robertaParams = [p for n, p in namedParams if n.startswith(robertaPrefixes)]
  singleHeadParams = [p for n, p in namedParams if n.startswith(singleHeadPrefixes)]
  tagParams = [p for n, p in namedParams if n.startswith(tagPrefixes)]
  print("Parameter groups:", len(robertaParams), len(singleHeadParams), len(tagParams))
  # Named `optimizer` so the module imported as `optim` is not shadowed
  optimizer = torch.optim.AdamW([
    {"params": robertaParams, "weight_decay": .01},
    {"params": singleHeadParams, "weight_decay": .1},
    {"params": tagParams, "weight_decay": .25},
  ], lr=learningRate)
  if learningSchedule:
    scheduler = LinearLR(
      optimizer,
      start_factor=1.0,  # Start at the initial learning rate
      end_factor=0.0,    # End at 0
      total_iters=(numBatches * numEpochs),  # Total number of training steps
    )
  else:
    # Must be defined: it is passed to save_checkpoint after every epoch
    scheduler = None
  scaler = GradScaler()
  if objectiveWeights is None:
    objectiveWeights = torch.ones(labels.shape[1]) / labels.shape[1]
  assert np.abs(objectiveWeights.sum().item() - 1) < 1e-5
  lossWeights = make_loss_weights(lossMask, objectiveWeights, numBatches).to(device)
  model.train()
  for epoch in range(numEpochs):
    gc.collect()
    torch.cuda.empty_cache()
    losses = []
    # Reshuffle every epoch; one permutation keeps all tensors row-aligned
    randOrder = np.random.permutation(np.arange(0, inputIds.shape[0]))
    inputIds = inputIds[randOrder]
    attentionMask = attentionMask[randOrder]
    labels = labels[randOrder]
    lossWeights = lossWeights[randOrder]
    for batch in range(numBatches):
      # Obtain batch
      start = batch * batchSize
      end = start + batchSize
      y = labels[start:end].to(device)
      with autocast(device_type=device, dtype=torch.bfloat16):
        # Forward pass
        y_hat = model(
          inputIds[start:end].to(device),
          attentionMask[start:end].to(device),
        )
        # Compute loss
        loss = multihead_loss(y_hat, lossWeights[start:end], y, mseMultiplier).sum()
      losses.append(loss.item())
      # Backward pass
      scaler.scale(loss).backward()
      # Update weights
      scaler.step(optimizer)
      scaler.update()
      if scheduler is not None:
        scheduler.step()
      # Zero out gradients
      optimizer.zero_grad()
      # Update progress bar
      progress.update(1)
      # Batch-level logging is optional; the default logEvery=None disables it
      if logEvery and batch % logEvery == 0:
        print(f"epoch={epoch:<3d}  batch={batch:<5d}  loss={np.mean(losses[-logEvery:]):7.5f}")
    # Log loss
    model.eval()
    results = eval_model(model, trainDataset, testDataset, device, gpuBatchSize=gpuBatchSize)
    stats = []
    for (trainMSE, testMSE, trainAuc, testAuc, tprAt1, tprAt5), label in zip(results, includedLabels):
      if trainMSE is None:
        print(f"{label:<40}  epoch={epoch:<3d}  loss={np.mean(losses):7.5f}  trainAuc={trainAuc:5.3f}  testAuc={testAuc:5.3f}  tpr@0.01={tprAt1:5.3f}  tpr@0.05={tprAt5:5.3f}")
      else:
        print(f"{label:<40}  epoch={epoch:<3d}  loss={np.mean(losses):7.5f}  trainMSE={trainMSE:7.5f}  testMSE={testMSE:7.5f}")
      stats.append((label, trainMSE, testMSE, trainAuc, testAuc, tprAt1, tprAt5))
    save_checkpoint(modelRoot, epoch, batch, np.mean(losses), model, optimizer, scheduler, scaler, stats)
    model.train()

In [None]:
def eval_hparams(
  trainDataset,
  testDataset,
  includedLabels=allSignalLabels,
  numEpochs=5,
  gpuBatchSize=32,
  learningRate=1e-5,
  learningSchedule=True,
  robertaWeightDecay=0.01,
  helpfulnessWeightDecay=0.1,
  relevanceWeightDecay=0.05,
  abuseWeightDecay=0.25,
  objectiveWeights=None,
  logEvery=500,
  mseMultiplier=.2
):
  """Build a fresh MultiHeadMLPAllSignals and train it with the given settings.

  The default for includedLabels is bound to the value allSignalLabels had when
  this cell was executed.

  NOTE(review): robertaWeightDecay, helpfulnessWeightDecay, relevanceWeightDecay
  and abuseWeightDecay are accepted but never forwarded to train_model below, so
  changing them has no effect in this function.
  """
  # A new model is created per call so runs never share weights
  model = MultiHeadMLPAllSignals()
  train_model(
    model,
    trainDataset,
    testDataset, 
    includedLabels,
    numEpochs=numEpochs,
    gpuBatchSize=gpuBatchSize,
    learningRate=learningRate,
    learningSchedule=learningSchedule,
    objectiveWeights=objectiveWeights,
    logEvery=logEvery,
    mseMultiplier=mseMultiplier,
  )

## Small Scale Tests

In [None]:
# Test run at small scale: one epoch over the 2000-row splits, logging every batch
eval_hparams(
  trainTensorsSmall,
  testTensorsSmall,
  numEpochs=1,
  logEvery=1,
  gpuBatchSize=32
)

In [None]:
# Test run at medium scale: three epochs over the 20000-row splits, logging every 10 batches
eval_hparams(
  trainTensorsMedium,
  testTensorsMedium,
  numEpochs=3,
  logEvery=10,
  gpuBatchSize=32
)

## Full Scale Training

In [None]:
# Train at full scale: three epochs over the complete tensors, logging every 250 batches
eval_hparams(
  trainTensors,
  testTensors,
  numEpochs=3,
  logEvery=250,
  gpuBatchSize=32
)

# Train Meta Model

## Apply Model

In [None]:
# Partition test tensors to create holdout dataset
def split_test(testTensors):
  """Cut every tensor in the tuple in half along the first axis.

  Args:
    testTensors: tuple of tensors sharing the same first-dimension length.

  Returns:
    (tuningTensors, holdoutTensors): the first half and the remainder. With an
    odd row count the extra row goes to the holdout half.
  """
  def shapes(tensors):
    return tuple(tmp.shape for tmp in tensors)

  print(shapes(testTensors))
  midpoint = testTensors[0].shape[0] // 2
  firstHalf = tuple(tmp[:midpoint] for tmp in testTensors)
  secondHalf = tuple(tmp[midpoint:] for tmp in testTensors)
  for part in (firstHalf, secondHalf):
    print(shapes(part))
  return firstHalf, secondHalf

# First half of the test tensors is used for tuning, the second half is held out
tuningTensors, holdoutTensors = split_test(testTensors)

In [None]:
# Prune to only rows that represent real note/tweet pairs
def prune_tensors(tensors, labels):
  """Keep only the rows whose 'classification' label is present.

  Args:
    tensors: tuple (inputIds, attentionMask, labels, lossMask, noteIds, tweetIds).
    labels: ordered list of label names matching the columns of lossMask; it must
      contain "classification" and "crh". This argument used to be shadowed by the
      unpacked label tensor, with a module-level list read instead.

  Returns:
    The same six-tensor tuple restricted to the retained rows.
  """
  classificationIdx = labels.index("classification")
  crhIdx = labels.index("crh")
  # Identify rows that represent real pairs based on 'classification' bit
  inputIds, attentionMask, labelValues, lossMask, noteIds, tweetIds = tensors
  realRows = (lossMask[:, classificationIdx] > 0)  # all real rows have the classification bit set
  # Prints [number of dropped rows, number of kept rows]
  print(np.bincount(realRows.cpu().detach().numpy()))
  assert lossMask[~realRows, crhIdx].sum().item() == 0  # non-real rows should never have loss for the CRH label
  # Prune to only real data
  return (
    inputIds[realRows, :],
    attentionMask[realRows, :],
    labelValues[realRows, :],
    lossMask[realRows, :],
    noteIds[realRows],
    tweetIds[realRows],
  )

# Drop synthetic rows from every split before applying the model
trainTensorsPruned = prune_tensors(trainTensors, allSignalLabels)
tuningTensorsPruned = prune_tensors(tuningTensors, allSignalLabels)  # Note that the ratio here isn't exactly 5x because the order was randomized after creating low relevance instances
holdoutTensorsPruned = prune_tensors(holdoutTensors, allSignalLabels)  # Note that the ratio here isn't exactly 5x because the order was randomized after creating low relevance instances

In [None]:
# Load all signal model
def load_full_signal_checkpoint(path):
  """Load a training checkpoint, print its recorded stats and return the model.

  Args:
    path: location of a checkpoint holding "epoch", "batch", "loss", "stats"
      and "model" entries.

  Returns:
    A MultiHeadMLPAllSignals in eval mode, moved to the CUDA device.
  """
  print(f"Loading checkpoint from {path}")
  # NOTE: weights_only=False unpickles arbitrary objects; only load trusted files
  state = torch.load(path, weights_only=False)
  # Summarize where in training the checkpoint was taken
  print(f"  epoch={state['epoch']:<3d}")
  print(f"  batch={state['batch']:<3d}")
  print(f"  loss={state['loss']:7.5f}")
  # Regression rows carry MSE values; classification rows carry AUC/TPR values
  for label, trainMSE, testMSE, trainAuc, testAuc, tprAt1, tprAt5 in state["stats"]:
    if trainMSE is not None:
      print(f"{label:<40}  trainMSE={trainMSE:7.5f}  testMSE={testMSE:7.5f}")
      continue
    print(f"{label:<40}  trainAuc={trainAuc:5.3f}  testAuc={testAuc:5.3f}  tpr@0.01={tprAt1:5.3f}  tpr@0.05={tprAt5:5.3f}")
  # Rebuild the architecture and restore the saved weights
  restored = MultiHeadMLPAllSignals()
  restored.load_state_dict(state["model"])
  restored.eval()
  return restored.to(CUDA)

# Restore the all-signal model from a specific earlier training run (hard-coded path)
allSignalModel = load_full_signal_checkpoint(os.path.expanduser("~/workspace/datasets/helpfulness/1753341209/1753352848.pt"))

In [None]:
# Define helper for applying model
def apply_model(model, dataset, device, gpuBatchSize):
  """Run the model in inference mode over every row of a dataset.

  This redefinition replaces the earlier apply_model and has no subsampling option.

  Args:
    model: module taking (inputIds, attentionMask); it must already be in eval mode.
    dataset: tuple (inputIds, attentionMask, labels, lossMask, noteIds, tweetIds).
    device: device string used for tensor placement and autocast.
    gpuBatchSize: rows per device per forward pass.

  Returns:
    Tuple (labels, preds, lossMask, noteIds, tweetIds) with preds on CPU.
  """
  # Scale the batch by the GPU count only when several GPUs are visible
  multiGpu = device == CUDA and torch.cuda.device_count() > 1
  batchSize = gpuBatchSize * torch.cuda.device_count() if multiGpu else gpuBatchSize
  # Unpack and check that every tensor has the same number of rows
  inputIds, attentionMask, labels, lossMask, noteIds, tweetIds = dataset
  numRows = inputIds.shape[0]
  assert numRows == attentionMask.shape[0] == labels.shape[0] == lossMask.shape[0] == noteIds.shape[0] == tweetIds.shape[0]
  assert not model.training
  # Forward pass chunk by chunk, collecting outputs on the CPU
  outputs = []
  progress = tqdm(range(int(np.ceil(numRows / batchSize))))
  for start in range(0, numRows, batchSize):
    stop = start + batchSize
    with torch.no_grad(), autocast(device_type=device, dtype=torch.bfloat16):
      batchPreds = model(
        inputIds[start:stop].to(device),
        attentionMask[start:stop].to(device),
      ).to(CPU).detach()
      outputs.append(batchPreds)
    progress.update(1)
  preds = torch.concat(outputs, axis=0)
  return labels, preds, lossMask, noteIds, tweetIds

In [None]:
# Extract all-signal model predictions for the train, tuning and holdout splits.
# Each result is (labels, preds, lossMask, noteIds, tweetIds).
trainTensorsSignals = apply_model(allSignalModel, trainTensorsPruned, CUDA, 32)
tuningTensorsSignals = apply_model(allSignalModel, tuningTensorsPruned, CUDA, 32)
holdoutTensorsSignals = apply_model(allSignalModel, holdoutTensorsPruned, CUDA, 32)

## Profile Signals

In [None]:
# Display the ordered list of signal labels for reference
allSignalLabels

In [None]:
# Define helper to convert all-signal model results to dataframe
def make_dataframe(allSignalTensors, colNames, dataset, clusters):
  """Convert all-signal model outputs into one DataFrame row per note/tweet pair.

  For every name in colNames three columns are produced:
    "<name>_label":    the label, set to NA where lossMask is 0.
    "<name>_pred":     the raw model output.
    "<name>_crh_pred": the output reoriented so that larger is meant to indicate
                       a more likely CRH note: "factor" becomes -|pred|, while
                       "crnh" and every "notHelpful*" column are negated.

  Args:
    allSignalTensors: tuple (labels, preds, lossMask, noteIds, tweetIds).
    colNames: names for the columns of labels/preds/lossMask, in order.
    dataset: DataFrame supplying the current status for each note/tweet pair.
      This argument used to be ignored in favour of a module-level frame.
    clusters: DataFrame merged in on its columns shared with the result.

  Returns:
    DataFrame with exactly one row per input row (asserted).
  """
  labels, preds, lossMask, noteIds, tweetIds = allSignalTensors
  # Set note and tweet id
  df = pd.DataFrame({
    NOTE_ID: noteIds,
    TWEET_ID: tweetIds,
  })
  # Set predictions and labels
  assert len(colNames) == preds.shape[1] == labels.shape[1] == lossMask.shape[1]
  for i, col in enumerate(colNames):
    df[f"{col}_label"] = labels[:, i].cpu().detach().to(torch.float32).numpy()
    df.loc[(lossMask[:, i] == 0).numpy(), f"{col}_label"] = pd.NA
    # Upcast first: numpy has no bfloat16 representation
    colPreds = preds[:, i].cpu().detach().to(torch.float32).numpy()
    df[f"{col}_pred"] = colPreds
    # Copy so the two output columns never share a buffer
    crhPreds = colPreds.copy()
    if col == "factor":
      crhPreds = np.abs(crhPreds)
    if col.startswith("notHelpful") or col == "factor" or col == "crnh":
      crhPreds = crhPreds * -1
    df[f"{col}_crh_pred"] = crhPreds
  # Merge in status from the dataset supplied by the caller
  df = df.merge(dataset[[NOTE_ID, TWEET_ID, CURRENT_LABEL]])
  # Merge in clusters
  df = df.merge(clusters)
  # Make sure no rows were dropped and return
  assert len(df) == labels.shape[0]
  return df

# Build one signal DataFrame per split
trainSignalDF = make_dataframe(trainTensorsSignals, allSignalLabels, enDataset, clusters)
tuningSignalDF = make_dataframe(tuningTensorsSignals, allSignalLabels, enDataset, clusters)
holdoutSignalDF = make_dataframe(holdoutTensorsSignals, allSignalLabels, enDataset, clusters)

In [None]:
# Define helper for calculating KL divergence
def kl_divergence(pSeries, qSeries):
  """Return KL(P || Q) between the empirical category distributions of two series.

  Categories missing from one series are smoothed by giving them a count of 1
  before normalizing, so both distributions cover the same support, sum to 1 and
  the result is non-negative. The earlier version inserted a probability of 1.0
  for missing categories after normalizing, which produced unnormalized
  distributions and could return negative values.

  Args:
    pSeries: pandas Series of category values defining P.
    qSeries: pandas Series of category values defining Q.
  """
  uniqueVals = np.unique(pd.concat([pSeries, qSeries]))

  def smoothed_probs(series):
    # Smooth on raw counts, then normalize
    counts = series.value_counts().reindex(uniqueVals, fill_value=1).astype(float)
    return (counts / counts.sum()).values

  probP = smoothed_probs(pSeries)
  probQ = smoothed_probs(qSeries)
  return np.sum(probP * np.log(probP / probQ))

In [None]:
# Profile signals
def profile_signals(df, labelCols):
  """Summarize how well each signal ranks CRH notes and how it shifts cluster mix.

  For every label the result holds:
    "CRH AUC":   AUC of the "<label>_crh_pred" column against the CRH label.
    "Label AUC": AUC of the raw prediction against its own label (NA for the
                 intercept, factor and relevance signals).
    "KL (...)":  KL divergence between the cluster distribution of currently
                 rated helpful rows and that of an equally sized set of
                 top-ranked rows, for note, tweet and joint clusters.

  Args:
    df: DataFrame with "<label>_label", "<label>_pred" and "<label>_crh_pred"
      columns plus the status and cluster id columns.
    labelCols: signal names to profile.
  """
  clusterKinds = ["notes", "tweets", "joint"]
  rows = []
  for label in labelCols:
    # Isolate predictions and labels
    colPreds = df[f"{label}_pred"]
    colLabels = df[f"{label}_label"]
    colMask = colLabels.notna()
    crhPreds = df[f"{label}_crh_pred"]
    crhLabels = df["crh_label"]
    crhMask = crhLabels.notna()
    # Calculate AUCs on the rows that actually carry labels
    assert crhLabels[crhMask].isna().sum() == 0
    assert colLabels[colMask].isna().sum() == 0
    crhAuc = skm.roc_auc_score(crhLabels[crhMask] > .5, crhPreds[crhMask])
    if label not in ["intercept", "factor", "relevance"]:
      colAuc = skm.roc_auc_score(colLabels[colMask] > .5, colPreds[colMask])
    else:
      colAuc = pd.NA
    # Calculate KL divergences per cluster kind
    klds = {}
    for kind in clusterKinds:
      idCol = f"{kind}_cluster_id"
      crhClusters = df[df[CURRENT_LABEL] == CURRENTLY_RATED_HELPFUL][idCol]
      topRanked = df.sort_values(f"{label}_crh_pred", ascending=False).head(len(crhClusters))
      klds[kind] = kl_divergence(crhClusters, topRanked[idCol])
    rows.append((label, crhAuc, colAuc, klds["notes"], klds["tweets"], klds["joint"]))
  return pd.DataFrame(rows, columns=["Signal", "CRH AUC", "Label AUC", "KL (Notes)", "KL (Tweets)", "KL (Joint)"])

In [None]:
# Show AUC and KL summary for tuning data, ordered by note-cluster KL divergence
profile_signals(tuningSignalDF, allSignalLabels).sort_values("KL (Notes)")

In [None]:
# Validate that holdout results align with tuning data (same summary, same ordering)
profile_signals(holdoutSignalDF, allSignalLabels).sort_values("KL (Notes)")

In [None]:
# Define plotting helpers
def plot_crh_rates_single_with_pred(df, topWords, col, clusterCol, dataset=enDataset):
  """Bar-plot per-cluster note distributions for four populations.

  The populations are: all proposed notes, notes whose author classification is
  misleading, currently rated helpful (CRH) notes, and notes whose score in `col`
  is above the threshold that selects about as many notes as there are CRH notes.

  Args:
    df: signal DataFrame; it is not modified (the earlier version added an
      "above" column to the caller's frame).
    topWords: mapping from str(cluster id) to the keywords shown on the x axis.
    col: score column used to rank notes.
    clusterCol: cluster id column to group by.
    dataset: DataFrame providing the author classification per note.
  """
  # Prepare DF
  init = len(df)
  crhBudget = (df[CURRENT_LABEL] == CURRENTLY_RATED_HELPFUL).sum()
  crhThreshold = df[col].sort_values(ascending=False).values[crhBudget]
  # assign() returns a new frame, leaving the caller's DataFrame untouched
  df = df.assign(above=df[col] > crhThreshold)
  df = df.merge(dataset[[NOTE_ID, AUTHOR_CLASSIFICATION]])
  df = df[[clusterCol, CURRENT_LABEL, AUTHOR_CLASSIFICATION, "above"]]
  assert len(df) == init

  def cluster_ratios(frame, name):
    # Share of rows per cluster, with the share column named `name`
    return frame[clusterCol].value_counts(normalize=True).to_frame().reset_index(drop=False).rename(columns={"proportion": name})

  # Compute cluster ratios
  proposedRatios = cluster_ratios(df, "proposed")
  misleadingRatios = cluster_ratios(df[df[AUTHOR_CLASSIFICATION] == MISINFORMED_OR_POTENTIALLY_MISLEADING], "misleading")
  crhRatios = cluster_ratios(df[df[CURRENT_LABEL] == CURRENTLY_RATED_HELPFUL], "crh")
  aboveRatios = cluster_ratios(df[df["above"]], "above")
  merged = proposedRatios.merge(misleadingRatios, how="outer").merge(crhRatios, how="outer").merge(aboveRatios, how="outer").fillna(0)
  merged = merged.sort_values("crh", ascending=False)
  fig, ax = plt.subplots(1, 1)
  fig.set_figwidth(40)
  fig.set_figheight(6)
  # Replace cluster ids with their keywords for display
  merged[clusterCol] = [topWords[str(clusterId)] for clusterId in merged[clusterCol]]
  merged = merged[[clusterCol, "proposed", "misleading", "crh", "above"]].rename(columns={
    "above": "Predicted CRH Note Distribution",
    "crh": "CRH Note Distribution",
    "misleading": "Misleading Note Distribution",
    "proposed": "Proposed Note Distribution"
  })
  merged.plot(x=clusterCol, kind="bar", ax=ax, rot=0)
  ax.set_xlabel("Keywords")
  ax.set_ylabel("Rate")
  ax.grid(axis="y", zorder=-1)
  ax.set_axisbelow(True)
  ax.legend(loc='upper right', ncol=4, fontsize=20)

def plot_crh_rates_with_pred(df, col, topWords=topWords):
  """Draw the cluster-distribution plot for note, tweet and joint clusters in turn.

  Args:
    df: signal DataFrame passed through to the single-plot helper.
    col: score column used to rank notes.
    topWords: mapping with "notes", "tweets" and "joint" keyword tables.
  """
  for kind in ("notes", "tweets", "joint"):
    plot_crh_rates_single_with_pred(df, topWords[kind], col, f"{kind}_cluster_id")

In [None]:
# Holdout cluster distributions when ranking by the CRH signal
plot_crh_rates_with_pred(holdoutSignalDF, "crh_crh_pred")

In [None]:
# Holdout cluster distributions when ranking by the helpfulAddressesClaim signal
plot_crh_rates_with_pred(holdoutSignalDF, "helpfulAddressesClaim_crh_pred")

In [None]:
# Holdout cluster distributions when ranking by the helpfulClear signal
plot_crh_rates_with_pred(holdoutSignalDF, "helpfulClear_crh_pred")

In [None]:
# Holdout cluster distributions when ranking by the (negated) notHelpfulOpinionSpeculation signal
plot_crh_rates_with_pred(holdoutSignalDF, "notHelpfulOpinionSpeculation_crh_pred")

In [None]:
# Holdout cluster distributions when ranking by the helpfulImportantContext signal
plot_crh_rates_with_pred(holdoutSignalDF, "helpfulImportantContext_crh_pred")

In [None]:
# Holdout cluster distributions when ranking by the predicted intercept
plot_crh_rates_with_pred(holdoutSignalDF, "intercept_crh_pred")

In [None]:
# Holdout cluster distributions when ranking by the (negated) notHelpfulNoteNotNeeded signal
plot_crh_rates_with_pred(holdoutSignalDF, "notHelpfulNoteNotNeeded_crh_pred")

## Define Training Helpers

In [None]:
# Define a helper to prepare loss weights
def make_loss_weights(lossMask, objectiveWeights, numBatches):
  """Build a per-prediction weight matrix for the multi-objective loss.

  Args:
    lossMask: (rows, objectives) tensor with 1 where a label exists and 0 elsewhere.
    objectiveWeights: 1-D tensor of per-objective weights that sums to 1.
    numBatches: scale factor applied to the final weights.

  Returns:
    Tensor shaped like lossMask. Before scaling by numBatches, column j sums to
    objectiveWeights[j] and the whole matrix sums to 1.
  """
  numObjectives = lossMask.shape[1]
  assert objectiveWeights.shape[0] == numObjectives
  # Each labelled instance of an objective gets an equal share of that objective
  perInstance = torch.pow(lossMask.sum(axis=0), -1)
  assert objectiveWeights.shape[0] == perInstance.shape[0]
  assert np.abs(objectiveWeights.sum().item() - 1) < 1e-5
  # Spread each objective's weight over its labelled predictions
  perPrediction = lossMask * (perInstance * objectiveWeights)
  assert perPrediction.shape == lossMask.shape
  assert np.abs(perPrediction.sum().item() - 1) < 1e-5
  columnGap = (perPrediction.sum(axis=0) - objectiveWeights).abs()
  assert (columnGap < 1e-5).all().item()
  # Apply batch scaling
  return numBatches * perPrediction

# Sanity check on a toy mask: 3 rows, 4 objectives with 3, 2, 1 and 1 labelled rows
make_loss_weights(
  torch.tensor([
    [1, 1, 1, 1],
    [1, 1, 0, 0],
    [1, 0, 0, 0],
  ], dtype=torch.float32),
  torch.tensor([0.2, 0.2, 0.5, .1], dtype=torch.float32),
  10,
)

In [None]:
# Define helper to compute loss
def multihead_loss(logits, lossWeights, labels, posWeight=None):
  """Compute the weighted element-wise binary cross-entropy on raw logits.

  Args:
    logits: raw model outputs.
    lossWeights: per-prediction weights with the same shape as logits.
    labels: targets with the same shape as logits.
    posWeight: optional multiplier applied to predictions whose label equals 1;
      all other predictions keep a multiplier of 1. None disables class weighting.
      A posWeight of 0 now zeroes the positive-class loss; it used to be silently
      replaced by 1.

  Returns:
    Tensor shaped like logits holding the unreduced weighted losses.
  """
  # Validate sizes match
  assert logits.shape == lossWeights.shape
  assert logits.shape == labels.shape
  # Compute loss of each prediction
  weightedLoss = nn.functional.binary_cross_entropy_with_logits(logits, labels, reduction="none") * lossWeights
  if posWeight is None:
    return weightedLoss
  # Start from a weight of 1 everywhere and overwrite only the positive entries
  classWeights = torch.ones_like(weightedLoss)
  classWeights[labels == 1] = posWeight
  return weightedLoss * classWeights

# Sanity checks on a 2x4 toy input with posWeight = None, 1 and 10.
# Arguments are positional: logits, lossWeights (the 0/1 pattern), labels
# (first row all ones, second row all zeros), posWeight.
print(
  multihead_loss(
    torch.arange(-3, 5, dtype=torch.float32).reshape(2, 4),
    torch.tensor([1, 0, 1, 0, 0, 1, 0, 1], dtype=torch.float32).reshape(2, 4),
    torch.concat([torch.ones(1, 4, dtype=torch.float32), torch.zeros(1, 4, dtype=torch.float32)], axis=0),
    None
  )
)
print(
  multihead_loss(
    torch.arange(-3, 5, dtype=torch.float32).reshape(2, 4),
    torch.tensor([1, 0, 1, 0, 0, 1, 0, 1], dtype=torch.float32).reshape(2, 4),
    torch.concat([torch.ones(1, 4, dtype=torch.float32), torch.zeros(1, 4, dtype=torch.float32)], axis=0),
    1
  )
)
print(
  multihead_loss(
    torch.arange(-3, 5, dtype=torch.float32).reshape(2, 4),
    torch.tensor([1, 0, 1, 0, 0, 1, 0, 1], dtype=torch.float32).reshape(2, 4),
    torch.concat([torch.ones(1, 4, dtype=torch.float32), torch.zeros(1, 4, dtype=torch.float32)], axis=0),
    10
  )
)

In [None]:
# Define helper for applying model
def apply_model(model, dataset, device, gpuBatchSize, colLabels, frac=None):
  """Run the meta model over signal inputs, keeping only the CRH label column.

  Args:
    model: module taking a single input tensor; it must already be in eval mode.
    dataset: tuple (labels, inputs, lossMask, noteIds, tweetIds).
    device: device string used for tensor placement and autocast.
    gpuBatchSize: rows per device per forward pass.
    colLabels: names of the label columns; "crh" must be at index 2.
    frac: if given, a fraction in (0, 1] of rows to sample at random without replacement.

  Returns:
    Tuple (labels, preds, lossMask, noteIds, tweetIds) where labels and lossMask
    are reduced to the single CRH column and preds lives on CPU.
  """
  # Configure batching: scale the batch by the number of visible GPUs
  if device == CUDA and torch.cuda.device_count() > 1:
    batchSize = gpuBatchSize * torch.cuda.device_count()
  else:
    batchSize = gpuBatchSize
  # Prepare data
  labels, inputs, lossMask, noteIds, tweetIds = dataset
  crhIdx = colLabels.index("crh")
  assert crhIdx == 2
  # Slice (rather than index) so the column dimension is preserved
  labels = labels[:, crhIdx:(crhIdx+1)]
  lossMask = lossMask[:, crhIdx:(crhIdx+1)]
  assert labels.shape[0] == inputs.shape[0] == lossMask.shape[0] == noteIds.shape[0] == tweetIds.shape[0]
  if frac is not None:
    assert 0 < frac <= 1.
    size = int(frac * inputs.shape[0])
    # One shared permutation keeps every tensor row-aligned after subsampling
    indices = torch.randperm(inputs.shape[0])[:size]
    inputs = inputs[indices]
    labels = labels[indices]
    lossMask = lossMask[indices]
    noteIds = noteIds[indices]
    tweetIds = tweetIds[indices]
  # Process chunks
  assert not model.training
  preds = []
  numRows = inputs.shape[0]
  # The context manager closes the progress bar even if a forward pass fails
  with tqdm(total=int(np.ceil(numRows / batchSize))) as progress:
    for start in range(0, numRows, batchSize):
      end = start + batchSize
      with torch.no_grad():
        with autocast(device_type=device, dtype=torch.bfloat16):
          preds.append(
            model(
              inputs[start:end].to(device),
            ).to(CPU).detach()
          )
      progress.update(1)
  preds = torch.concat(preds, axis=0)
  return labels, preds, lossMask, noteIds, tweetIds

In [None]:
# Define helper for incremental eval
def eval_model(model, trainDataset, testDataset, device, gpuBatchSize, colLabels):
  """Score the meta model on a 1/9 sample of the training data and the full test data.

  Returns:
    (results, testOutputs) where results holds one
    (trainAuc, testAuc, tprAt1, tprAt5) tuple per label column, using -1
    placeholders when AUC cannot be computed because a label column holds fewer
    than two classes, and testOutputs is
    (allTestLabels, allTestPreds, allTestMask, testNoteIds, testTweetIds).
  """
  print("Forward pass on training data:")
  allTrainLabels, allTrainPreds, allTrainMask, _, _ = apply_model(model, trainDataset, device, gpuBatchSize, colLabels, frac=(1/9))
  print("Forward pass on test data:")
  allTestLabels, allTestPreds, allTestMask, testNoteIds, testTweetIds = apply_model(model, testDataset, device, gpuBatchSize, colLabels, frac=None)
  # Predictions produced under bfloat16 autocast cannot be converted with .numpy();
  # upcasting to float32 is lossless and keeps the metric code dtype-agnostic.
  allTrainPreds = allTrainPreds.to(torch.float32)
  allTestPreds = allTestPreds.to(torch.float32)

  def lacks_both_classes(columnLabels):
    # roc_auc_score raises when only one class is present (all negative OR all positive)
    return columnLabels.sum().item() == 0 or columnLabels.unique().numel() < 2

  results = []
  for i in range(allTrainLabels.shape[1]):
    trainRows = allTrainMask[:, i] == 1
    testRows = allTestMask[:, i] == 1
    trainLabels, trainPreds = allTrainLabels[trainRows, i], allTrainPreds[trainRows, i]
    testLabels, testPreds = allTestLabels[testRows, i], allTestPreds[testRows, i]
    if lacks_both_classes(trainLabels) or lacks_both_classes(testLabels):
      results.append((-1, -1, -1, -1))
      continue
    trainAuc = skm.roc_auc_score(trainLabels.numpy(), trainPreds.numpy())
    testAuc = skm.roc_auc_score(testLabels.numpy(), testPreds.numpy())
    fpr, tpr, _ = skm.roc_curve(testLabels.numpy(), testPreds.numpy())
    # Report the true-positive rate at the ROC points closest to 1% and 5% FPR
    tprAt1 = tpr[np.argmin(np.abs(fpr - .01))]
    tprAt5 = tpr[np.argmin(np.abs(fpr - .05))]
    results.append((trainAuc, testAuc, tprAt1, tprAt5))
  return results, (allTestLabels, allTestPreds, allTestMask, testNoteIds, testTweetIds)

In [None]:
# Define helpers to save checkpointed state
def save_checkpoint(root, epoch, batch, loss, model, optimizer, scheduler, scaler, stats):
  """Serialize the training state to "<root>/<unix-seconds>.pt".

  Args:
    root: existing directory that receives the checkpoint file.
    epoch, batch, loss, stats: bookkeeping values stored verbatim.
    model: the network, either plain or wrapped (e.g. nn.DataParallel). The
      underlying module's weights are saved so keys carry no "module." prefix.
    optimizer, scheduler, scaler: objects exposing state_dict(), or None.
  """
  # Wrapped models keep the real network under .module; plain modules are saved
  # directly instead of raising AttributeError.
  coreModel = model.module if hasattr(model, "module") else model
  checkpoint = {
    "epoch": epoch,
    "batch": batch,
    "loss": loss,
    "model": coreModel.state_dict(),
    "optimizer": optimizer.state_dict() if optimizer is not None else None,
    "scheduler": scheduler.state_dict() if scheduler is not None else None,
    "scaler": scaler.state_dict() if scaler is not None else None,
    "stats": stats,
  }
  # The file name has one-second resolution, so two saves into the same root
  # within the same second overwrite each other.
  checkpointId = str(int(time.time()))
  path = os.path.join(root, f"{checkpointId}.pt")
  print(f"Saving checkpoint to {path}")
  torch.save(checkpoint, path)

In [None]:
# Define training loop
def train_model(
  model,
  trainDataset,
  testDataset,
  colLabels,
  numEpochs=3,
  device=CUDA,
  gpuBatchSize=32,
  learningRate=1e-5,
  weightDecay=0.01,
  learningSchedule=True,
  objectiveWeights=None,
  posWeight=None,
):
  """Train `model` on the "crh" label column, then evaluate and checkpoint it.

  Args:
    model: nn.Module that is called with one batch of rows from `inputs`.
    trainDataset: tuple (labels, inputs, lossMask, noteIds, tweetIds); only the
      "crh" column of labels / lossMask is used as the training objective.
    testDataset: forwarded to eval_model together with the full trainDataset.
    colLabels: label names; "crh" must be located at index 2.
    numEpochs: number of passes over the training rows (must be >= 1).
    device: device string used for tensors and as the autocast device type.
    gpuBatchSize: per-GPU batch size; multiplied by the GPU count under DataParallel.
    learningRate, weightDecay: AdamW hyperparameters.
    learningSchedule: when True the learning rate decays linearly to 0 over all steps.
    objectiveWeights: per-objective weights that must sum to 1; uniform when None.
    posWeight: forwarded to multihead_loss.

  Returns:
    The second element returned by eval_model (the test-set outputs).

  Raises:
    ValueError: if numEpochs < 1 or the training set has no rows.  Without this check
      the function would only fail at the very end with a NameError on `epoch`/`batch`.
  """
  labels, inputs, lossMask, _, _ = trainDataset
  if numEpochs < 1 or inputs.shape[0] == 0:
    raise ValueError("train_model requires numEpochs >= 1 and a non-empty training set")
  gc.collect()
  torch.cuda.empty_cache()
  # Set up checkpoint directory
  modelId = str(int(time.time()))
  print(f"Beginning training run for {modelId}")
  modelRoot = os.path.join(DATA_ROOT, modelId)
  os.mkdir(modelRoot)
  # Prepare data and batching
  print("Setting up training")
  model = model.to(device)
  if device == CUDA and torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    batchSize = gpuBatchSize * torch.cuda.device_count()
  else:
    batchSize = gpuBatchSize
  # Restrict the objective to the single "crh" column (kept 2-D via the slice)
  crhIdx = colLabels.index("crh")
  assert crhIdx == 2
  labels = labels[:, crhIdx:(crhIdx+1)]
  lossMask = lossMask[:, crhIdx:(crhIdx+1)]
  numBatches = int(np.ceil(inputs.shape[0] / batchSize))
  progress = tqdm(range(numBatches * numEpochs))
  print(f"Training data contains {inputs.shape[0]} rows to be split into {numBatches} batches")
  # Prepare optimizer, schedule and loss weights.  The optimizer is deliberately not
  # named `optim`, which would shadow the module imported as `torch.optim as optim`.
  optimizer = torch.optim.AdamW(model.parameters(), lr=learningRate, weight_decay=weightDecay)
  if learningSchedule:
    scheduler = LinearLR(
      optimizer,
      start_factor=1.0,  # Start at the initial learning rate
      end_factor=0.0,    # End at 0
      total_iters=(numBatches * numEpochs),  # Total number of training steps
    )
  else:
    scheduler = None
  scaler = GradScaler()
  if objectiveWeights is None:
    objectiveWeights = torch.ones(labels.shape[1]) / labels.shape[1]
  assert np.abs(objectiveWeights.sum().item() - 1) < 1e-5
  lossWeights = make_loss_weights(lossMask, objectiveWeights, numBatches).to(device)
  model.train()
  for epoch in range(numEpochs):
    gc.collect()
    torch.cuda.empty_cache()
    losses = []
    # Reshuffle rows each epoch; inputs, labels and weights share one permutation so
    # they stay aligned.
    randOrder = np.random.permutation(np.arange(0, inputs.shape[0]))
    inputs = inputs[randOrder]
    labels = labels[randOrder]
    lossWeights = lossWeights[randOrder]
    for batch in range(numBatches):
      # Obtain batch (the final slice may be shorter than batchSize)
      start = batch * batchSize
      end = start + batchSize
      y = labels[start:end].to(device)
      with autocast(device_type=device, dtype=torch.bfloat16):
        # Forward pass
        y_hat = model(
          inputs[start:end].to(device),
        )
        # Compute loss
        loss = multihead_loss(y_hat, lossWeights[start:end], y, posWeight=posWeight).sum()
      losses.append(loss.item())
      # Backward pass
      scaler.scale(loss).backward()
      # Update weights
      scaler.step(optimizer)
      scaler.update()
      if scheduler is not None:
        scheduler.step()
      # Zero out gradients
      optimizer.zero_grad()
      # Update progress bar
      progress.update(1)
    print(f"epoch={epoch:<3d}  loss={np.mean(losses):7.5f}")
  # Evaluate once after the final epoch, log the metrics and write a checkpoint
  model.eval()
  results, evalPreds = eval_model(model, trainDataset, testDataset, device, gpuBatchSize, colLabels)
  stats = []
  for (trainAuc, testAuc, tprAt1, tprAt5), label in zip(results, [CRH]):
    print(f"  epoch={epoch:<3d}  loss={np.mean(losses):7.5f}  trainAuc={trainAuc:5.3f}  testAuc={testAuc:5.3f}  tpr@0.01={tprAt1:5.3f}  tpr@0.05={tprAt5:5.3f}  ({label})")
    stats.append((label, trainAuc, testAuc, tprAt1, tprAt5))
  save_checkpoint(modelRoot, epoch, batch, np.mean(losses), model, optimizer, scheduler, scaler, stats)
  return evalPreds

In [None]:
def eval_hparams(
  model,
  trainDataset,
  testDataset,
  colLabels,
  numEpochs=5,
  gpuBatchSize=1024,
  learningRate=1e-5,
  learningSchedule=True,
  weightDecay=0.01,
  objectiveWeights=None,
  posWeight=None
):
  """Call train_model with defaults suited to hyperparameter sweeps.

  Differs from calling train_model directly only in its defaults (numEpochs=5,
  gpuBatchSize=1024); `device` is left at train_model's own default.  Returns whatever
  train_model returns.
  """
  hparams = {
    "numEpochs": numEpochs,
    "gpuBatchSize": gpuBatchSize,
    "learningRate": learningRate,
    "learningSchedule": learningSchedule,
    "weightDecay": weightDecay,
    "objectiveWeights": objectiveWeights,
    "posWeight": posWeight,
  }
  return train_model(model, trainDataset, testDataset, colLabels, **hparams)

## Train Models

In [None]:
# Define model
class MetaModel(nn.Module):
  """Small MLP producing one logit from a masked subset of input columns.

  Selected columns are standardized with a learned shift and scale, squashed with tanh,
  expanded to (number of selected columns)**2 hidden units with ReLU, and projected to
  a single output.
  """

  def __init__(self, colMask):
    """colMask: sequence of booleans, one per input column; True keeps the column."""
    super().__init__()
    numSelected = sum(colMask)
    print("total cols:", numSelected)
    hiddenDim = numSelected**2
    self.colMask = colMask
    # Learned per-column normalization, randomly initialized
    self.scale = nn.Parameter(torch.randn(numSelected))
    self.shift = nn.Parameter(torch.randn(numSelected))
    self.preclassifier = nn.Linear(numSelected, hiddenDim)
    self.classifier = nn.Linear(hiddenDim, 1)

  def forward(self, inputs):
    """Map a (rows, columns) tensor to a (rows, 1) tensor of outputs."""
    selected = inputs[:, self.colMask]
    squashed = torch.tanh((selected - self.shift) / self.scale)
    hidden = torch.relu(self.preclassifier(squashed))
    return self.classifier(hidden)

In [None]:
# Train model on all signals except CRH: the mask drops only the "crh" column, so the
# meta-model never reads the CRH head it is being trained to predict.
tmp = MetaModel([(col not in ["crh"]) for col in allSignalLabels])
# eval_hparams returns the test-set outputs from train_model's final evaluation
tmp = eval_hparams(tmp, trainTensorsSignals, holdoutTensorsSignals, allSignalLabels, learningRate=1e-3, learningSchedule=True, numEpochs=50)
tmp = make_dataframe(tmp, ["crh"], dataset, clusters)
# Summarize and plot the resulting "crh_pred" column
print(profile_signals(tmp, ["crh"]))
plot_crh_rates_with_pred(tmp, "crh_pred")

In [None]:
# Train model on four tag signals only: helpfulAddressesClaim, helpfulClear,
# notHelpfulOpinionSpeculation and helpfulImportantContext
tmp = MetaModel([(col in ["helpfulAddressesClaim", "helpfulClear", "notHelpfulOpinionSpeculation", "helpfulImportantContext"]) for col in allSignalLabels])
tmp = eval_hparams(tmp, trainTensorsSignals, holdoutTensorsSignals, allSignalLabels, learningRate=1e-3, learningSchedule=True, numEpochs=50)
tmp = make_dataframe(tmp, ["crh"], dataset, clusters)
print(profile_signals(tmp, ["crh"]))
plot_crh_rates_with_pred(tmp, "crh_pred")

In [None]:
# Train model on three tag signals only: helpfulAddressesClaim, helpfulClear and
# notHelpfulOpinionSpeculation
tmp = MetaModel([(col in ["helpfulAddressesClaim", "helpfulClear", "notHelpfulOpinionSpeculation"]) for col in allSignalLabels])
tmp = eval_hparams(tmp, trainTensorsSignals, holdoutTensorsSignals, allSignalLabels, learningRate=1e-3, learningSchedule=True, numEpochs=50)
tmp = make_dataframe(tmp, ["crh"], dataset, clusters)
print(profile_signals(tmp, ["crh"]))
plot_crh_rates_with_pred(tmp, "crh_pred")

In [None]:
# Train model on two tag signals only: helpfulAddressesClaim and
# notHelpfulOpinionSpeculation
tmp = MetaModel([(col in ["helpfulAddressesClaim", "notHelpfulOpinionSpeculation"]) for col in allSignalLabels])
tmp = eval_hparams(tmp, trainTensorsSignals, holdoutTensorsSignals, allSignalLabels, learningRate=1e-3, learningSchedule=True, numEpochs=50)
tmp = make_dataframe(tmp, ["crh"], dataset, clusters)
print(profile_signals(tmp, ["crh"]))
plot_crh_rates_with_pred(tmp, "crh_pred")

# Summarize Results

In [None]:
# Validate that holdout results align with tuning data
# Profile every signal on the holdout dataframe, ordered by the "KL (Notes)" column
tmp = profile_signals(holdoutSignalDF, allSignalLabels).sort_values("KL (Notes)")
# Display only the row for the "crh" signal
tmp[tmp["Signal"] == "crh"]

In [None]:
# Plot holdoutSignalDF using its "crh_pred" column (presumably the direct CRH head
# predictions -- confirm where holdoutSignalDF is built)
plot_crh_rates_with_pred(holdoutSignalDF, "crh_pred")

In [None]:
# Train the meta-model on two tag signals only (helpfulAddressesClaim and
# notHelpfulOpinionSpeculation) and keep its holdout predictions for the summary below
tmp = MetaModel([(col in ["helpfulAddressesClaim", "notHelpfulOpinionSpeculation"]) for col in allSignalLabels])
tmp = eval_hparams(tmp, trainTensorsSignals, holdoutTensorsSignals, allSignalLabels, learningRate=1e-3, learningSchedule=True, numEpochs=50)
holdoutResultDf = make_dataframe(tmp, ["crh"], dataset, clusters)

In [None]:
# View KL divergence for the two-signal meta-model's "crh" predictions
profile_signals(holdoutResultDf, ["crh"])

In [None]:
# View plots for the two-signal meta-model's "crh_pred" column
plot_crh_rates_with_pred(holdoutResultDf, "crh_pred")

In [None]:
# Generate review samples
# Join the meta-model predictions with note/tweet text; merge() is called without
# `on`/`how`, so pandas joins on the shared columns with an inner join.
tmp = holdoutResultDf[[NOTE_ID, TWEET_ID, "crh_label", CURRENT_LABEL, "crh_pred"]].merge(enDataset[[NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL, INTERCEPT]])
# holdoutResultDf comes from the model trained on helpfulAddressesClaim and
# notHelpfulOpinionSpeculation, hence the column name
tmp = tmp.rename(columns={"crh_pred": "helpful_claim_and_opinion"})
# Attach holdoutSignalDF's "crh_pred" as "helpful_direct" for side-by-side comparison
reviewSample = tmp.merge(holdoutSignalDF[[NOTE_ID, TWEET_ID, "crh_pred"]].rename(columns={"crh_pred": "helpful_direct"}))
# NOTE(review): the literal "intercept" is used here while the merge above selects the
# INTERCEPT constant -- confirm both name the same column
reviewSample = reviewSample[[NOTE_ID, TWEET_ID, "crh_label", "helpful_claim_and_opinion", "helpful_direct", NOTE_TEXT_FINAL, TWEET_TEXT_FINAL, "intercept", CURRENT_LABEL]]
reviewSample

In [None]:
# Define helper to confirm key stats
def show_key_stats(scores, labels):
  """Return (AUC, TPR near 1% FPR, TPR near 5% FPR) for scores against labels.

  Rows whose label is NA are ignored.  Each TPR is read at the ROC curve point whose
  FPR is closest to the target rate.
  """
  # Keep only rows that carry a label
  labeled = labels.notna()
  keptScores, keptLabels = scores[labeled], labels[labeled]
  # ROC summary
  auc = skm.roc_auc_score(keptLabels, keptScores)
  fpr, tpr, _ = skm.roc_curve(keptLabels, keptScores)
  tprNear = [tpr[np.argmin(np.abs(fpr - target))] for target in (0.01, 0.05)]
  return (auc, tprNear[0], tprNear[1])

In [None]:
# Recap direct stats: (AUC, TPR near 1% FPR, TPR near 5% FPR) for the "helpful_direct" score
show_key_stats(reviewSample["helpful_direct"], reviewSample["crh_label"])

In [None]:
# Recap claim and opinion stats: (AUC, TPR near 1% FPR, TPR near 5% FPR) for the
# "helpful_claim_and_opinion" score
show_key_stats(reviewSample["helpful_claim_and_opinion"], reviewSample["crh_label"])

In [None]:
# Save review sample to disk as parquet under DATA_ROOT
tmp = os.path.join(DATA_ROOT, "review_sample.parquet")
# Echo the destination path
print(tmp)
reviewSample.to_parquet(tmp)

In [None]:
# Build a review sample for the notHelpfulNoteNotNeeded signal: its label and prediction
# columns from the holdout dataframe, joined with the note and tweet text
nnnReviewSample = holdoutSignalDF[[NOTE_ID, TWEET_ID, "notHelpfulNoteNotNeeded_label", "notHelpfulNoteNotNeeded_pred"]].merge(
  enDataset[[NOTE_ID, TWEET_ID, NOTE_TEXT_FINAL, TWEET_TEXT_FINAL]])
# Save it as parquet under DATA_ROOT, echoing the destination path
tmp = os.path.join(DATA_ROOT, "nnn_review_sample.parquet")
print(tmp)
nnnReviewSample.to_parquet(tmp)

# Export Model

## Load and Combine Models

In [None]:
# Load multi-head model
# NOTE(review): weights_only=False unpickles arbitrary objects; only load checkpoints
# from a trusted location.  The path is specific to one machine and training run.
tmp = torch.load(os.path.expanduser("~/workspace/datasets/helpfulness/1753341209/1753352848.pt"), weights_only=False)
multiHeadModel = MultiHeadMLPAllSignals()
# The checkpoint stores the model's state dict under the "model" key
multiHeadModel.load_state_dict(tmp["model"])
# Display the module structure
multiHeadModel

In [None]:
# Load CRH model
# NOTE(review): weights_only=False unpickles arbitrary objects; only load checkpoints
# from a trusted location.  The path is specific to one machine and training run.
tmp = torch.load(os.path.expanduser("~/workspace/datasets/helpfulness/1753377611/1753377685.pt"), weights_only=False)
# The column mask must select the same number of columns as the checkpointed model,
# because MetaModel sizes its parameters from the mask
crhModel = MetaModel([(col in ["helpfulAddressesClaim", "notHelpfulOpinionSpeculation"]) for col in allSignalLabels])
crhModel.load_state_dict(tmp["model"])
# Display the module structure
crhModel

In [None]:
# Define composite model
class CompositeModel(nn.Module):
  """Single module chaining the multi-head model and the CRH meta-model.

  The encoder embedding feeds every prediction head; the concatenated head outputs are
  then masked and passed through the meta-model layers.  The output is the head outputs
  with the meta-model output appended as the last column.
  """

  def __init__(self, multiHeadModel, crhModel):
    """Reuse (not copy) the submodules and parameters of the two given models."""
    super().__init__()
    # Components taken from the multi-head model
    self.roberta = multiHeadModel.roberta
    self.interceptPredictor = multiHeadModel.interceptPredictor
    self.factorPredictor = multiHeadModel.factorPredictor
    self.crhClassifier = multiHeadModel.crhClassifier
    self.crnhClassifier = multiHeadModel.crnhClassifier
    self.relevanceClassifier = multiHeadModel.relevanceClassifier
    self.classificationClassifier = multiHeadModel.classificationClassifier
    self.helpfulTagClassifier = multiHeadModel.helpfulTagClassifier
    self.notHelpfulTagClassifier = multiHeadModel.notHelpfulTagClassifier
    # Components taken from the CRH meta-model
    self.colMask = crhModel.colMask
    self.scale = crhModel.scale
    self.shift = crhModel.shift
    self.preclassifier = crhModel.preclassifier
    self.classifier = crhModel.classifier

  def forward(self, inputIds, attentionMask):
    """Return the head outputs with the meta-model output appended as the last column."""
    # Encode and keep the hidden state of the first token of each sequence
    encoded = self.roberta(
      input_ids=inputIds,
      attention_mask=attentionMask,
    )
    embedding = encoded.last_hidden_state[:, 0]  # batch, token, dimension
    # Head outputs are concatenated in this fixed order; colMask indexes into it
    intercept = self.interceptPredictor(embedding)
    factor = self.factorPredictor(embedding)
    crh = self.crhClassifier(embedding)
    crnh = self.crnhClassifier(embedding)
    classification = self.classificationClassifier(embedding)
    relevance = self.relevanceClassifier(embedding)
    notHelpfulTags = self.notHelpfulTagClassifier(embedding)
    helpfulTags = self.helpfulTagClassifier(embedding)
    heads = torch.cat(
      [intercept, factor, crh, crnh, classification, relevance, notHelpfulTags, helpfulTags],
      dim=1,
    )
    # Meta-model: normalize the selected columns, then tanh -> linear -> ReLU -> linear
    squashed = torch.tanh((heads[:, self.colMask] - self.shift) / self.scale)
    hidden = torch.relu(self.preclassifier(squashed))
    return torch.cat([heads, self.classifier(hidden)], dim=1)

# Combine both loaded models, move the result to the CUDA device and switch it to
# evaluation mode
compositeModel = CompositeModel(multiHeadModel, crhModel).to(CUDA)
compositeModel.eval()

## Validate Combined Model

In [None]:
# Define helper to apply composite model
def apply_composite_model(model, dataset, device, gpuBatchSize, colLabels):
  """Score every row of `dataset` with the composite model and return a dataframe.

  Args:
    model: module in eval mode, called as model(inputIds, attentionMask).
    dataset: tuple (inputIds, attentionMask, labels, lossMask, noteIds, tweetIds) whose
      members all have the same number of rows.
    device: device string used for the inputs and as the autocast device type.
    gpuBatchSize: per-GPU batch size; multiplied by the GPU count when several CUDA
      devices are visible.
    colLabels: names of the model's output columns, excluding the final one.

  Returns:
    DataFrame with the note id, tweet id, one column per entry of colLabels and a
    trailing "claimsAndOpinion" column.
  """
  # Configure batching
  multiGpu = device == CUDA and torch.cuda.device_count() > 1
  batchSize = gpuBatchSize * torch.cuda.device_count() if multiGpu else gpuBatchSize
  # Prepare data
  inputIds, attentionMask, labels, lossMask, noteIds, tweetIds = dataset
  numRows = inputIds.shape[0]
  assert all(t.shape[0] == numRows for t in (attentionMask, labels, lossMask, noteIds, tweetIds))
  assert not model.training
  # Process chunks
  progress = tqdm(range(int(np.ceil(numRows / batchSize))))
  chunks = []
  for start in range(0, numRows, batchSize):
    stop = start + batchSize
    with torch.no_grad(), autocast(device_type=device, dtype=torch.bfloat16):
      scores = model(
        inputIds[start:stop].to(device),
        attentionMask[start:stop].to(device),
      )
      chunks.append(scores.to(CPU).detach())
    progress.update(1)
  preds = torch.concat(chunks, axis=0)
  # Assemble ids and predictions; predictions are converted to float32 for numpy
  df = pd.DataFrame({
    NOTE_ID: noteIds.cpu().numpy(),
    TWEET_ID: tweetIds.cpu().numpy(),
  })
  outputColumns = colLabels + ["claimsAndOpinion"]
  df[outputColumns] = preds.to(torch.float32).cpu().numpy()
  return df

In [None]:
# Score the test tensors with the composite model on the CUDA device (per-GPU batch
# size 32) to get a dataframe of its outputs
validation = apply_composite_model(compositeModel, testTensors, CUDA, 32, allSignalLabels)

In [None]:
# Confirm that validation passes: join the reference predictions (meta-model output and
# selected direct heads, renamed with a "_ref" suffix) with the composite model's
# outputs for the same note/tweet pairs so the values can be compared side by side
holdoutResultDf[[NOTE_ID, TWEET_ID, "crh_pred"]].rename(columns={"crh_pred": "claims_and_opinion_ref"}).merge(
  holdoutSignalDF[[NOTE_ID, TWEET_ID, "relevance_pred", "notHelpfulSpamHarassmentOrAbuse_pred", "crh_pred"]].rename(columns={
    "relevance_pred": "relevance_ref",
    "notHelpfulSpamHarassmentOrAbuse_pred": "notHelpfulSpamHarassmentOrAbuse_ref",
    "crh_pred": "crh_ref"})).merge(
  validation[[NOTE_ID, TWEET_ID, "claimsAndOpinion", "relevance", "notHelpfulSpamHarassmentOrAbuse", "crh"]]
)

## Generate Trace

In [None]:
%%time
# Apply model to see sample results: score the first three test rows on the CUDA device
# with the model in evaluation mode
compositeModel.eval()
compositeModel.to(CUDA)(testTensors[0][:3].to(CUDA), testTensors[1][:3].to(CUDA))

In [None]:
# Define helper for tracing the model with TorchScript
def get_jit_trace(model, tensors):
  """Return a torch.jit trace of `model`, recorded on CPU from one example row.

  `tensors` is the tuple (inputIds, attentionMask, labels, lossMask, noteIds, tweetIds);
  only the first row of inputIds and attentionMask is used as the example input.  The
  model is moved to CPU and must already be in eval mode.
  """
  # Trace on CPU
  cpuModel = model.to(CPU)
  inputIds, attentionMask, labels, lossMask, noteIds, tweetIds = tensors
  numRows = inputIds.shape[0]
  assert all(t.shape[0] == numRows for t in (attentionMask, labels, lossMask, noteIds, tweetIds))
  assert not cpuModel.training
  exampleInputs = (inputIds[:1], attentionMask[:1])
  with torch.no_grad():
    return torch.jit.trace(cpuModel, exampleInputs)

# Trace the composite model and save the result as composite_model.jit under DATA_ROOT
torch.jit.save(get_jit_trace(compositeModel, testTensors), os.path.join(DATA_ROOT, "composite_model.jit"))

In [None]:
%%time
# Validate model loaded from disk
def validate_jit_model(path, testTensors):
  """Load the traced model at `path` and score the first three test rows on CPU.

  Only testTensors[0] and testTensors[1] are read.  Asserts that the loaded model is
  not in training mode and returns the model output.
  """
  loaded = torch.jit.load(path)
  assert not loaded.training
  sampleIds = testTensors[0][:3].to(CPU)
  sampleMask = testTensors[1][:3].to(CPU)
  return loaded(sampleIds, sampleMask)

# Reload the saved trace from DATA_ROOT and score the first three test rows
validate_jit_model(os.path.join(DATA_ROOT, "composite_model.jit"), testTensors)

## Pack Tarball

In [None]:
# Define helper to create a tarball from a list of pairs
def pack_tarball(data: list[tuple[str, bytes]]) -> bytes:
  """Build an uncompressed tar archive in memory and return its bytes.

  Each (name, content) pair becomes one archive member, in the given order.  Every
  member name is printed as it is added.  Content must already be bytes; anything else
  trips the assertion.
  """
  buffer = io.BytesIO()
  with tarfile.open(fileobj=buffer, mode='w') as archive:
    for memberName, payload in data:
      print(memberName)
      # Only raw bytes are accepted; callers encode text themselves
      assert isinstance(payload, bytes)
      # The member header carries only a name and a size
      member = tarfile.TarInfo(name=memberName)
      member.size = len(payload)
      archive.addfile(member, io.BytesIO(payload))
  # getvalue() returns the whole buffer regardless of the stream position
  return buffer.getvalue()

In [None]:
# Define helper to prepare test data
def prepare_test_dataset(dataset, tensors, model, colLabels, size=50):
  """Build a small dataframe of raw inputs and model outputs for downstream testing.

  Args:
    dataset: DataFrame holding the note id, tweet id, raw note/tweet text and URL columns.
    tensors: tuple (inputIds, attentionMask, labels, lossMask, noteIds, tweetIds).
    model: callable as model(inputIds, attentionMask); it is invoked with CPU tensors,
      so it must already live on the CPU.
    colLabels: names of the model's output columns, excluding the final one.
    size: number of leading rows of `tensors` to score.

  Returns:
    DataFrame with the raw input columns joined to the relevance,
    notHelpfulSpamHarassmentOrAbuse, CRH and claimsAndOpinion outputs.  Asserts that at
    least one row survives the join.
  """
  # Select test samples
  inputIds, attentionMask, _, _, noteIds, tweetIds = tensors
  # Inference only: skip building an autograd graph that would be discarded right away
  with torch.no_grad():
    preds = model(inputIds[:size].to(CPU), attentionMask[:size].to(CPU))
  result = pd.DataFrame({
    NOTE_ID: noteIds[:size].numpy(),
    TWEET_ID: tweetIds[:size].numpy(),
  })
  result[colLabels + ["claimsAndOpinion"]] = preds.detach().to(CPU).numpy()
  result = result[[NOTE_ID, TWEET_ID, RELEVANCE, "notHelpfulSpamHarassmentOrAbuse", CRH, "claimsAndOpinion"]]
  # Merge with raw inputs.  Note that any synthetic note/tweet pairs will be dropped
  # because they don't occur in the dataset
  result = dataset[[NOTE_ID, TWEET_ID, NOTE_TEXT, TWEET_TEXT, TWEET_SHORTEN_URLS, TWEET_EXPANDED_URLS]].merge(result)
  assert len(result) > 0
  return result

# Preview the test dataset; the composite model is moved to the CPU first because
# prepare_test_dataset feeds it CPU tensors
prepare_test_dataset(dataset, testTensors, compositeModel.to(CPU), allSignalLabels)

In [None]:
# Return a tarball containing all modeling resources
def create_tarball(testData, colLabels):
  """Return the bytes of a tarball bundling everything needed to serve the model.

  Archive layout:
    tokenizer/<file>   every non-hidden file from the tokenizer directory
    model/model.jit    the traced model read from DATA_ROOT/composite_model.jit
    model/labels.txt   one output label per line: colLabels followed by "claimsAndOpinion"
    test_data.parquet  `testData` serialized with DataFrame.to_parquet

  Args:
    testData: DataFrame stored as test_data.parquet.
    colLabels: names of the model's output columns, excluding the final one.
  """
  # List of {path, resource} pairs
  pairs = []
  # Add tokenizer resources.  os.listdir returns entries in arbitrary order, so sort
  # them to keep the member order (and therefore the archive bytes) reproducible.
  tokenizerDir = os.path.join(MODEL_ROOT, DISTILROBERTA_BASE_MODEL, TOKENIZER_DIR)
  for fileName in sorted(os.listdir(tokenizerDir)):
    if fileName.startswith("."):
      continue
    with open(os.path.join(tokenizerDir, fileName), "rb") as handle:
      resource = handle.read()
    pairs.append((f"tokenizer/{fileName}", resource))
  # Add jit model
  with open(os.path.join(DATA_ROOT, "composite_model.jit"), "rb") as handle:
    jitModel = handle.read()
  pairs.append(("model/model.jit", jitModel))
  # Add labels
  labels = b"".join(f"{label}\n".encode("utf-8") for label in colLabels + ["claimsAndOpinion"])
  pairs.append(("model/labels.txt", labels))
  # Add test data
  buf = io.BytesIO()
  testData.to_parquet(buf)
  pairs.append(("test_data.parquet", buf.getvalue()))
  return pack_tarball(pairs)

In [None]:
# Create and store tarball
tarball = create_tarball(prepare_test_dataset(dataset, testTensors, compositeModel.to(CPU), allSignalLabels), allSignalLabels)
# Print the SHA-256 digest of the archive bytes
print(hashlib.sha256(tarball).hexdigest())
# Write the archive to composite_model.tar under DATA_ROOT
with open(os.path.join(DATA_ROOT, "composite_model.tar"), "wb") as handle:
  handle.write(tarball)

In [None]:
# Display the ordered list of output labels (the same list create_tarball writes to
# model/labels.txt)
allSignalLabels + ["claimsAndOpinion"]