Main pipeline for the CCS3 project.

Runs:
1. MECO eye-tracking analysis
2. Surprisal computation
3. Gaze-informed MT
4. Evaluation

In [1]:
!pip install rdata wordfreq sacrebleu transformers torch tqdm



In [2]:
# Mount Google Drive into the Colab filesystem; the project root used by the
# next cell lives underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Environment setup (Colab)
import os
import sys

# Defaults to the Colab Drive location. Set the CCS3_ROOT environment variable
# to run the notebook against a checkout that lives somewhere else.
PROJECT_ROOT = os.environ.get("CCS3_ROOT", "/content/drive/MyDrive/CCS3")

# Fail early with an actionable message instead of a bare chdir error.
if not os.path.isdir(PROJECT_ROOT):
    raise FileNotFoundError(
        f"Project root not found: {PROJECT_ROOT!r}. "
        "Mount Google Drive first or set the CCS3_ROOT environment variable."
    )

# Work from the project root (presumably so relative paths used by the
# project code resolve against it -- TODO confirm).
os.chdir(PROJECT_ROOT)

# Put the project root on the import path so `src.*` modules can be imported;
# the membership check keeps re-running this cell from adding duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("Working directory:", os.getcwd())

Working directory: /content/drive/MyDrive/CCS3


In [4]:
# Imports
import random
import numpy as np

from src.config import RANDOM_SEED, EVAL_SAMPLE_SIZE

# MECO
from src.meco.load import load_meco_rda, select_language
from src.meco.preprocess import preprocess_meco
from src.meco.features import add_meco_features
from src.meco.regression import fit_ols_full_and_first_occurrence

# Surprisal
from src.surprisal.ru import RussianSurprisalModel, add_ru_surprisal
from src.surprisal.en import EnglishSurprisalModel

# WMT
from src.wmt.load import load_wmt
from src.wmt.preprocess import preprocess_wmt

# Gaze-informed MT
from src.gaze_mt.generate import MTGenerator
from src.gaze_mt.difficulty import DifficultyModel
from src.gaze_mt.rerank import score_candidates, select_best

# Evaluation
from src.evaluation.stats import wilcoxon_test, summarize_differences
from src.evaluation.bleu import compute_bleu


In [5]:
# Main
def main():
    """Run the full CCS3 pipeline end to end and print its results.

    Stages, in order:
      1. Load the MECO data, select the "ru_mo" subset, preprocess it and add
         features.
      2. Add Russian surprisal and fit the OLS regressions; the intercept and
         surprisal coefficient of the full-data model parameterize the
         difficulty model.
      3. Load and preprocess the WMT data and draw a seeded evaluation sample
         of at most ``EVAL_SAMPLE_SIZE`` rows.
      4. For every sampled sentence, generate candidate translations, score
         them with the difficulty model, and record the baseline pick and the
         gaze-selected pick.
      5. Print the difficulty summary, the Wilcoxon test result, BLEU for both
         systems, and the three examples with the largest absolute
         difficulty difference.

    Returns:
        None. All results are reported through ``print``.
    """
    # Reproducibility: seed every RNG the pipeline may draw from.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    try:
        # Local import so the notebook's import cell does not need to change.
        import torch
    except ImportError:
        torch = None
    if torch is not None and isinstance(RANDOM_SEED, int):
        # The models presumably run on torch (installed in the first cell);
        # seeding it keeps any sampling-based generation repeatable.
        torch.manual_seed(RANDOM_SEED)

    # MECO pipeline
    print("Loading MECO data...")
    df = load_meco_rda()

    ru_df = select_language(df, "ru_mo")

    print("Preprocessing MECO...")
    ru_df = preprocess_meco(ru_df)

    print("Adding MECO features...")
    ru_df = add_meco_features(ru_df)

    # Russian surprisal
    print("Computing Russian surprisal...")
    ru_surprisal_model = RussianSurprisalModel()
    ru_df = add_ru_surprisal(ru_df, ru_surprisal_model)

    # MECO regressions. Only the full-data model feeds the MT stage; the
    # first-occurrence model is fitted by the same call but not used here.
    lm_full, _lm_first = fit_ols_full_and_first_occurrence(ru_df)

    # coefficients for gaze-informed MT
    b0 = float(lm_full.params["const"])
    b_s = float(lm_full.params["surprisal"])

    # WMT data
    print("\nLoading WMT data...")
    wmt_df = load_wmt()
    wmt_df = preprocess_wmt(wmt_df)

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so never request more than the corpus holds.
    n_eval = min(EVAL_SAMPLE_SIZE, len(wmt_df))
    if n_eval < EVAL_SAMPLE_SIZE:
        print(
            f"Requested {EVAL_SAMPLE_SIZE} evaluation sentences but only "
            f"{n_eval} are available; using all of them."
        )

    wmt_sample = wmt_df.sample(
        n=n_eval,
        random_state=RANDOM_SEED,
    )

    # Gaze-informed MT
    print("\nRunning gaze-informed MT...")

    mt_generator = MTGenerator()
    en_surprisal_model = EnglishSurprisalModel()

    difficulty_model = DifficultyModel(
        surprisal_model=en_surprisal_model,
        b0=b0,
        b_s=b_s,
    )

    # One record per evaluated sentence; the per-metric lists needed by the
    # evaluation helpers are derived from these records after the loop.
    example_records = []

    # Reranking loop
    for ru_sentence, ref in zip(wmt_sample["ru_clean"], wmt_sample["en"]):

        # generate candidates + log-probs
        candidates, log_probs = mt_generator.generate_with_scores(ru_sentence)

        scored = score_candidates(
            candidates=candidates,
            log_probs=log_probs,
            difficulty_model=difficulty_model,
        )

        best = select_best(scored)

        baseline = best["baseline"]
        gaze_best = best["gaze_best"]

        baseline_diff = baseline["mean_difficulty"]
        gaze_diff = gaze_best["mean_difficulty"]

        example_records.append({
            "ru": ru_sentence,
            "baseline": baseline["translation"],
            "gaze": gaze_best["translation"],
            "ref": ref,
            "baseline_diff": baseline_diff,
            "gaze_diff": gaze_diff,
            # Positive delta: the gaze pick has lower predicted difficulty
            # than the baseline pick.
            "delta": baseline_diff - gaze_diff,
        })

    baseline_scores = [rec["baseline_diff"] for rec in example_records]
    gaze_scores = [rec["gaze_diff"] for rec in example_records]

    baseline_translations = [rec["baseline"] for rec in example_records]
    gaze_translations = [rec["gaze"] for rec in example_records]
    references = [rec["ref"] for rec in example_records]

    # Evaluation: difficulty
    print("\nEvaluating difficulty results...")

    summary = summarize_differences(baseline_scores, gaze_scores)
    stats = wilcoxon_test(baseline_scores, gaze_scores)

    print("\nDifficulty summary:")
    for k, v in summary.items():
        print(f"{k}: {v:.4f}")

    print("\nWilcoxon signed-rank test:")
    print(stats)

    # Evaluation: BLEU
    print("\nEvaluating BLEU...")

    bleu_baseline = compute_bleu(baseline_translations, references)
    bleu_gaze = compute_bleu(gaze_translations, references)

    print(f"BLEU baseline: {bleu_baseline:.2f}")
    print(f"BLEU gaze:     {bleu_gaze:.2f}")

    # Qualitative analysis
    print("\nMost divergent examples (by difficulty difference):\n")

    # Rank by magnitude so large changes in either direction surface first.
    example_records_sorted = sorted(
        example_records,
        key=lambda x: abs(x["delta"]),
        reverse=True,
    )

    for ex in example_records_sorted[:3]:
        print(f"Δ difficulty = {ex['delta']:.4f}")
        print("RU:", ex["ru"])
        print("BASE:", ex["baseline"])
        print("GAZE:", ex["gaze"])
        print("REF:", ex["ref"])
        print("-" * 60)

In [6]:
# Entry point
# Executing this cell in the notebook kernel runs the whole pipeline (the
# output below is produced by this call). The `__name__` guard keeps `main()`
# from running automatically if this code is imported as a module instead.
if __name__ == "__main__":
    main()

Loading MECO data...
Preprocessing MECO...
Adding MECO features...
Computing Russian surprisal...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Computing RU surprisal:   0%|          | 0/2714 [00:00<?, ?it/s]


Data splits:
  Full MECO: 33433 observations
  First occurrences: 1213 observations
  Unique words: 1213

OLS REGRESSION RESULTS — FULL MECO
                            OLS Regression Results                            
Dep. Variable:                  dur_z   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.050
Method:                 Least Squares   F-statistic:                     539.5
Date:                Thu, 01 Jan 2026   Prob (F-statistic):               0.00
Time:                        20:44:52   Log-Likelihood:                -33151.
No. Observations:               30719   AIC:                         6.631e+04
Df Residuals:                   30715   BIC:                         6.634e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025  




Evaluating difficulty results...

Difficulty summary:
baseline_mean: -0.4601
gaze_mean: -0.4610
mean_difference: -0.0008
std_difference: 0.0032

Wilcoxon signed-rank test:
{'statistic': 0.0, 'p_value': 0.043114446783075355}

Evaluating BLEU...
BLEU baseline: 36.37
BLEU gaze:     36.40

Most divergent examples (by difficulty difference):

Δ difficulty = 0.0184
RU: КПК больше не представляет рабочих и крестьян, но может представлять «большинство китайского народа», включая «красных» капиталистов.
BASE: KPC no longer represents workers and peasants, but can represent "most Chinese people", including "red" capitalists.
GAZE: KPC no longer represents workers and peasants, but can represent the majority of the Chinese people, including "red" capitalists.
REF: The CCP no longer represents workers and peasants, but it can represent the "majority of the people," including "red" capitalists.
------------------------------------------------------------
Δ difficulty = 0.0120
RU: Выжившие в рабств