In [12]:
import pandas as pd
import os
import re
import math
from collections import Counter

from tqdm.notebook import tqdm

In [3]:
# Load word frequency statistics for control features
word_freq = Counter()
with open("../data/wikitext-2_train_vocab.txt", "r") as f:
    for line in f:
        token, freq = line.strip().split("\t")
        word_freq[token] = int(freq)

In [28]:
# Harmonize lists of <(word, int),(word, int)> pairs
# Discards pairs where words do not match

def harmonize_rows(ref, d):
    result = []
    curr_d = d.pop(0)
    curr_ref = ref.pop(0)
    
    while len(d) > 10:
        model_token, surprisal = curr_d
        code, rt_token, rt = curr_ref

        if code == 30628:
            print(curr_d, curr_ref)
        
        #print(curr_d[2] + " " + curr_ref[0])
        if model_token == rt_token:
            #print("===" + curr_d[2] + "-" + curr_ref[0])
            result.append(curr_d + curr_ref)
            curr_d = d.pop(0)
            curr_ref = ref.pop(0)
        # If current token is unked, then pop both
        elif "UNK" in model_token:
            curr_d = d.pop(0)
            curr_ref = ref.pop(0)
        # If current ref has punctuation, pop both
        elif not rt_token.isalpha():
            curr_ref = ref.pop(0)
            curr_d = d.pop(0)
        #If the current word is the end of a line
        elif "EOL" in rt_token:
            curr_ref = ref.pop(0)
            curr_d = d.pop(0)
        else:
            curr_d = d.pop(0)
            
    return result

In [33]:
#

def merge_model_results():
    
    final_df = []
    
    models = [f for f in os.listdir("../data/model_results") if not f.startswith(".")]
    for m in tqdm(models, desc="Harmonizing models"):
        tqdm.write("Harmonizing results for " + m)
        test_corpus = [f for f in os.listdir("../data/model_results/" + m) if not f.startswith(".")]
        # DEV
        test_corpus = ["dundee"]
        for tc in test_corpus:
            test_files = [f for f in os.listdir("../data/model_results/" + m + "/" + tc) if not f.startswith(".")]
            
            for tf in tqdm(test_files, desc="Test files"):
                if tf == "UNKS":
                    print("TODO: UNKS")
                    continue
                
                try:
                    tf = tf.split("_")
                    test_filename = tf[0]
                    model_architecture = tf[1]
                    training_data = tf[2]
                    seed = tf[3].replace(".csv", "")
                except:
                    print(tf)
                
                # Special handling for the Dundee corpus
                if tc == "dundee":
                    gold_test_filename = test_filename.replace("wrdp", "") + "_avg"
                    gold_standard = pd.read_csv("../data/human_rts/" + tc + "/" + gold_test_filename + ".txt", sep="\t", names=["word", "surprisal"])
                    gold_standard.insert(0, 'code', range(0,len(gold_standard)))
                    gold_standard["code"] = gold_standard["code"] + int(test_filename.replace("tx", "").replace("wrdp", "")) * 10000
                else:
                    gold_standard = pd.read_csv("../data/human_rts/" + tc + "/" + test_filename + ".txt", sep="\t")
                                    
                model_results = "_".join([test_filename, model_architecture, training_data, seed])
                model_path = "/".join(["../data/model_results", m, tc, model_results])
                model_results = pd.read_csv(model_path+".csv", sep="\t")
            
                # TODO: EOL Handleing
                
                model_results = [tuple(x)[2:4] for x in model_results.values.tolist()]
                gold_standard = [tuple(x) for x in gold_standard.values.tolist()]
                
                harmonized_results = harmonize_rows(gold_standard, model_results)
                
                result = [tuple((x[2], x[0], x[1], x[4], tc, model_architecture, training_data, seed, len(x[0]), math.log(word_freq[x[0]]+1))) for x in harmonized_results]
                final_df.extend(result)
                
    df = pd.DataFrame(final_df)
    df.columns = ["code", "word", "surprisal", "psychometric", "corpus", "model", "training", "seed", "len", "freq"]
    df.head()
    df.to_csv("../data/harmonized_results.csv")
    return df

df = merge_model_results()

HBox(children=(FloatProgress(value=0.0, description='Harmonizing models', max=5.0, style=ProgressStyle(descrip…

Harmonizing results for 5gram


HBox(children=(FloatProgress(value=0.0, description='Test files', max=80.0, style=ProgressStyle(description_wi…

('.', 3.996775431879412) (30628, 'Beckham', 373.4)
('Beckham', 25.244896762107004) (30628, 'Beckham', 373.4)
('.', 3.3674915911182564) (30628, 'Beckham', 373.4)
('Beckham', 22.56801753440681) (30628, 'Beckham', 373.4)
('.', 5.480494004734213) (30628, 'Beckham', 373.4)
('UNK-INITC', 7.532568421063043) (30628, 'Beckham', 373.4)
('.', 4.504853370395847) (30628, 'Beckham', 373.4)
('Beckham', 26.36086845347224) (30628, 'Beckham', 373.4)

Harmonizing results for rnng


HBox(children=(FloatProgress(value=0.0, description='Test files', max=180.0, style=ProgressStyle(description_w…

('.', 11.0215) (30628, 'Beckham', 373.4)
('Beckham', 15.6277) (30628, 'Beckham', 373.4)
('.', 10.2556) (30628, 'Beckham', 373.4)
('Beckham', 16.3061) (30628, 'Beckham', 373.4)
('.', 10.7883) (30628, 'Beckham', 373.4)
('UNK-INITC', 5.52126) (30628, 'Beckham', 373.4)
('.', 11.549000000000001) (30628, 'Beckham', 373.4)
('Beckham', 15.2614) (30628, 'Beckham', 373.4)
('.', 9.5025) (30628, 'Beckham', 373.4)
('UNK-INITC', 5.29639) (30628, 'Beckham', 373.4)
('.', 10.3834) (30628, 'Beckham', 373.4)
('Beckham', 15.6341) (30628, 'Beckham', 373.4)
('.', 9.71155) (30628, 'Beckham', 373.4)
('UNK-INITC', 5.35) (30628, 'Beckham', 373.4)
('.', 11.2966) (30628, 'Beckham', 373.4)
('Beckham', 14.8918) (30628, 'Beckham', 373.4)
('.', 11.3135) (30628, 'Beckham', 373.4)
('UNK-INITC', 5.1432400000000005) (30628, 'Beckham', 373.4)

Harmonizing results for vanilla


HBox(children=(FloatProgress(value=0.0, description='Test files', max=180.0, style=ProgressStyle(description_w…

('.', 1.596814) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)
('.', 2.118465) (30628, 'Beckham', 373.4)
('UNK-INITC', 0.0) (30628, 'Beckham', 373.4)
('.', 1.056573) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)
('.', 1.100375) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)
('.', 1.2768) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)
('.', 1.453569) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)
('.', 1.742211) (30628, 'Beckham', 373.4)
('UNK-INITC', 0.0) (30628, 'Beckham', 373.4)
('.', 1.7080380000000002) (30628, 'Beckham', 373.4)
('UNK-INITC', 0.0) (30628, 'Beckham', 373.4)
('.', 1.598862) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)

Harmonizing results for gpt2.old


HBox(children=(FloatProgress(value=0.0, description='Test files', max=120.0, style=ProgressStyle(description_w…

('.', 9.17480182647705) (30628, 'Beckham', 373.4)
('Beckham', 11.973114013671875) (30628, 'Beckham', 373.4)
('.', 3.667911529541016) (30628, 'Beckham', 373.4)
('Beckham', 7.750174045562744) (30628, 'Beckham', 373.4)
('.', 3.3121492862701416) (30628, 'Beckham', 373.4)
('Beckham', 8.703068733215332) (30628, 'Beckham', 373.4)
('.', 3.164111614227295) (30628, 'Beckham', 373.4)
('Beckham', 8.889322280883789) (30628, 'Beckham', 373.4)
('.', 2.055673360824585) (30628, 'Beckham', 373.4)
('Beckham', 13.251792907714846) (30628, 'Beckham', 373.4)
('.', 3.880390167236328) (30628, 'Beckham', 373.4)
('Beckham', 15.578605651855467) (30628, 'Beckham', 373.4)

Harmonizing results for gpt2


HBox(children=(FloatProgress(value=0.0, description='Test files', max=140.0, style=ProgressStyle(description_w…

('.', 3.0326457023620605) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)
('.', 3.667911529541016) (30628, 'Beckham', 373.4)
('Beckham', 7.750174045562744) (30628, 'Beckham', 373.4)
('.', 2.342882871627808) (30628, 'Beckham', 373.4)
('Beckham', 0.0) (30628, 'Beckham', 373.4)
('.', 3.3121492862701416) (30628, 'Beckham', 373.4)
('Beckham', 8.703068733215332) (30628, 'Beckham', 373.4)
('.', 3.164111614227295) (30628, 'Beckham', 373.4)
('Beckham', 8.889322280883789) (30628, 'Beckham', 373.4)
('.', 2.055673360824585) (30628, 'Beckham', 373.4)
('Beckham', 13.251792907714846) (30628, 'Beckham', 373.4)
('.', 3.880390167236328) (30628, 'Beckham', 373.4)
('Beckham', 15.578605651855467) (30628, 'Beckham', 373.4)




In [38]:
# Drop corpus--code combinations for which *any* model--training has zero surprisal
to_drop = df.groupby(["corpus", "code"]).apply(lambda xs: (xs.surprisal == 0).any())

In [43]:
df_mod = df.set_index(["corpus", "code"]).drop(index=to_drop[to_drop].index)

In [45]:
df_mod.to_csv("../data/harmonized_results.csv")