In [1]:
import pandas as pd
import os
import re
import math
from collections import Counter

from tqdm.notebook import tqdm, trange

In [4]:
# Load word frequency statistics for control features
wiki_counts = Counter()
with open("data/wikitext-2_train_vocab.txt", "r") as counts_f:
    for line in counts_f:
        token, count = line.strip().split("\t")
        wiki_counts[token] = int(count)

In [5]:
def harmonize_rows(ref, d):
    result = []
    curr_d = d.pop(0)
    curr_ref = ref.pop(0)
    
    while len(d) > 10:
        #print(curr_d[2] + " " + curr_ref[0])
        if curr_d[2] == curr_ref[0]:
            #print("===" + curr_d[2] + "-" + curr_ref[0])
            result.append(curr_d + curr_ref)
            curr_d = d.pop(0)
            curr_ref = ref.pop(0)
        # If current token is unked, then pop both
        elif "UNK" in curr_d[2]:
            curr_d = d.pop(0)
            curr_ref = ref.pop(0)
        # If current ref has punctuation, pop both
        elif not curr_ref[0].isalpha():
            curr_ref = ref.pop(0)
            curr_d = d.pop(0)
        #If the current word is the end of a line
        elif "EOL" in curr_ref[0]:
            curr_ref = ref.pop(0)
            curr_d = d.pop(0)
        else:
            curr_d = d.pop(0)
            
    return result

In [8]:
#SPRT Data Wrangling

def harmonize_unks(data, unks):
    result = []
    for i in range(len(data)):
        if unks[i] == "1":
            result.append((data[i][0], data[i][1], "-UNK-", data[i][3]))
        else:
            result.append(data[i])
    return result

vanilla_pref = "./surprisal/vanilla/"
vanilla = [vanilla_pref + f for f in os.listdir(vanilla_pref) if not f.startswith(".")]
rnng_pref = "./surprisal/rnng/"
rnng = [rnng_pref + f for f in os.listdir(rnng_pref) if not f.startswith(".")]
on_pref = "./surprisal/ordered-neurons/"
ordered_n = [on_pref + f for f in os.listdir(on_pref) if not f.startswith(".")]

ngram_pref = "./surprisal/5gram/"
ngram = [ngram_pref + f for f in os.listdir(ngram_pref) if not f.startswith(".")]

results = ngram + rnng + ordered_n + vanilla

natural_ref = pd.read_csv("./corpora/natural_reference.csv")
natural_ref = natural_ref.drop(["Unnamed: 0"], axis=1)
natural_ref["code"] = natural_ref["zone"].map(str) + "_" + natural_ref["item"].map(str)
natural_ref = natural_ref.drop(["zone", "item"], axis=1)

brown_ref = pd.read_csv("./corpora/brown_reference.csv")
brown_ref = brown_ref.drop(["Unnamed: 0", "subject", "text_id", "text_pos", "word_in_exp", "time"], axis=1)

final_df = []
for res in results:
    tag = tuple(res.split("/")[3].replace(".csv", "").split("_"))
    text = tag[0]
    model = tag[1]
    train = tag[2]
    seed = tag[3]
    d = [tuple(x) for x in pd.read_csv(res, sep="\t").values.tolist()]
        
    if ("bnc-brown" in res):
        ref = [tuple(x) for x in brown_ref.values.tolist()]
        corpus = "bnc-brown"
    else:
        ref = [tuple(x) for x in natural_ref.values.tolist()]
        corpus = "natural-stories"
    
    if tag[1] == "ordered-neurons":
        unk_path = "./corpora/cuny_on_unks/" + corpus + "_ordered-neurons_" + tag[2] + "_" + tag[3] + ".csv"
        try:
            unks = " ".join([x.strip() for x in open(unk_path, "r").readlines()]).split(" ")
        except:
            print(unk_path)
            
        d = harmonize_unks(d, unks)
    

    harmonized = harmonize_rows(ref, d)
    result = [(x[2], x[3], x[5], text, model, train, seed, x[1], len(x[2]), math.log(wiki_counts[x[2]] + 1) ) for x in harmonized]
    final_df.extend(result)
        
to_csv = pd.DataFrame(final_df)
to_csv.columns = ["word", "surprisal", "code", "corpus", "model", "training", "seed", "sent_pos", "len", "freq"]
to_csv.to_csv("coded_results_spr.csv")

    

In [9]:
# Insert control features.

df = pd.read_csv("coded_results.csv")
#df["count"] = len(df.word)

#df['c'] = df.apply(lambda row: row.a + row.b, axis=1)
df["len"] = df.apply(lambda row: len(row.word), axis=1)
df["count"] = df.apply(lambda row: math.log(wiki_counts[row.word] + 1), axis=1)

df = df.rename({'x2': 'sent_pos'}, axis='columns')
df.to_csv("coded_results.csv")


In [11]:
# Eye Tracking Data

vanilla_pref = "./surprisal-dundee/vanilla/"
vanilla = [vanilla_pref + f for f in os.listdir(vanilla_pref) if not f.startswith(".")]
rnng_pref = "./surprisal-dundee/rnng/"
rnng = [rnng_pref + f for f in os.listdir(rnng_pref) if not f.startswith(".")]
on_pref = "./surprisal-dundee/ordered-neurons/"
ordered_n = [on_pref + f for f in os.listdir(on_pref) if not f.startswith(".")]
ngram_pref = "./surprisal-dundee/5gram/"
ngram = [ngram_pref + f for f in os.listdir(ngram_pref) if not f.startswith(".")]

results =  ordered_n + vanilla + rnng + ngram

final_df = []            
for i in trange(len(results)):
    r = results[i]
    r = r.split("/")[3].replace(".csv", "").split("_")
    text = r[0]
    model = r[1]
    train = r[2]
    if model == "5gram":
        seed = "111"
    else:
        seed = r[3]
                
    gold_path = "./dundee_firstpass/" + text.replace("wrdp", "") + "_firstpass_all.txt"
    if model == "5gram":
        model_path = "./surprisal-dundee/" + model + "/" + text + "_" + model + "_" + train + ".csv"
    else:
        model_path = "./surprisal-dundee/" + model + "/" + text + "_" + model + "_" + train + "_" + seed + ".csv"
                
    if os.path.exists(model_path):
        gold_df = pd.read_csv(gold_path, sep="\t", names=["word", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10"])
        try:
            model_df = pd.read_csv(model_path, sep="\t")
        except:
            print(model_path)
            
        # Get End of Line Information and change gold DF tokesn to "EOL" If its at the end of a line
        eol_path = "./dundee_items/" + text + ".txt"
        eols = pd.read_csv(eol_path, sep=r'[ ]{2,}', header=None)
        line_position = list(eols.iloc[:,3])
        line_position.pop(0)
        line_position.append(1)
        gold_df["line_pos"] = line_position
        gold_df['word'][gold_df.line_pos == 1] = "EOL"

        if model == "ordered-neurons":
            unk_path = "./surprisal-dundee/unk-ordered-neurons/" + text + "_" + model + "_" + train + "_" + seed + ".csv"
            try:
                unks = " ".join([x.strip() for x in open(unk_path, "r").readlines()]).split(" ")
            except:
                print(unk_path)
            model_df["unks"] = unks
            model_df["token"] = model_df.apply(lambda x: "UNK" if x.unks == "1" else x.token, axis=1)            
                    
        ref = [tuple(x) for x in gold_df.values.tolist()]
        model_output = [tuple(x)[:4] for x in model_df.values.tolist()]
        
        harmonized = harmonize_rows(ref, model_output)
        result = [x[1:15] + tuple((text, model, train, seed, len(x[2]), math.log(wiki_counts[x[2]] + 1))) for x in harmonized]

        final_df.extend(result)
    else:
        print("Path does not exist: " + str(model_path))
                    
df = pd.DataFrame(final_df)
df.columns = ["position", "word", "surprisal", "word2", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "text", "model", "training", "seed", "len", "freq"]
#df.columns = ["word", "surprisal", "gaze", "text", "model", "training", "seed", "position", "len", "freq"]
df.head()
df.to_csv("coded_results_dundee.csv")




0.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.08038585209003216
./surprisal-dundee/ordered-neurons/tx01wrdp_ordered-neurons_bllip-lg_0922.csv
./surprisal-dundee/unk-ordered-neurons/tx01wrdp_ordered-neurons_bllip-lg_0922.csv
0.1607717041800643
./surprisal-dundee/ordered-neurons/tx01wrdp_ordered-neurons_bllip-md_0922.csv
./surprisal-dundee/unk-ordered-neurons/tx01wrdp_ordered-neurons_bllip-md_0922.csv
0.24115755627009647
0.3215434083601286
0.40192926045016075
0.48231511254019294
0.5627009646302251
0.6430868167202572
0.7234726688102894
0.8038585209003215
0.8842443729903537
0.9646302250803859


In [12]:
print(harmonized[300:330])

[(19, 4, 'lost', 12.463731554538068, 'lost', 244.0, 223.0, 0.0, 243.0, 0.0, 198.0, 382.0, 411.0, 0.0, 188.0, 13), (19, 7, 'as', 6.399833789270179, 'as', 167.0, 235.0, 0.0, 0.0, 198.0, 218.0, 135.0, 0.0, 263.0, 0.0, 15), (19, 9, 'it', 3.1668045110513074, 'it', 0.0, 103.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2), (19, 10, 'was', 4.928532104369898, 'was', 258.0, 0.0, 0.0, 135.0, 0.0, 0.0, 0.0, 54.0, 0.0, 0.0, 3), (19, 11, 'a', 4.9022300033442034, 'a', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4), (19, 12, 'trench', 16.93453545454075, 'trench', 0.0, 280.0, 199.0, 0.0, 0.0, 213.0, 265.0, 79.0, 144.0, 205.0, 5), (19, 13, 'in', 5.015855348941859, 'in', 0.0, 0.0, 0.0, 253.0, 0.0, 0.0, 0.0, 103.0, 0.0, 0.0, 6), (19, 14, 'the', 4.110607079311621, 'the', 0.0, 202.0, 174.0, 0.0, 0.0, 375.0, 0.0, 0.0, 0.0, 0.0, 7), (19, 15, 'First', 12.422194171334018, 'First', 229.0, 188.0, 157.0, 199.0, 0.0, 0.0, 314.0, 266.0, 217.0, 245.0, 8), (19, 16, 'World', 4.710696780419914, 'World', 0.0, 229.0, 

In [13]:
len(gold_df.word)

2673

In [16]:
gold_df.head(50)

Unnamed: 0,word,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,line_pos
0,When,197.0,251.0,0.0,0.0,202.0,0.0,191.0,143.0,186.0,0.0,2
1,I,0.0,0.0,157.0,153.0,0.0,222.0,0.0,163.0,0.0,0.0,3
2,was,0.0,0.0,0.0,0.0,174.0,93.0,0.0,0.0,184.0,0.0,4
3,about,167.0,311.0,140.0,171.0,0.0,119.0,188.0,181.0,0.0,156.0,5
4,20,0.0,187.0,0.0,108.0,189.0,187.0,0.0,181.0,166.0,0.0,6
5,I,141.0,0.0,106.0,0.0,0.0,168.0,427.0,150.0,165.0,0.0,7
6,worked,0.0,254.0,215.0,250.0,0.0,192.0,209.0,190.0,196.0,191.0,8
7,in,0.0,0.0,108.0,0.0,209.0,0.0,0.0,0.0,0.0,0.0,9
8,an,0.0,201.0,0.0,0.0,0.0,252.0,0.0,0.0,0.0,146.0,10
9,office.,230.0,200.0,170.0,286.0,127.0,153.0,228.0,134.0,175.0,0.0,11
