In [1]:
import pandas as pd
from scipy.stats import zscore, f_oneway, kruskal

import sys
sys.path.append("../src")
import numpy as np
import nltk
from nltk.tree import Tree
import matplotlib.pyplot as plt
import pandas as pd
from iparse_vq import IParser
from nltk.tag import pos_tag, map_tag, UnigramTagger
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split

In [2]:
# word identities: topics, sentiment, categories

In [3]:
class POS_Parser:
    def __init__(self, simple=True, tagging='pos_tag'):
        self.simple = simple
        if tagging == 'pos_tag':
            self.tag = pos_tag
        else:
            tagger = UnigramTagger(treebank.tagged_sents())
            self.tag = tagger.tag
        
    def parse_batch(self, sentences):
        all_tags = []
        for s in sentences:
            text = nltk.word_tokenize(s)
            posTagged = self.tag(text)
            if self.simple:
                tags = [map_tag('en-ptb', 'universal', tag or 'NN') for word, tag in posTagged]
            else:
                tags = [tag or 'NN' for word, tag in posTagged]
            all_tags.append((None, tags))
        return all_tags

In [4]:
iparse_pos = POS_Parser(simple=False, tagging='unigram')

In [5]:
# gpn_vqtest4_labeled_16_back_test_cuda_8540884_dev=92.24.pt
# gpn_vqtest4_labeled_16_uni_cuda_8523872_dev=91.02.pt
# iparse = IParser("../models/gpn_vqtest4_labeled_16_uni_cuda_8523872_dev=91.02.pt")
iparse = IParser("../models/gpn_vqtest5_256_cuda_8576457_dev=94.41.pt")

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def read_and_normalize(csv = 'tracking_data/selfpacedreading.RT.txt', var='RT'):
    reading = pd.read_csv(csv, delimiter = '\t')
    reading['zscore'] = reading.groupby(['subj_nr'], as_index=False)[var].transform(lambda x : zscore(x,ddof=1))
    return reading

# reads a csv data file then adds a column for the zscore of each specific speaker
reading = read_and_normalize()
display(reading.head(5))
reading.groupby('subj_nr').agg({'zscore': [np.mean, np.var]}).head()

Unnamed: 0,subj_nr,sent_nr,sent_pos,correct,answer_time,word_pos,word,RT,zscore
0,1,2,12,c,3630.0,1,Billy,376,1.31483
1,1,2,12,c,3630.0,2,wrote,364,1.181588
2,1,2,12,c,3630.0,3,on,394,1.514693
3,1,2,12,c,3630.0,4,the,353,1.059449
4,1,2,12,c,3630.0,5,envelope.,354,1.070552


Unnamed: 0_level_0,zscore,zscore
Unnamed: 0_level_1,mean,var
subj_nr,Unnamed: 1_level_2,Unnamed: 2_level_2
1,1.111922e-16,1.0
2,-3.621307e-16,1.0
3,1.685706e-16,1.0
4,9.949097000000001e-17,1.0
5,1.617035e-16,1.0


In [7]:
def combine_and_separate_periodt(lst):
    s = ' '.join(lst)
    return '{} .'.format(s[:-1])

def combine_sentences(reading):
    sentences = reading.sort_values(['sent_nr', 'word_pos']).groupby(['sent_nr', 'word_pos'], as_index=False).first()[['sent_nr', 'word_pos', 'word']]
    combined_sentences = sentences.groupby('sent_nr', as_index=False).agg({'word': combine_and_separate_periodt, 'word_pos':list})
    return combined_sentences

# creates a table of each sentence (to input into the parser)
combined = combine_sentences(reading)
combined

Unnamed: 0,sent_nr,word,word_pos
0,1,Anne lost control and laughed .,"[1, 2, 3, 4, 5]"
1,2,Billy wrote on the envelope .,"[1, 2, 3, 4, 5]"
2,3,He called over his shoulder .,"[1, 2, 3, 4, 5]"
3,4,He stayed against the wall .,"[1, 2, 3, 4, 5]"
4,5,Helen ran to the toilet .,"[1, 2, 3, 4, 5]"
...,...,...,...
356,357,Simon lay staring at the ceiling again with hi...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
357,358,Paula tried to relax back into the pillow to d...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
358,359,He kept his stock books neat and checked them ...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
359,360,Tom walked into the visiting room glad that hi...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."


In [8]:
def label_sentences(iparse, combined_sentences):
    t = iparse.parse_batch(combined_sentences['word'])
    cats = []
    for tree, cat in t:
        cats.append(cat)
    return combined_sentences.assign(tags= cats)

# adds the tags to each sentence (including final period)
combined = label_sentences(iparse, combined)
combined



Unnamed: 0,sent_nr,word,word_pos,tags
0,1,Anne lost control and laughed .,"[1, 2, 3, 4, 5]","[1, 144, 125, 93, 151, 81]"
1,2,Billy wrote on the envelope .,"[1, 2, 3, 4, 5]","[1, 144, 62, 196, 17, 56]"
2,3,He called over his shoulder .,"[1, 2, 3, 4, 5]","[148, 102, 0, 222, 38, 81]"
3,4,He stayed against the wall .,"[1, 2, 3, 4, 5]","[148, 102, 62, 77, 17, 56]"
4,5,Helen ran to the toilet .,"[1, 2, 3, 4, 5]","[100, 102, 143, 55, 156, 56]"
...,...,...,...,...
356,357,Simon lay staring at the ceiling again with hi...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[1, 248, 235, 90, 242, 58, 89, 157, 4, 136, 62..."
357,358,Paula tried to relax back into the pillow to d...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[100, 102, 183, 137, 180, 90, 22, 29, 86, 235,..."
358,359,He kept his stock books neat and checked them ...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[148, 102, 70, 126, 126, 252, 204, 155, 70, 84..."
359,360,Tom walked into the visiting room glad that hi...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[129, 248, 90, 22, 250, 17, 47, 73, 201, 126, ..."


In [9]:
def uncombine_sentence(reading, combined_sentences, use_pairs=False):
    dict_lists = []
    for i, row in combined_sentences.iterrows():
        prev = ''
        for word_pos, tag in zip(row['word_pos'], row['tags']):
            tag2 = tag
            if use_pairs:
                tag2 = '{}-{}'.format(prev, tag)
                prev = tag
            dict_lists.append({'sent_nr': row['sent_nr'], 'word_pos':word_pos, 'tag': tag2})
    tags_for_words = pd.DataFrame(dict_lists)
    reading_with_tags = reading.merge(tags_for_words, on=['sent_nr', 'word_pos'])
    return reading_with_tags

# shows tag (or tag pair) for each word as in the readings table
r_with_tags = uncombine_sentence(reading, combined, use_pairs=False)
r_with_tags

Unnamed: 0,subj_nr,sent_nr,sent_pos,correct,answer_time,word_pos,word,RT,zscore,tag
0,1,2,12,c,3630.0,1,Billy,376,1.314830,1
1,2,2,179,c,3301.0,1,Billy,242,-0.220132,1
2,3,2,115,c,1822.0,1,Billy,329,0.037237,1
3,6,2,43,c,2550.0,1,Billy,208,-0.449477,1
4,8,2,87,e,3662.0,1,Billy,434,0.053585,1
...,...,...,...,...,...,...,...,...,...,...
353579,111,329,63,-,,24,jacket.,694,1.989887,58
353580,112,329,4,-,,24,jacket.,320,-0.174806,58
353581,114,329,233,-,,24,jacket.,249,0.474962,58
353582,115,329,143,-,,24,jacket.,195,-0.439994,58


In [10]:
def return_zscores(reading_with_tags):
    return reading_with_tags.groupby('tag').agg({'zscore': list}).sort_values('tag')['zscore'].tolist()

# finds the z-scores assigned to each tag
zscores_list = return_zscores(r_with_tags)
print('Categories: {} | List lengths: {}'.format(len(zscores_list), [len(lst) for lst in zscores_list]))

Categories: 221 | List lengths: [3129, 4334, 67, 409, 2285, 203, 1997, 1504, 4039, 358, 1189, 358, 2072, 1674, 1013, 1245, 3422, 707, 1686, 707, 6802, 5354, 3006, 1015, 922, 1652, 5147, 1150, 491, 140, 1058, 2910, 433, 492, 2274, 370, 861, 808, 290, 774, 521, 3098, 1629, 212, 2533, 622, 150, 4699, 652, 3644, 3578, 5135, 2513, 6170, 213, 1375, 836, 200, 3653, 2032, 2156, 629, 143, 984, 214, 8094, 364, 2369, 1209, 821, 799, 633, 804, 1069, 72, 963, 1504, 2071, 1899, 9886, 776, 507, 6008, 307, 568, 1506, 1904, 869, 7704, 1198, 141, 725, 430, 299, 348, 137, 136, 2066, 3410, 147, 76, 208, 688, 2076, 860, 504, 494, 3135, 926, 213, 1720, 4917, 662, 210, 5416, 1137, 523, 1070, 1576, 2159, 339, 1412, 1649, 1829, 486, 2871, 5355, 520, 353, 1509, 7425, 730, 1785, 5867, 1568, 493, 1016, 7526, 4051, 1355, 154, 587, 1920, 995, 2388, 80, 134, 1628, 1175, 572, 422, 72, 1537, 563, 640, 414, 1501, 1559, 2127, 66, 4277, 192, 2859, 224, 224, 381, 228, 73, 211, 283, 3248, 4735, 2218, 1510, 1192, 64, 1394, 

In [11]:
# composes the above cells into a single function
# the combined argument lets us avoid recalculating the table of sentences and tags
#     because all the stimuli were the same
# supported stat tests are anova (assumes homoscedastic and normal), or kruskal (no assumption)

def read_csv_and_r_with_tags(csv = 'tracking_data/selfpacedreading.RT.txt', var='RT', 
                       use_pairs=False, combined=None, iparse=iparse):
    r = read_and_normalize(csv = csv, var=var)
    if combined is None:
        combined = combine_sentences(r)
        combined = label_sentences(iparse, combined)
    r_with_tags = uncombine_sentence(r, combined, use_pairs)
    return r_with_tags

def read_csv_and_check(csv = 'tracking_data/selfpacedreading.RT.txt', var='RT', 
                       use_pairs=False, combined=None, iparse=iparse, test_type='kruskal'):
    r_with_tags = read_csv_and_r_with_tags(csv, var, use_pairs, combined, iparse)
    zscores_list = return_zscores(r_with_tags)
    return {'kruskal':kruskal, 'anova':f_oneway}[test_type](*zscores_list)

def read_csv_r2(csv = 'tracking_data/selfpacedreading.RT.txt', var='RT', 
                       use_pairs=False, combined=None, iparse=iparse):
    r_with_tags = read_csv_and_r_with_tags(csv, var, use_pairs, combined, iparse)
    r_with_tags, test = train_test_split(r_with_tags, test_size=0.3)
    tag_mean = r_with_tags.groupby('tag').agg({'zscore': 'mean'}).rename(columns={'zscore':'pred'})
    test = test.merge(tag_mean, on='tag')
    return 1 - np.sum(np.square(test['zscore'] - test['pred'])) / \
            np.sum(np.square(test['zscore'] - np.mean(r_with_tags['zscore'])))
    

In [12]:
np.random.seed(0)
for csv, var in [('tracking_data/selfpacedreading.RT.txt', 'RT'), 
                 ('tracking_data/eyetracking.RT.txt', 'RTfirstfix'),
                ('tracking_data/eyetracking.RT.txt', 'RTfirstpass')]:
    for use_pairs in [True, False]:
        print(csv, var, 'Using Pairs: {}'.format(use_pairs))
#         for test_type in ['kruskal', 'anova']:
#             print(read_csv_and_check(csv = csv, var=var, use_pairs=use_pairs, 
#                                      combined=combined, test_type=test_type))
        for parser in [iparse, iparse_pos]:
            print(read_csv_r2(csv = csv, var=var, use_pairs=use_pairs, 
                                     iparse=parser))
        print()


tracking_data/selfpacedreading.RT.txt RT Using Pairs: True




-0.0016082597158781908
0.0019979484523962032

tracking_data/selfpacedreading.RT.txt RT Using Pairs: False
0.00040448528205649925
0.0010145600625923734

tracking_data/eyetracking.RT.txt RTfirstfix Using Pairs: True
0.1183171492481857
0.08417806586040921

tracking_data/eyetracking.RT.txt RTfirstfix Using Pairs: False
0.08720900278833632
0.06851090622812406

tracking_data/eyetracking.RT.txt RTfirstpass Using Pairs: True
0.1499245386337711
0.09293940641117482

tracking_data/eyetracking.RT.txt RTfirstpass Using Pairs: False
0.09138240684350352
0.0857344278531138



In [13]:
# # statistical tests between each tag
# # useful for analysis?

# from itertools import combinations
# for cat1, cat2 in combinations(range(len(zscores_list)), 2):
#     print(cat1, cat2, f_oneway(zscores_list[cat1], zscores_list[cat2]).pvalue)