In [1]:
import pandas as pd
import nltk
from random import sample 
import random
import pickle

In [2]:
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# Token table read from a tab-separated file without a header row; the ten
# column names assigned below follow the CoNLL-U field layout.
data = pd.read_csv("data/wsj_deps.conllu", sep="\t", header=None)
data.columns = ["index", "WORD", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]

# Set of the first elements of every item in the pickled (nested) training data;
# used below to keep only words that also occur in training.
# presumably each item is a (word, tag) pair -- TODO confirm against the pickle's producer.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data/train_pos.p", "rb") as train_file:  # context manager closes the handle
    train_items = {x[0] for x in flatten(pickle.load(train_file))}
data.head()

Unnamed: 0,index,WORD,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,1,Pierre,Pierre,PROPN,NNP,_,2,compound,_,_
1,2,Vinken,Vinken,PROPN,NNP,_,9,nsubj,_,_
2,3,",",",",PUNCT,",",_,2,punct,_,_
3,4,61,61,NUM,CD,_,5,nummod,_,_
4,5,years,year,NOUN,NNS,_,6,nmod:npmod,_,_


In [3]:
# Occurrence counts per (word, lemma, tag) for lower-case verb tokens
# (tags VBD / VBN / VB) whose word form is also in the training vocabulary.
verbs = (
    data[data["XPOS"].isin(["VBD", "VBN", "VB"])]
    .loc[lambda frame: frame["WORD"].str.islower()]
    .loc[lambda frame: frame["WORD"].isin(train_items)]
)

# After reset_index() the count column is labelled with the integer 0.
v_counts = verbs.groupby(["WORD", "LEMMA", "XPOS"]).size().reset_index()
#v_counts.to_csv("data/v_counts.csv")

# Helper listing of the most frequent words (useful when choosing filler words for the test sentences).
# Despite the variable names, the rows selected here carry the tag "VB".
advs = data[data["XPOS"] == "VB"]
adv_counts = (
    advs.groupby(["WORD", "LEMMA", "XPOS"])
    .size()
    .reset_index()
    .sort_values(by=[0], ascending=False)
)
adv_counts = list(map(tuple, adv_counts.values.tolist()))
#print(adv_counts[:100])

In [4]:
# Number of sentence sets generated per sampled word and test type.
N_ITEMS_PER_TEST = 20
# Upper bound on how many words are sampled from each frequency bucket.
MAX_ITEMS_PER_BUCKET = 20

# Open class items
NOUN_BLOCKS = ["investor", "maker", "president", "executive", "officer", "lawyer", "government", "chairman", "spokesman"]
V_TR_BLOCKS = ["likes", "saw", "had"]
ADJ_BLOCKS = ["big", "important", "new", "special", "key", "foreign", "old"]
# ITEM_BLOCKS_SING and ITEM_BLOCKS_PL are parallel lists: entry i of the plural
# list is the plural of entry i of the singular list.
ITEM_BLOCKS_SING = ["report", "investment", "position", "letter", "idea", "thing", "computer", "product", "paper"]
ITEM_BLOCKS_PL = ["reports", "investments", "positions", "letters", "ideas", "things", "computers", "products", "papers"]
PRES_ADVS = ["quickly", "directly", "immediately", "easily", "rapidly", "entirely"]

# Closed class items
# These lists are drawn from with random.choice, so repeated entries
# ("the", "can", "will") are picked proportionally more often.
DET_BLOCKS = ["the", "some", "the", "the", "the", "the"]
INTENSE_BLOCKS = ["very", "extremely", "somewhat", "rather"]
PAST_ADVS = ["yesterday", "recently", "today"]
AUX = ["can", "will", "might", "should", "can", "will", "can", "will"]
PREPS = ["near", "with"]
SING_BE = ["is", "was"]
PL_BE = ["are", "were"]


In [5]:
def sample_words(data, max_items_per_bucket=None):
    """Randomly sample rows of a count table, stratified by occurrence count.

    Rows are grouped into frequency buckets by the value in the column labelled
    ``0`` (the count column produced by ``groupby(...).size().reset_index()``),
    and up to ``max_items_per_bucket`` rows are drawn without replacement from
    each bucket.  Rows with a count of 1 or a count above 100 fall outside every
    bucket and are never returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a count column labelled ``0``.
    max_items_per_bucket : int, optional
        Cap on the number of rows drawn per bucket.  Defaults to the
        module-level ``MAX_ITEMS_PER_BUCKET``.

    Returns
    -------
    list of tuple
        One tuple per sampled row (all columns, in column order), bucket by
        bucket from the lowest to the highest frequency range.
    """
    if max_items_per_bucket is None:
        max_items_per_bucket = MAX_ITEMS_PER_BUCKET

    # (exclusive lower bound, inclusive upper bound) on the occurrence count.
    frequency_buckets = [(1, 2), (2, 3), (3, 4), (4, 5), (5, 10), (10, 20), (20, 50), (50, 100)]

    counts = data[0]
    result = []
    for low, high in frequency_buckets:
        bucket = data[(counts > low) & (counts <= high)]
        rows = [tuple(r) for r in bucket.values.tolist()]
        # Never ask for more rows than the bucket holds (random.sample would raise).
        result.extend(sample(rows, min(max_items_per_bucket, len(rows))))

    return result
    
def export_plural_test(test, name):
    """Write the number-agreement tests to ``<name>_items.csv`` and ``<name>_exposure_tests.txt``.

    Parameters
    ----------
    test : list of list of tuple
        Nested list of test rows; each row is an 8-tuple whose element 3 is the
        space-tokenised test sentence.
    name : str
        Path prefix for the two output files.

    The CSV (no header, no index) has one line per token of every sentence: the
    token followed by the row's other seven fields (the sentence itself is
    left out).  The text file holds the sentences, one per line, without a
    trailing newline.
    """
    test = flatten(test)

    items_output = [
        (word, sent[0], sent[1], sent[2], sent[4], sent[5], sent[6], sent[7])
        for sent in test
        for word in sent[3].split(" ")
    ]

    pd.DataFrame(items_output).to_csv(name + "_items.csv", index=False, header=False)

    with open(name + "_exposure_tests.txt", "w") as outf:
        # write(), not writelines(): the payload is one joined string, and
        # writelines() would iterate over it character by character.
        outf.write("\n".join(sent[3] for sent in test))


def export_argstruct_test(test, name):
    """Write the argument-structure tests to ``<name>_items.csv`` and ``<name>_exposure_tests.txt``.

    Parameters
    ----------
    test : list of list of tuple
        Nested list of test rows; each row is a 10-tuple whose element 5 is the
        space-tokenised test sentence.
    name : str
        Path prefix for the two output files.

    The CSV (no header, no index) has one line per token of every sentence: the
    token followed by the row's other nine fields (the sentence itself is left
    out).  The text file holds the sentences, one per line, without a trailing
    newline.
    """
    test = flatten(test)

    items_output = [
        (word, sent[0], sent[1], sent[2], sent[3], sent[4], sent[6], sent[7], sent[8], sent[9])
        for sent in test
        for word in sent[5].split(" ")
    ]

    pd.DataFrame(items_output).to_csv(name + "_items.csv", index=False, header=False)

    with open(name + "_exposure_tests.txt", "w") as outf:
        # write(), not writelines(): the payload is one joined string, and
        # writelines() would iterate over it character by character.
        outf.write("\n".join(sent[5] for sent in test))

In [6]:
# PLURAL AGREEMENT LEARNING
    
def n_distractor(is_plural):
    """Pick a random distractor noun whose number is the opposite of the target's.

    A plural target gets a noun from ITEM_BLOCKS_SING; a singular target gets
    one from ITEM_BLOCKS_PL.
    """
    pool = ITEM_BLOCKS_SING if is_plural else ITEM_BLOCKS_PL
    return random.choice(pool)

def generate_plural_tests(noun_bundle, is_plural):
    """Build subject-verb number-agreement test sentences for one noun.

    For each of N_ITEMS_PER_TEST random fillings, five sentence templates are
    produced, each in a singular-verb ("sing") and a plural-verb ("pl") variant:

    - base_simple:   The NOUN is/are .
    - base_rc:       The NOUN who the DISTRACTOR VERB is/are .
    - base_pp:       The NOUN near the ADJ DISTRACTOR is/are .
    - transf_simple: Is/Are the NOUN .
    - transf_mod:    Is/Are the very ADJ and ADJ NOUN .

    Parameters
    ----------
    noun_bundle : sequence
        Row describing the noun; element 0 is the word form, element 2 and
        element 3 are copied into the output unchanged (in this notebook they
        are the XPOS tag and the occurrence count).
    is_plural : bool
        Whether the noun is plural; decides the number of the distractor noun.

    Returns
    -------
    list of tuple
        Rows of (word, noun_bundle[2], noun_bundle[3], sentence, verb number
        "sing"/"pl", target index, item number, test name).  The target index
        is the 1-based token position of the be-verb in the base tests and of
        the noun in the transf tests.
    """
    noun, pos, freq = noun_bundle[0], noun_bundle[2], noun_bundle[3]
    # Order-preserving de-duplication: unlike a set difference, the candidate
    # order does not depend on string hashing, so seeded runs are repeatable.
    adjectives = list(dict.fromkeys(ADJ_BLOCKS))
    test_items = []

    def add_pair(sing_sent, pl_sent, target_index, item_number, test_name):
        # Append the singular-verb row followed by the plural-verb row.
        for sentence, verb_number in ((sing_sent, "sing"), (pl_sent, "pl")):
            test_items.append((noun, pos, freq, sentence, verb_number, target_index, item_number, test_name))

    for i in range(N_ITEMS_PER_TEST):

        det1 = random.choice(DET_BLOCKS)
        det2 = random.choice(DET_BLOCKS)
        verb1 = random.choice(V_TR_BLOCKS)
        prep1 = random.choice(PREPS)
        adj1 = random.choice(ADJ_BLOCKS)
        n_distr = n_distractor(is_plural)
        intensifier1 = random.choice(INTENSE_BLOCKS)
        adj2 = random.choice([adj for adj in adjectives if adj != adj1])
        sing_be = random.choice(SING_BE)
        pl_be = random.choice(PL_BE)

        subject = [det1.capitalize(), noun]

        # BASE SIMPLE: The NOUN is/are
        add_pair(" ".join(subject + ["is", "."]),
                 " ".join(subject + ["are", "."]),
                 3, i, "base_simple")

        # BASE RC: The noun who the president likes is/are
        rc = ["who", det2, n_distr, verb1]
        add_pair(" ".join(subject + rc + ["is", "."]),
                 " ".join(subject + rc + ["are", "."]),
                 7, i, "base_rc")

        # BASE PREP: The noun near the big president is/are
        # (no verb inside the prepositional phrase, so the be-verb is token 7)
        pp = [prep1, det2, adj1, n_distr]
        add_pair(" ".join(subject + pp + ["is", "."]),
                 " ".join(subject + pp + ["are", "."]),
                 7, i, "base_pp")

        # TRANSF SIMPLE: Are/Is the NOUN
        add_pair(" ".join([sing_be.capitalize(), det1, noun, "."]),
                 " ".join([pl_be.capitalize(), det1, noun, "."]),
                 3, i, "transf_simple")

        # TRANSF MOD: Are/Is the very important and big NOUN
        modified = [det1, intensifier1, adj1, "and", adj2, noun, "."]
        add_pair(" ".join([sing_be.capitalize()] + modified),
                 " ".join([pl_be.capitalize()] + modified),
                 7, i, "transf_mod")

    return test_items


# Lower-case noun tokens (tags NN / NNS) whose word form is also in the training vocabulary.
nouns = (
    data[data["XPOS"].isin(["NN", "NNS"])]
    .loc[lambda frame: frame["WORD"].str.islower()]
    .loc[lambda frame: frame["WORD"].isin(train_items)]
)
# Occurrence count per (word, lemma, tag); the count column is labelled 0.
n_counts = nouns.groupby(["WORD", "LEMMA", "XPOS"]).size().reset_index()

# Frequency-stratified samples, singular nouns first and plural nouns second.
is_singular_tag = n_counts["XPOS"] == "NN"
is_plural_tag = n_counts["XPOS"] == "NNS"
n_sing = sample_words(n_counts[is_singular_tag])
n_pl = sample_words(n_counts[is_plural_tag])

# One list of test rows per sampled noun.
singulars = [generate_plural_tests(bundle, False) for bundle in n_sing]
plurals = [generate_plural_tests(bundle, True) for bundle in n_pl]

export_plural_test(plurals + singulars, "number-downsample")



In [7]:
# ARGUMENT STRUCTURE LEARNING

def generate_argstruct_tests(verb_bundle, is_pres):
    """Build argument-structure test sentences for one verb.

    Argument Structure Tests
        - The lion devoured the gazelle yesterday . [OK for Trans, Not OK for Intrans]
        - The lion devoured yesterday . [Not OK for Trans, OK for intrans]

        - The gazelle was devoured yesterday . [OK for trans, not OK for intrans]
        - The gazelle devoured yesterday . [Not OK for trans, OK for intrans]

    For each of N_ITEMS_PER_TEST random fillings, sentence pairs are produced
    in an "obj" and a "nobj" variant.  With ``is_pres`` true only the
    "base-pres" pair (auxiliary + verb, with / without a direct object) is
    generated.  Otherwise four pairs are generated: "base-nomod" (with /
    without a direct object) and "transf-nomod", "transf-mod",
    "transf-longmod", where the "obj" variant is the passive ("was" + verb)
    and the "nobj" variant the active sentence, preceded by zero, two or
    three adverbs.

    Parameters
    ----------
    verb_bundle : sequence
        Row describing the verb; element 0 is the word form, elements 2, 3, 5
        and 6 are copied into the output unchanged (in this notebook they are
        the XPOS tag, the occurrence count, Trans_V and Intrans_V).
    is_pres : bool
        Select the auxiliary + base-form template instead of the past-tense ones.

    Returns
    -------
    list of tuple
        Rows of (word, verb_bundle[2], verb_bundle[3], verb_bundle[5],
        verb_bundle[6], sentence, "obj"/"nobj", target index, item number,
        test name).  The target index is the 1-based token position of the
        sentence-final period.
    """
    verb = verb_bundle[0]
    # Order-preserving de-duplication: unlike a set difference, the candidate
    # order does not depend on string hashing, so seeded runs are repeatable.
    pres_advs = list(dict.fromkeys(PRES_ADVS))
    test_items = []

    def add_pair(obj_sent, obj_index, nobj_sent, nobj_index, item_number, test_name):
        # Append the "obj" row followed by the "nobj" row for one template.
        for sentence, label, target_index in ((obj_sent, "obj", obj_index), (nobj_sent, "nobj", nobj_index)):
            test_items.append((verb, verb_bundle[2], verb_bundle[3], verb_bundle[5], verb_bundle[6],
                               sentence, label, target_index, item_number, test_name))

    for i in range(N_ITEMS_PER_TEST):

        det1 = random.choice(DET_BLOCKS)
        noun1 = random.choice(NOUN_BLOCKS)
        det2 = random.choice(DET_BLOCKS)
        noun2 = random.choice(NOUN_BLOCKS)
        adv3 = random.choice(PAST_ADVS)
        aux = random.choice(AUX)

        # Three distinct manner adverbs for the modified passive templates.
        adv1 = random.choice(PRES_ADVS)
        adv2 = random.choice([adv for adv in pres_advs if adv != adv1])
        adv4 = random.choice([adv for adv in pres_advs if adv not in (adv1, adv2)])

        subject = [det1.capitalize(), noun1]

        if is_pres:

            # BASE TEST
            add_pair(" ".join(subject + [aux, verb, det2, noun2, "today", "."]), 8,
                     " ".join(subject + [aux, verb, "today", "."]), 6,
                     i, "base-pres")

        else:

            # BASE TEST
            add_pair(" ".join(subject + [verb, det2, noun2, adv3, "."]), 7,
                     " ".join(subject + [verb, adv3, "."]), 5,
                     i, "base-nomod")

            # TRANSF TEST
            add_pair(" ".join(subject + ["was", verb, adv3, "."]), 6,
                     " ".join(subject + [verb, adv3, "."]), 5,
                     i, "transf-nomod")

            # TRANSF TEST Short Modification
            short_mod = [adv1, "and", adv2, verb, adv3, "."]
            add_pair(" ".join(subject + ["was"] + short_mod), 9,
                     " ".join(subject + short_mod), 8,
                     i, "transf-mod")

            # TRANSF TEST Long Modification
            long_mod = [adv1, ",", adv2, ",", "and", adv4, verb, adv3, "."]
            add_pair(" ".join(subject + ["was"] + long_mod), 12,
                     " ".join(subject + long_mod), 11,
                     i, "transf-longmod")

    return test_items


# Lower-case verb tokens (tags VBD / VB) whose word form is also in the training vocabulary.
verbs = (
    data[data["XPOS"].isin(["VBD", "VB"])]
    .loc[lambda frame: frame["WORD"].str.islower()]
    .loc[lambda frame: frame["WORD"].isin(train_items)]
)

# Occurrence count per (word, lemma, tag); the count column is labelled 0.
v_counts = verbs.groupby(["WORD", "LEMMA", "XPOS"]).size().reset_index()

# Backslash-separated lexicon table without a header row; column names are assigned below.
celex = pd.read_csv("~/Documents/resources/data/celex2/english/esl/esl.cd", sep="\\", header=None)
# Trans_V = Can sometimes take a direct object
# Intrans_V = Can sometimes be intransitive
celex.columns = ["IdNum","word", "Cob", "ClassNum", "C_N", "Unc_N", "Sing_N", "Plu_N", "GrC_N",
                 "GrUnc_N", "Attr_N", "PostPos_N", "Voc_N","Proper_N", "Exp_N","Trans_V","TransComp_V",
                 "Intrans_V","Ditrans_V","Link_V","Phr_V","Prep_V","PhrPrep_V","Exp_V","Ord_A","Attr_A",
                 "Pred_A","PostPos_A","Exp_A","Ord_ADV","Pred_ADV","PostPos_ADV","Comb_ADV","Exp_ADV",
                 "Card_NUM","Ord_NUM","Exp_NUM","Pers_PRON","Dem_PRON","Poss_PRON","Refl_PRON","Wh_PRON",
                 "Det_PRON","Pron_PRON","Exp_PRON","Cor_C","Sub_C"]

# Attach the transitivity flags to each counted verb by matching its lemma.
# NOTE(review): a lemma with several rows in the lexicon would yield several merged rows -- confirm.
argstruct = celex[["word", "Trans_V", "Intrans_V",]]
v_counts = v_counts.merge(argstruct, left_on="LEMMA", right_on='word')

# Row masks for the three verb classes and the two tags.
takes_object = v_counts["Trans_V"] == "Y"
no_object = v_counts["Trans_V"] == "N"
can_be_intrans = v_counts["Intrans_V"] == "Y"
never_intrans = v_counts["Intrans_V"] == "N"
is_vbd = v_counts["XPOS"] == "VBD"
is_vb = v_counts["XPOS"] == "VB"

# Past-tense (VBD) verbs: transitive-only, intransitive-only, ambitransitive.
vbd_trans = sample_words(v_counts[takes_object & never_intrans & is_vbd])
vbd_intrans = sample_words(v_counts[no_object & can_be_intrans & is_vbd])
vbd_ambi = sample_words(v_counts[takes_object & can_be_intrans & is_vbd])

trans = [generate_argstruct_tests(bundle, False) for bundle in vbd_trans]
intrans = [generate_argstruct_tests(bundle, False) for bundle in vbd_intrans]
ambi = [generate_argstruct_tests(bundle, False) for bundle in vbd_ambi]

# Base-form (VB) verbs, same three classes, tested with the auxiliary template.
vb_trans = sample_words(v_counts[takes_object & never_intrans & is_vb])
vb_intrans = sample_words(v_counts[no_object & can_be_intrans & is_vb])
vb_ambi = sample_words(v_counts[takes_object & can_be_intrans & is_vb])

trans_pres = [generate_argstruct_tests(bundle, True) for bundle in vb_trans]
intrans_pres = [generate_argstruct_tests(bundle, True) for bundle in vb_intrans]
ambi_pres = [generate_argstruct_tests(bundle, True) for bundle in vb_ambi]

export_argstruct_test(trans + intrans + ambi + trans_pres + intrans_pres + ambi_pres, "argstruct-downsample")
