# Sandbox for new features (especially to avoid conflicts)

In [2]:
from nltk.corpus import wordnet
import pandas as pd

# FROM TERMINAL:
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy

In [3]:
def treebank_to_wn_tag(tb_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS letter.

    Only the first character of the tag is inspected:
    'N' -> 'n', 'V' -> 'v', 'J' -> 'a', 'R' -> 'r'.

    Args:
        tb_tag: Treebank tag string such as 'NN', 'VBZ' or 'JJ'.

    Returns:
        One of 'n', 'v', 'a', 'r', or None when the tag has no mapping
        (including an empty or missing tag).
    """
    tag_dict = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    # Guard first: indexing an empty string or None with [0] would raise.
    if not tb_tag:
        return None
    return tag_dict.get(tb_tag[0])

## If a candidate word isn't in the vocabulary, get some hypernyms
We should first check whether any of the word's synonyms are in the vocabulary!

In [15]:
def get_hypernyms(word, pos=None):
    """Collect the lemma names of the direct hypernyms of `word`.

    Every synset WordNet returns for `word` (optionally restricted by `pos`)
    is visited, and the lemma names of each of its hypernym synsets are
    gathered in visiting order. Duplicates are not removed. The synsets,
    hypernym synsets and the final list are printed as debugging output.

    Args:
        word: the word to look up.
        pos: optional WordNet POS filter passed straight to `wordnet.synsets`.

    Returns:
        List of hypernym lemma names; empty when the word has no synsets or
        none of its synsets has a hypernym.
    """
    # TODO: POS tagging when choosing a synset
    # TODO: find a better way to choose a synset... but there might not be that many for these words anyway
    # TODO: error handling - what do we do if a word doesn't have a synset?
    hypernym_words = []
    for syn in wordnet.synsets(word, pos):
        print("Synset: {}".format(syn))
        for h in syn.hypernyms():
            print("Hypernym synset: {}".format(h))
            hypernym_words.extend(lemma.name() for lemma in h.lemmas())

    print(hypernym_words)
    # Hand the result back so callers can use it instead of only seeing it printed.
    return hypernym_words
            
        

**'happy' (like 'voluptuous') has many synonyms, but no hypernyms**

In [16]:
# None of the synsets found for 'happy' yields a hypernym, so the printed list is empty.
get_hypernyms('happy')

Synset: Synset('happy.a.01')
Synset: Synset('felicitous.s.02')
Synset: Synset('glad.s.02')
Synset: Synset('happy.s.04')
[]


In [11]:
def get_synonyms(word, pos=None):
    """Return the lemma names of every synset WordNet lists for `word`.

    Each synset is printed, followed by one "- <lemma>" line per lemma.
    Names are returned in visiting order and duplicates are kept, so a name
    appears once for every synset that contains it. The list is empty when
    no synset matches `word` (and `pos`, if given).
    """
    def _lemma_names():
        # Lazily walk synsets -> lemmas, echoing each one as it is visited.
        for synset in wordnet.synsets(word, pos):
            print(synset)
            for lemma in synset.lemmas():
                print("-", lemma)
                yield lemma.name()

    return list(_lemma_names())

In [13]:
# Duplicates are kept: a name is listed once for every synset it belongs to.
get_synonyms("happy")

Synset('happy.a.01')
- Lemma('happy.a.01.happy')
Synset('felicitous.s.02')
- Lemma('felicitous.s.02.felicitous')
- Lemma('felicitous.s.02.happy')
Synset('glad.s.02')
- Lemma('glad.s.02.glad')
- Lemma('glad.s.02.happy')
Synset('happy.s.04')
- Lemma('happy.s.04.happy')
- Lemma('happy.s.04.well-chosen')


['happy', 'felicitous', 'happy', 'glad', 'happy', 'happy', 'well-chosen']

**With no POS tag, 'trifling' matches many synsets and hypernyms; when the correct POS tag is given, the result is much more accurate**

In [6]:
# Without a POS filter, the noun and verb synsets of 'trifling' contribute hypernyms.
get_hypernyms('trifling') 

print("\n\n")

# Restricting the lookup to 'a' leaves a single synset, which has no hypernyms.
get_hypernyms('trifling', 'a')

Synset: Synset('dalliance.n.01')
Hypernym synset: Synset('delay.n.02')
Synset: Synset('piddle.v.01')
Hypernym synset: Synset('spend.v.02')
Synset: Synset('frivol.v.01')
Hypernym synset: Synset('act.v.02')
Synset: Synset('dally.v.04')
Hypernym synset: Synset('consider.v.03')
Synset: Synset('negligible.s.02')
['delay', 'holdup', 'spend', 'expend', 'drop', 'act', 'behave', 'do', 'consider', 'take', 'deal', 'look_at']



Synset: Synset('negligible.s.02')
[]


## POS tagging, dependency parsing with spacy
[Documentation for dependency parsing](https://spacy.io/usage/linguistic-features#dependency-parse)

[Code example for dependency parsing](https://github.com/explosion/spacy/blob/master/examples/information_extraction/parse_subtrees.py)

In [7]:
# Load the 'en_core_web_sm' pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [14]:
# Parse a sample sentence and print, for every token: text, normalized form, lemma,
# POS label, fine-grained tag, dependency label, shape, and the is_alpha / is_stop flags.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for token in doc:
    print(token.text, token.norm_, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Apple apple apple PROPN NNP nsubj Xxxxx True False
is is be VERB VBZ aux xx True True
looking looking look VERB VBG ROOT xxxx True False
at at at ADP IN prep xx True True
buying buying buy VERB VBG pcomp xxxx True False
U.K. u.k. u.k. PROPN NNP compound X.X. False False
startup startup startup NOUN NN dobj xxxx True False
for for for ADP IN prep xxx True True
$ $ $ SYM $ quantmod $ False False
1 1 1 NUM CD compound d False False
billion billion billion NUM CD pobj xxxx True False


In [9]:
def get_spacy_doc(sentence, spacy_nlp=None):
    """Run a spaCy pipeline over `sentence` and return whatever it produces.

    Args:
        sentence: the text to process.
        spacy_nlp: callable pipeline to apply. When omitted (or None), the
            notebook-level `nlp` object is used.

    Returns:
        The result of calling the pipeline on `sentence`.
    """
    if spacy_nlp is None:
        # Resolve the default at call time, not definition time, so that
        # re-running the cell that loads `nlp` is picked up here as well.
        spacy_nlp = nlp
    return spacy_nlp(sentence)

def get_pos_of_word(spacy_doc, token_index):
    """Return the `pos_` attribute of the token at `token_index` in `spacy_doc`.

    NOTE: because of tokenization, the token index MIGHT NOT equal the
    position of the word in the original sentence.
    """
    selected_token = spacy_doc[token_index]
    return selected_token.pos_

def get_pos_of_sentence(sentence):
    """Return the `pos_` value of every token in `sentence`, in token order.

    The sentence is parsed with `get_spacy_doc` using its default pipeline.

    NOTE: positions in the returned list are token indices, which MIGHT NOT
    match word positions in the original sentence (tokenization).

    Args:
        sentence: the text to tag.

    Returns:
        List with one POS label per token; empty if parsing yields no tokens.
    """
    doc = get_spacy_doc(sentence)
    return [token.pos_ for token in doc]
        
        


def get_ancestors_of_word(spacy_doc, token_index):
    """List the ancestors of the token at `token_index` in `spacy_doc`.

    NOTE: because of tokenization, the token index MIGHT NOT equal the
    position of the word in the original sentence.
    NOTE: the list holds token OBJECTS, not just their text.
    """
    return list(spacy_doc[token_index].ancestors)


def get_children_of_word(spacy_doc, token_index):
    """List the children of the token at `token_index` in `spacy_doc`.

    NOTE: because of tokenization, the token index MIGHT NOT equal the
    position of the word in the original sentence.
    NOTE: the list holds token OBJECTS, not just their text.
    """
    return list(spacy_doc[token_index].children)
        

In [10]:
# Tokenize a fill-in-the-blank sentence and list each token with its index.
# The blank "_____" comes out as five separate "_" tokens (see output), which
# shifts the index of every token after it.
sentence = "Even though they only had two days left, the _____ weren't worried at all about their assignment."
doc = get_spacy_doc(sentence, nlp)
for i, token in enumerate(doc):
    print(i, token)

0 Even
1 though
2 they
3 only
4 had
5 two
6 days
7 left
8 ,
9 the
10 _
11 _
12 _
13 _
14 _
15 were
16 n't
17 worried
18 at
19 all
20 about
21 their
22 assignment
23 .


In [10]:
# Token 4 is "had" in the enumeration printed above; show its POS, ancestors and children.
token_index = 4
print(get_pos_of_word(doc, token_index))
print(get_ancestors_of_word(doc, token_index))
print(get_children_of_word(doc, token_index))

VERB
[were]
[Even, though, they, only, left]


In [11]:
from spacy import displacy

# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

## Stopword removal
The Woods paper mentions (for the MSR task) removing determiners, coordinating conjunctions, pronouns, and proper nouns for some (?) feature sets (it's kind of confusing what their actual feature sets are for the PMI model)

In [3]:
# Read the gzip-compressed CSV, using its first column as the row index.
df1 = pd.read_csv('combined.gzip', index_col=0, compression='gzip')

In [5]:
# Display the loaded frame.
df1

Unnamed: 0,0,1,1-0,1-1,10,10-0,10-1,10-2,10-2-1,10-7,...,zeitgeist,zero,zigzagging,zivakovic,zolotic,zone,zoo,zoos,zucchini,zupanja
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10-0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10-2-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10-7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# The file name has no extension to infer compression from, so 'gzip' is passed explicitly.
# The first column becomes the row index.
df2 = pd.read_csv('sample_coo1', index_col=0, compression='gzip')
df2

In [9]:
# Same loading pattern as above for the second sample file: explicit gzip
# compression, first column as the row index.
df3 = pd.read_csv('sample_coo2', index_col=0, compression='gzip')
df3

Unnamed: 0,the,of,gonna,and,to,in,he,that,was,for,...,profit-making,entity,restore,involve,fees,recordings,pensions,length,92nd,events
the,317,466,216,275,292,346,124,128,158,125,...,1,1,2,0,1,1,3,3,0,1
of,466,46,174,134,66,106,45,42,40,43,...,0,0,1,0,0,0,1,1,1,1
gonna,216,174,48,94,101,111,49,62,67,65,...,1,1,0,0,0,0,0,0,1,1
and,275,134,94,26,97,100,39,27,39,43,...,1,0,0,0,0,0,1,1,0,0
to,292,66,101,97,29,70,51,35,44,41,...,0,0,2,0,0,0,0,0,0,0
in,346,106,111,100,70,39,46,53,51,38,...,0,0,0,0,0,0,0,0,0,1
he,124,45,49,39,51,46,11,26,54,14,...,0,0,0,0,0,0,0,0,0,0
that,128,42,62,27,35,53,26,4,21,13,...,0,0,0,0,0,0,0,0,0,0
was,158,40,67,39,44,51,54,21,5,11,...,0,0,0,0,0,0,0,0,0,0
for,125,43,65,43,41,38,14,13,11,7,...,0,0,0,1,1,1,0,0,0,0
