# Data to use

* excess-n: smallest noun (n) with more than 1 class and more than 100 training observations
* float-v: smallest verb (v) with more than 1 class and more than 100 training observations
* brilliant-a: largest adjective (a)
* accident-n: largest noun (n)
* promise-v: largest verb (v)

## Assumptions:
* Only grab POS and PARSE features for the first occurrence of the WOI - fine in most cases, but not all

In [1]:
import pandas as pd
import sys
import time
from IPython.display import clear_output
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tree import ParentedTree
from __future__ import division
from nltk.parse.stanford import StanfordParser
# Module-level Stanford parser handle; get_trees() calls english_parser.parse_sents().
# NOTE: both jar paths are absolute, machine-specific paths and must be edited
# to match the local Stanford parser install before this cell will run elsewhere.
# '-mx8000m' is forwarded as the Java options (presumably an ~8 GB heap limit -- confirm).
english_parser = StanfordParser("/Users/tylerfolkman/stanford-parser-full-2015-01-30/stanford-parser.jar",
                                "/Users/tylerfolkman/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
                               java_options='-mx8000m')

In [2]:
# Read the cleaned train/test frames. The code below reads the 'lextype',
# 'context', 'woi' and 'senseid' columns from both of them.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
train = pd.read_pickle("../data/clean/train_df.pkl")
test = pd.read_pickle("../data/clean/test_df.pkl")

In [3]:
# Number of training rows per lextype value (used to pick which lextypes to process).
train.lextype.value_counts()

accident-n     1234
promise-v      1163
behaviour-n     994
promise-n       589
shirt-n         531
brilliant-a     441
knee-n          417
slight-a        380
modest-a        374
wooden-a        362
giant-n         343
giant-a         316
generous-a      307
seize-v         288
bother-v        282
bury-v          272
promise-a       262
derive-v        259
calculate-v     218
sack-v          187
float-v         183
amaze-a         183
excess-n        178
amaze-v         133
sack-n           99
excess-a         73
float-n          61
invade-v         46
floating-a       41
calculate-a      31
scrap-v          30
scrap-n          27
onion-n          26
knee-a           16
bother-n         12
consume-a        11
invade-a          8
slight-n          5
seize-a           4
brilliant-n       2
knee-v            2
shirt-a           2
dtype: int64

In [4]:
# Spot check: select the training rows for a single lextype and show (rows, columns).
t = train[train.lextype == 'promise-v']
t.shape

(1163, 5)

In [3]:
# Repeat of the per-lextype training row count shown earlier in the notebook.
train.lextype.value_counts()

accident-n     1234
promise-v      1163
behaviour-n     994
promise-n       589
shirt-n         531
brilliant-a     441
knee-n          417
slight-a        380
modest-a        374
wooden-a        362
giant-n         343
giant-a         316
generous-a      307
seize-v         288
bother-v        282
bury-v          272
promise-a       262
derive-v        259
calculate-v     218
sack-v          187
float-v         183
amaze-a         183
excess-n        178
amaze-v         133
sack-n           99
excess-a         73
float-n          61
invade-v         46
floating-a       41
calculate-a      31
scrap-v          30
scrap-n          27
onion-n          26
knee-a           16
bother-n         12
consume-a        11
invade-a          8
slight-n          5
seize-a           4
brilliant-n       2
knee-v            2
shirt-a           2
dtype: int64

In [4]:
#functions
def find_woi_sent(context, woi):
    """Return the first element of ``context`` that contains ``woi``.

    ``context`` is scanned in order and containment is tested with ``in``,
    so for string sentences this is a substring match. ``None`` is returned
    when nothing matches.
    """
    matching = (sentence for sentence in context if woi in sentence)
    return next(matching, None)

def get_trees(context, woi):
    """Parse, for each context, the first sentence that contains its woi.

    Parameters
    ----------
    context : sequence of str
        One text per example; each is split into sentences with
        ``sent_tokenize``.
    woi : sequence of str
        Entry ``i`` is the word looked for in ``context[i]``. Must have the
        same length as ``context`` (asserted).

    Returns
    -------
    The result of ``ParentedTree.convert`` applied to the output of
    ``english_parser.parse_sents`` for the selected, tokenized sentences.

    Notes
    -----
    ``find_woi_sent`` yields ``None`` when no sentence contains the woi; that
    ``None`` is handed to the tokenizer unchanged.
    """
    assert len(context) == len(woi)

    # Sentence-split every context, then keep only the first sentence that
    # mentions the paired woi.
    sentences = [sent_tokenize(text) for text in context]
    single_sent = [find_woi_sent(sents, word) for sents, word in zip(sentences, woi)]
    assert len(sentences) == len(single_sent)

    # Tokens are runs of word characters / apostrophes / hyphens, or runs of
    # any other non-space, non-word characters.
    tokenizer = RegexpTokenizer(r"[\w'-]+|[^\w\s]+")
    tokenized_context = tokenizer.tokenize_sents(single_sent)

    parsed = english_parser.parse_sents(tokenized_context)
    return ParentedTree.convert(parsed)

def get_bigrams(values, ngram_range=(1, 2)):
    """Return a dense n-gram count matrix for ``values``.

    The previous hard-coded ``ngram_range=(2, 1)`` had ``min_n > max_n``, so
    no ``n`` satisfied ``min_n <= n <= max_n``: it never yielded bigram
    columns, and recent scikit-learn releases reject such a range outright.

    Parameters
    ----------
    values : iterable of str
        Documents to vectorize; tokenized with ``word_tokenize``.
    ngram_range : tuple (min_n, max_n), optional
        Inclusive range of n-gram sizes to count. The default ``(1, 2)``
        counts unigrams and bigrams; pass ``(2, 2)`` for bigrams only.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per n-gram.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=ngram_range, tokenizer=word_tokenize)
    return bigram_vectorizer.fit_transform(values).toarray()
def get_pos(pos_list, woi):
    """Return the POS tags in a five-token window centred on the woi.

    Parameters
    ----------
    pos_list : sequence of (token, tag) pairs
        Tagged tokens of one sentence, e.g. the output of ``tree.pos()``.
    woi : str
        Word to locate; the first token containing it (substring match) is
        used as the window centre.

    Returns
    -------
    list of str
        Tags at offsets -2, -1, 0, +1, +2 from the woi. Positions that fall
        outside the sentence are reported as ``"na"``; if the woi is not
        found at all, every entry is ``"na"``.
    """
    window = range(-2, 3)
    woi_index = find_woi_pos(pos_list, woi)
    if woi_index is None:
        return ["na" for _ in window]

    pos_set = []
    for offset in window:
        position = woi_index + offset
        # Explicit bounds check: a negative index would not raise, it would
        # silently wrap around and pick tags from the end of the sentence.
        if 0 <= position < len(pos_list):
            pos_set.append(pos_list[position][1])
        else:
            pos_set.append("na")
    return pos_set


def find_woi_pos(list_tuples, woi):
    """Return the index of the first pair whose first item contains ``woi``.

    Containment is tested with ``in`` (substring match for string tokens).
    Returns ``None`` when no pair matches.
    """
    for i, pair in enumerate(list_tuples):
        if woi in pair[0]:
            return i
    return None
def find_woi_list(listin, woi):
    """Return the index of the first element of ``listin`` that contains ``woi``.

    Containment is tested with ``in`` (substring match for string leaves).
    Returns ``None`` when nothing matches.
    """
    for i, leaf in enumerate(listin):
        if woi in leaf:
            return i
    return None


def _node_features(node):
    """Return ``(first_leaf, label)`` for a tree node, or ``('na', 'na')`` if unavailable."""
    try:
        return node.leaves()[0], node.label()
    except (AttributeError, IndexError):
        # Not a tree node (no label()/leaves()) or a node without leaves.
        return 'na', 'na'


def get_parse(tagger, woi):
    """Describe the two constituents above the woi's tag node in a parse tree.

    The woi is located among ``tagger.leaves()`` (first leaf containing it)
    and the path from the root to that leaf is walked. Two nodes on the path
    are reported:

    * "head": the node two levels above the leaf, i.e. the parent of the
      leaf's immediate parent;
    * "parent head": the node one level above that.

    The root itself is never reported. Despite the names, no head-finding
    rules are applied: the reported "word" is simply the left-most leaf under
    the node.

    Parameters
    ----------
    tagger : tree object
        Must provide ``leaves()``, ``leaf_treeposition(i)``, integer indexing
        and ``label()`` on its sub-nodes (as used below).
    woi : str
        Word to locate in the leaves.

    Returns
    -------
    list of str
        ``[head_word, head_pos, parent_head_word, parent_head_pos]``. Any
        item that does not exist (tree too shallow, woi not found) is
        ``'na'``.
    """
    head_word = head_pos = 'na'
    parent_head_word = parent_head_pos = 'na'

    index_woi = find_woi_list(tagger.leaves(), woi)
    if index_woi is None:
        # The woi is not among the leaves; there is no path to walk.
        return [head_word, head_pos, parent_head_word, parent_head_pos]

    loc_woi = tagger.leaf_treeposition(index_woi)
    depth = len(loc_woi)

    # After step i the cursor sits i + 1 levels below the root, so step
    # depth - 3 lands two levels above the leaf and step depth - 4 three
    # levels above it. For shallow trees those steps never occur and the
    # 'na' defaults are kept.
    node = tagger
    for i, loc in enumerate(loc_woi):
        node = node[loc]
        if i == depth - 4:
            parent_head_word, parent_head_pos = _node_features(node)
        elif i == depth - 3:
            head_word, head_pos = _node_features(node)

    return [head_word, head_pos, parent_head_word, parent_head_pos]
def get_lex_features(values, woi):
    """Build one-hot POS-window and parse features for every example.

    Parameters
    ----------
    values : sequence of str
        Context texts, passed to ``get_trees``.
    woi : sequence of str
        ``woi[i]`` is the word looked up in the i-th parse tree.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``pd.get_dummies(...).values`` of the POS-window frame (columns
        before_before / before / target / after / after_after) and of the
        parse frame (columns head / head_pos / parent_head /
        parent_head_pos), one row per tree.
    """
    pos_columns = ['before_before', 'before', 'target', 'after', 'after_after']
    parse_columns = ['head', 'head_pos', 'parent_head', 'parent_head_pos']

    trees = get_trees(values, woi)
    n_trees = len(trees)

    # Collect plain rows and build each frame once at the end; growing a
    # DataFrame with ``df.loc[i] = row`` inside the loop re-allocates on
    # every insert.
    pos_rows = []
    parse_rows = []
    for i, tree in enumerate(trees):

        # Overwrite the previous progress line in the notebook output.
        clear_output()
        print ("Getting POS and Parse Features: {0} / {1}".format(i, n_trees-1))
        sys.stdout.flush()

        pos_rows.append(get_pos(tree.pos(), woi[i]))
        parse_rows.append(get_parse(tree, woi[i]))

    pos_df = pd.DataFrame(pos_rows, columns=pos_columns)
    parse_df = pd.DataFrame(parse_rows, columns=parse_columns)
    return pd.get_dummies(pos_df).values, pd.get_dummies(parse_df).values
def get_features(values, woi):
    """Compute the three feature blocks for a set of contexts.

    Prints a progress message before each stage and returns the tuple
    ``(bigrams, pos, parse)``: the output of ``get_bigrams(values)`` followed
    by the two arrays from ``get_lex_features(values, woi)``.
    """
    print("Getting Bigram Features...")
    sys.stdout.flush()
    bigram_matrix = get_bigrams(values)

    print("Getting POS and Parse Features...")
    sys.stdout.flush()
    pos_matrix, parse_matrix = get_lex_features(values, woi)

    return bigram_matrix, pos_matrix, parse_matrix
def create_data_lex(lex, train=train, test=test,
                    final_dir = "/Users/tylerfolkman/GradSchool/Spring2015/NLP/project/clean/data/test_train",
                    pieces_dir = "/Users/tylerfolkman/GradSchool/Spring2015/NLP/project/clean/data/test_train/pieces"):
    """Build and save train/test feature matrices for one lextype.

    Rows of ``train`` and ``test`` whose ``lextype`` equals ``lex`` are
    selected, their contexts are featurized together (so both splits share
    the same feature columns) and the result is split back and written with
    ``np.save``.

    Parameters
    ----------
    lex : str
        Value of the ``lextype`` column to process, e.g. ``'excess-n'``.
    train, test : pandas.DataFrame
        Frames with ``lextype``, ``context``, ``woi`` and ``senseid`` columns.
        The defaults are bound to the notebook globals when this function is
        defined.
    final_dir : str
        Directory receiving ``<lex>_{train,test}_{X,target}``.
        NOTE: the default is a machine-specific absolute path.
    pieces_dir : str
        Directory receiving the individual ``bigrams`` / ``pos`` / ``parse``
        blocks for each split. NOTE: machine-specific default as well.

    Raises
    ------
    ValueError
        If neither ``train`` nor ``test`` has a row for ``lex``.
    """
    import os  # local import: ``os`` is not part of the notebook's import cell

    print("Processing {}".format(lex))
    sys.stdout.flush()
    train_set = train[train.lextype == lex]
    test_set = test[test.lextype == lex]
    split_point = train_set.shape[0]

    # Fail fast, before the slow parsing step, on inputs that cannot succeed.
    if split_point + test_set.shape[0] == 0:
        raise ValueError("No train or test rows found for lextype {!r}".format(lex))
    for out_dir in (final_dir, pieces_dir):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    all_context = np.concatenate([train_set['context'].values, test_set['context'].values], axis=0)
    all_woi = np.concatenate([train_set['woi'].values, test_set['woi'].values], axis=0)

    bigrams, pos, parse = get_features(all_context, all_woi)
    print("Compiling Data...")
    sys.stdout.flush()
    # The first ``split_point`` rows came from train, the remainder from test.
    train_bigrams = bigrams[:split_point]
    train_pos = pos[:split_point]
    train_parse = parse[:split_point]
    train_target = train_set['senseid'].values
    test_bigrams = bigrams[split_point:]
    test_pos = pos[split_point:]
    test_parse = parse[split_point:]
    test_target = test_set['senseid'].values
    train_X = np.concatenate([train_bigrams, train_pos, train_parse], axis=1)
    test_X = np.concatenate([test_bigrams, test_pos, test_parse], axis=1)

    print("Writing to disk...")
    sys.stdout.flush()
    #save final
    final_arrays = [
        ("train_X", train_X),
        ("train_target", train_target),
        ("test_X", test_X),
        ("test_target", test_target),
    ]
    for suffix, array in final_arrays:
        np.save("{0}/{1}_{2}".format(final_dir, lex, suffix), array)

    #save pieces
    piece_arrays = [
        ("train_bigrams", train_bigrams),
        ("train_pos", train_pos),
        ("train_parse", train_parse),
        ("test_bigrams", test_bigrams),
        ("test_pos", test_pos),
        ("test_parse", test_parse),
    ]
    for suffix, array in piece_arrays:
        np.save("{0}/{1}_{2}".format(pieces_dir, lex, suffix), array)

    print("Finished!")

In [7]:
# Build and save the train/test feature matrices for 'excess-n'.
create_data_lex('excess-n')

Getting POS and Parse Features: 363 / 363
Compiling Data...
Writing to disk...
Finished!


In [8]:
# Build and save the train/test feature matrices for 'float-v'.
create_data_lex('float-v')

Getting POS and Parse Features: 411 / 411
Compiling Data...
Writing to disk...
Finished!


In [9]:
# Build and save the train/test feature matrices for 'brilliant-a'.
create_data_lex('brilliant-a')

Getting POS and Parse Features: 669 / 669
Compiling Data...
Writing to disk...
Finished!


In [10]:
# Build and save the train/test feature matrices for 'accident-n'.
create_data_lex('accident-n')

Getting POS and Parse Features: 1500 / 1500
Compiling Data...
Writing to disk...
Finished!


In [11]:
# Build and save the train/test feature matrices for 'promise-v'.
create_data_lex('promise-v')

Getting POS and Parse Features: 1386 / 1386
Compiling Data...
Writing to disk...
Finished!
