In [1]:
import time
import os
import pickle
import numpy as np
from collections import OrderedDict
from collections import namedtuple
from random import shuffle
import copy
from itertools import combinations, combinations_with_replacement
from utils.utils import ROOT, dep_sample_generator
from typing import List, Dict


In [2]:
def generate_features_dict(path_to_file, sample2features, start_idx, feature_threshold=0, save_to_file=False,
                           features_name=''):
    """
    This function generates a features dictionary, such that for every features, an index is given starting from start
    the given start_idx.
    The following features are generated for a given dataset:
    byt the given sample2features which is a lambda exprestion to extract a feature from a sample
    you can think of sample2features as a template
    :param: path_to_file: path to location of the dataset (str)
    :param: feature_threshold: if to consider a feature with word that appears less than that in the dataset (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hw_hp_feat_dict: dictionary feature->index (dict)
    """

    samp_gen = dep_sample_generator(path_to_file)
    hw_hp_feat_dict = OrderedDict()
    features_hist = OrderedDict()

    for s_i, sample in enumerate(samp_gen):
        for s in sample:
            # ignore ROOT
            if s.token == ROOT:
                continue
            

            feats = sample2features(sample, s)
            for feat in feats:
                features_hist[feat] = features_hist.get(feat, 0) + 1

    current_idx = start_idx
    for k, v in features_hist.items():
        if v > feature_threshold:
            hw_hp_feat_dict[k] = current_idx
            current_idx += 1

#     print("total {:} features: ".format(features_name), current_idx)
    hw_hp_feat_dict = OrderedDict(sorted(hw_hp_feat_dict.items(), key=lambda t: t[1]))
    if save_to_file:
        path = path_to_file + features_name + '.dict'
        with open(path, 'wb') as fp:
            pickle.dump(hw_hp_feat_dict, fp)
        print("saved {:} features dictionary @ ".format(features_name), path)
    return hw_hp_feat_dict

In [17]:
def generate_features_dicts(path_to_file: str, save_to_file: bool =False, minimal: bool=False
                            , use_mcdonald: bool=False)->Dict[str, dict]:
    """
    given a training file we return a dictionary of dictionaries
    where key is feature type name, and value is a dictionary of that feature generated by 'generate_features_dict'
    according to feature templates required by hw2 pg 2, those templates are:
    head word _ head pos
    head word
    head pos
    child word _ child pos
    child word
    child pos
    h_pos c_word c_pos

    if not minimal we add:
        h_word h_pos c_word c_pos
        h_word c_word c_pos
        h_word h_pos c_word
        h_word c_word
        
    if use mcdonald:
        distance(head,child)
        # is head to the right or left of the child:
        1 if head.idx < child.idx
        # in-between POS features:
        (h_pos, b_pos, c_pos)
        # surrounding word POS features:
        (h_pos, h+1_pos, c-1_pos, c_pos)
        (h-1_pos, h_pos, c-1_pos, c_pos)
        (h_pos, h+1_pos, c_pos, c+1_pos)
        (h-1_pos, h_pos, c_pos, c+1_pos)
    :param path_to_file: training file to extract features from
    :param save_to_file: if to save the dictionary
    :param minimal: if to add the extra features as described
    :return: dictionary described above
    """

    feature_types = {'head word _ head pos': (lambda sample, s: [(sample[s.head].token, sample[s.head].pos)], 0),
                     'head word': (lambda sample, s: [(sample[s.head].token)], 0),
                     'head pos': (lambda sample, s: [(sample[s.head].pos)], 0),
                     'child word _ child pos': (lambda sample, s: [(s.token, s.pos)], 0),
                     'child word': (lambda sample, s: [(s.token)], 0),
                     'child pos': (lambda sample, s: [(s.pos)], 0),
                     'h_pos c_word c_pos': (lambda sample, s: [(sample[s.head].pos, s.token, s.pos)], 0),
                     'h_word h_pos c_pos': (lambda sample, s: [(sample[s.head].token, sample[s.head].pos, s.pos)], 0),
                     'h_pos c_pos': (lambda sample, s: [(sample[s.head].pos, s.pos)], 0),
                     }

    if not minimal:

        feature_types['h_word h_pos c_word c_pos'] = (lambda sample, s:
                                                      [(sample[s.head].token, sample[s.head].pos, s.token, s.pos)], 0)

        feature_types['h_word c_word c_pos'] = (lambda sample, s:
                                                [(sample[s.head].token, s.token, s.pos)], 0)

        feature_types['h_word h_pos c_word'] = (lambda sample, s:
                                                [(sample[s.head].token, sample[s.head].pos, s.token)], 0)

        feature_types['h_word c_word'] = (lambda sample, s:
                                          [(sample[s.head].token, s.token)], 0)
        
    if use_mcdonald:
        # distance + is head to the right of child? (-) if child > head, (+) else
        # maybe we can add the distance to the unigrams?
        feature_types['h_word c_word dist'] = (lambda sample, s:
                                                      [(sample[s.head].token, s.token, sample[s.head].idx - s.idx)], 0)
        # in-between POS features:
        feature_types['h_c_pos_seq'] = (lambda sample, s:
                                                      [tuple(l.pos for l in sample[sample[s.head].idx : s.idx + 1])], 0)
        # surrounding word POS features
        feature_types['h_pos h_next_pos c_prev_pos c_pos'] = (lambda sample, s:
                                                              [(sample[s.head].pos,
                                                                sample[min(s.head + 1, sample[-1].idx)].pos,
                                                                sample[max(s.idx - 1, 0)].pos,
                                                                s.pos)], 0)
        
        feature_types['h_prev_pos h_pos c_prev_pos c_pos'] = (lambda sample, s:
                                                              [(sample[max(s.head - 1, 0)].pos,
                                                                sample[s.head].pos,
                                                                sample[max(s.idx - 1, 0)].pos,
                                                                s.pos)], 0)
        
        feature_types['h_pos h_next_pos c_pos c_next_pos'] = (lambda sample, s:
                                                              [(sample[s.head].pos,
                                                                sample[min(s.head + 1, sample[-1].idx)].pos,
                                                                s.pos,
                                                                sample[min(s.idx + 1, sample[-1].idx)].pos)], 0)
        
        feature_types['h_prev_pos h_pos c_pos c_next_pos'] = (lambda sample, s:
                                                              [(sample[max(s.head - 1, 0)].pos,
                                                                sample[s.head].pos,
                                                                s.pos,
                                                                sample[min(s.idx + 1, sample[-1].idx)].pos)], 0)
    features_dicts = {}
    current_num_features = 0
    for feature_type_name, (feature_template, feature_threshold) in feature_types.items():

        features_dicts[feature_type_name] = generate_features_dict(path_to_file,
                                                                   feature_template,
                                                                   start_idx=current_num_features,
                                                                   feature_threshold=feature_threshold,
                                                                   save_to_file=False,
                                                                   features_name=feature_type_name)

        num_features = len(features_dicts[feature_type_name])
        current_num_features += num_features

        print('generated {:} features , num features: {:}, total num features: {:}'.format(feature_type_name,
                                                                                         num_features,
                                                                                         current_num_features))

    return features_dicts


In [18]:
dictionaries = generate_features_dicts('./data/train.labeled.train.labeled', use_mcdonald=True)

generated h_pos c_word c_pos features , num features: 26559, total num features: 26559
generated h_word h_pos c_word c_pos features , num features: 58981, total num features: 85540
generated h_word h_pos c_word features , num features: 58614, total num features: 144154
generated h_pos h_next_pos c_pos c_next_pos features , num features: 16465, total num features: 160619
generated head pos features , num features: 36, total num features: 160655
generated child word _ child pos features , num features: 13641, total num features: 174296
generated h_c_pos_seq features , num features: 17414, total num features: 191710
generated head word _ head pos features , num features: 8595, total num features: 200305
generated head word features , num features: 7669, total num features: 207974
generated h_pos c_pos features , num features: 711, total num features: 208685
generated h_pos h_next_pos c_prev_pos c_pos features , num features: 15998, total num features: 224683
generated h_word c_word dist f

In [19]:
dictionaries.keys()

dict_keys(['h_pos c_word c_pos', 'h_word h_pos c_word c_pos', 'h_word h_pos c_word', 'h_pos h_next_pos c_pos c_next_pos', 'head pos', 'child word _ child pos', 'h_c_pos_seq', 'head word _ head pos', 'head word', 'h_pos c_pos', 'h_pos h_next_pos c_prev_pos c_pos', 'h_word c_word dist', 'child word', 'h_prev_pos h_pos c_pos c_next_pos', 'h_word c_word', 'child pos', 'h_word c_word c_pos', 'h_prev_pos h_pos c_prev_pos c_pos', 'h_word h_pos c_pos'])

In [20]:
sample = next(dep_sample_generator('./data/train.labeled.train.labeled'))
sample
child = sample[6]
head = sample[child.head]

In [8]:
dict2check = 'child word _ child pos'

In [9]:
a = dictionaries['h_word c_word'].get((head.token, child.token))
a

187029

In [22]:
dictionaries['h_c_pos_seq']

OrderedDict([((), 174296),
             (('NNP', ','), 174297),
             (('NNP', ',', 'CD', 'NNS', 'JJ'), 174298),
             (('NNP', ',', 'CD', 'NNS', 'JJ', ','), 174299),
             (('*', 'NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD'), 174300),
             (('MD', 'VB'), 174301),
             (('VB', 'DT', 'NN'), 174302),
             (('VB', 'DT', 'NN', 'IN'), 174303),
             (('IN', 'DT', 'JJ', 'NN'), 174304),
             (('VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP'), 174305),
             (('NNP', 'CD'), 174306),
             (('MD',
               'VB',
               'DT',
               'NN',
               'IN',
               'DT',
               'JJ',
               'NN',
               'NNP',
               'CD',
               '.'),
              174307),
             (('*', 'NNP', 'NNP', 'VBZ'), 174308),
             (('VBZ', 'NN'), 174309),
             (('NN', 'IN'), 174310),
             (('IN', 'NNP', 'NNP'), 174311),
             (('NNP', ',

In [11]:
print(head, child, a)

DepSample(idx=2, token='Vinken', pos='NNP', head=8) DepSample(idx=6, token='old', pos='JJ', head=2) 187029
