In [1]:
import time
import os
import pickle
import numpy as np
from collections import OrderedDict
from collections import namedtuple
from random import shuffle
import copy
from itertools import combinations, combinations_with_replacement
from utils.utils import ROOT, dep_sample_generator
from typing import List, Dict


In [19]:
def generate_features_dict(path_to_file, sample2features, start_idx, feature_threshold=0, save_to_file=False,
                           features_name=''):
    """
    Build a feature -> index dictionary for a single feature template.

    The dataset is read once with ``dep_sample_generator``. For every entry ``s`` of
    every sample (entries whose token equals ROOT are skipped) the template
    ``sample2features(sample, s)`` is evaluated and each feature it returns is
    counted. Features that occur more than ``feature_threshold`` times receive
    consecutive indices ``start_idx, start_idx + 1, ...`` in order of first
    appearance in the dataset.

    :param path_to_file: path to location of the dataset (str)
    :param sample2features: feature template; a callable ``(sample, s)`` returning an
                            iterable of hashable features for entry ``s`` of ``sample``
    :param start_idx: index given to the first kept feature (int)
    :param feature_threshold: a feature is kept only if its count in the dataset is
                              strictly greater than this value (int)
    :param save_to_file: whether or not to pickle the dictionary to
                         ``path_to_file + features_name + '.dict'`` (bool)
    :param features_name: name of the feature type; used in the saved file name and
                          in the message printed after saving (str)
    :return: hw_hp_feat_dict: ordered dictionary feature -> index (OrderedDict)
    """
    # Pass over the data: count how often each feature is produced by the template.
    features_hist = OrderedDict()
    for sample in dep_sample_generator(path_to_file):
        for s in sample:
            # ignore ROOT
            if s.token == ROOT:
                continue
            for feat in sample2features(sample, s):
                features_hist[feat] = features_hist.get(feat, 0) + 1

    # Hand out consecutive indices to the features that pass the threshold.
    # Indices grow with insertion order, so the result is already sorted by index
    # and no extra sort is required.
    hw_hp_feat_dict = OrderedDict()
    current_idx = start_idx
    for feat, count in features_hist.items():
        if count > feature_threshold:
            hw_hp_feat_dict[feat] = current_idx
            current_idx += 1

    if save_to_file:
        path = path_to_file + features_name + '.dict'
        with open(path, 'wb') as fp:
            pickle.dump(hw_hp_feat_dict, fp)
        print("saved {:} features dictionary @ ".format(features_name), path)
    return hw_hp_feat_dict

In [68]:
def generate_features_dicts(path_to_file: str, save_to_file: bool = False, minimal: bool = False) -> Dict[str, dict]:
    """
    given a training file we return a dictionary of dictionaries
    where key is feature type name, and value is a dictionary of that feature generated by 'generate_features_dict'.
    Indices are globally unique: each feature type starts where the previous one ended.

    The templates that are always generated are:
        head word _ head pos
        head word
        head pos
        child word _ child pos
        child word
        child pos
        h_pos c_word c_pos
        h_word h_pos c_pos
        h_pos c_pos

    if not minimal we add:
        h_word h_pos c_word c_pos
        h_word c_word c_pos
        h_word h_pos c_word
        h_word c_word

    Multi-attribute templates produce tuples as features; the single-attribute
    templates (head word, head pos, child word, child pos) produce the bare
    attribute value.

    :param path_to_file: training file to extract features from
    :param save_to_file: if True, every per-type dictionary is also saved to disk by
                         'generate_features_dict' (as path_to_file + feature type name + '.dict')
    :param minimal: if True only the always-generated templates are used;
                    if False the extra templates listed above are added
    :return: dictionary described above
    """

    # feature type name -> (template, threshold). The insertion order matters:
    # it determines the index range assigned to each feature type.
    feature_types = {'head word _ head pos': (lambda sample, s: [(sample[s.head].token, sample[s.head].pos)], 0),
                     'head word': (lambda sample, s: [sample[s.head].token], 0),
                     'head pos': (lambda sample, s: [sample[s.head].pos], 0),
                     'child word _ child pos': (lambda sample, s: [(s.token, s.pos)], 0),
                     'child word': (lambda sample, s: [s.token], 0),
                     'child pos': (lambda sample, s: [s.pos], 0),
                     'h_pos c_word c_pos': (lambda sample, s: [(sample[s.head].pos, s.token, s.pos)], 0),
                     'h_word h_pos c_pos': (lambda sample, s: [(sample[s.head].token, sample[s.head].pos, s.pos)], 0),
                     'h_pos c_pos': (lambda sample, s: [(sample[s.head].pos, s.pos)], 0),
                     }

    if not minimal:

        feature_types['h_word h_pos c_word c_pos'] = (lambda sample, s:
                                                      [(sample[s.head].token, sample[s.head].pos, s.token, s.pos)], 0)

        feature_types['h_word c_word c_pos'] = (lambda sample, s:
                                                [(sample[s.head].token, s.token, s.pos)], 0)

        feature_types['h_word h_pos c_word'] = (lambda sample, s:
                                                [(sample[s.head].token, sample[s.head].pos, s.token)], 0)

        feature_types['h_word c_word'] = (lambda sample, s:
                                          [(sample[s.head].token, s.token)], 0)

    features_dicts = {}
    current_num_features = 0
    for feature_type_name, (feature_template, feature_threshold) in feature_types.items():

        # save_to_file is forwarded so the documented flag actually takes effect
        # (it used to be hard-coded to False here).
        features_dicts[feature_type_name] = generate_features_dict(path_to_file,
                                                                   feature_template,
                                                                   start_idx=current_num_features,
                                                                   feature_threshold=feature_threshold,
                                                                   save_to_file=save_to_file,
                                                                   features_name=feature_type_name)

        # The next feature type starts right after the last index used by this one.
        num_features = len(features_dicts[feature_type_name])
        current_num_features += num_features

        print('\ngenerated {:} features , num features {:} , total num_features {:}'.format(feature_type_name,
                                                                                         num_features,
                                                                                         current_num_features))

    return features_dicts


In [69]:
# Build every feature dictionary from the training file. `minimal` is left at its
# default (False), so the extended templates are generated as well.
dictionaries = generate_features_dicts('./data/train.labeled.train.labeled')


generated head word _ head pos features , num features 8594 , total num_features 8594

generated head word features , num features 7668 , total num_features 16262

generated head pos features , num features 35 , total num_features 16297

generated child word _ child pos features , num features 13194 , total num_features 29491

generated child word features , num features 11934 , total num_features 41425

generated child pos features , num features 45 , total num_features 41470

generated h_pos c_word c_pos features , num features 25658 , total num_features 67128

generated h_word h_pos c_pos features , num features 28717 , total num_features 95845

generated h_pos c_pos features , num features 693 , total num_features 96538

generated h_word h_pos c_word c_pos features , num features 58080 , total num_features 154618

generated h_word c_word c_pos features , num features 57462 , total num_features 212080

generated h_word h_pos c_word features , num features 57738 , total num_features 269818

In [70]:
# Feature-type names, in the order in which their dictionaries were generated.
dictionaries.keys()

dict_keys(['head word _ head pos', 'head word', 'head pos', 'child word _ child pos', 'child word', 'child pos', 'h_pos c_word c_pos', 'h_word h_pos c_pos', 'h_pos c_pos', 'h_word h_pos c_word c_pos', 'h_word c_word c_pos', 'h_word h_pos c_word', 'h_word c_word'])

In [71]:
# Take the first sample yielded for the training file as a worked example.
sample = next(dep_sample_generator('./data/train.labeled.train.labeled'))
# Pick entry 6 of that sample as the child ('old' in the recorded output of the
# print cell) and follow its `head` field to fetch the head entry.
child = sample[6]
head = sample[child.head]

In [75]:
# Name of a feature type to inspect.
# NOTE(review): not referenced by any cell visible here, and the name looks like a
# typo for `dict2check` - confirm there are no other users before renaming.
dict2chick = 'child word _ child pos'

In [106]:
# Look up the index of the (head word, child word) feature for the example pair.
# `.get` returns None when the pair has no entry in the dictionary.
a = dictionaries['h_word c_word'].get((head.token, child.token))
a

269823

In [107]:
# NOTE(review): this displays the whole 'h_word c_word' dictionary; consider showing
# only a small slice of it to keep the notebook output readable.
dictionaries['h_word c_word']

OrderedDict([(('Vinken', 'Pierre'), 269818),
             (('will', 'Vinken'), 269819),
             (('Vinken', ','), 269820),
             (('years', '61'), 269821),
             (('old', 'years'), 269822),
             (('Vinken', 'old'), 269823),
             (('will', 'join'), 269824),
             (('board', 'the'), 269825),
             (('join', 'board'), 269826),
             (('join', 'as'), 269827),
             (('director', 'a'), 269828),
             (('director', 'nonexecutive'), 269829),
             (('as', 'director'), 269830),
             (('join', 'Nov.'), 269831),
             (('Nov.', '29'), 269832),
             (('will', '.'), 269833),
             (('Vinken', 'Mr.'), 269834),
             (('is', 'Vinken'), 269835),
             (('is', 'chairman'), 269836),
             (('chairman', 'of'), 269837),
             (('N.V.', 'Elsevier'), 269838),
             (('of', 'N.V.'), 269839),
             (('N.V.', ','), 269840),
             (('group', 'the'), 269841)

In [108]:
# Show the head entry, the child entry and the looked-up feature index together.
print(head, child, a)

DepSample(idx=2, token='Vinken', pos='NNP', head=8) DepSample(idx=6, token='old', pos='JJ', head=2) 269823
