In [41]:
import time
import os
import pickle
import numpy as np
from collections import OrderedDict
from collections import namedtuple
from random import shuffle
import copy
# Symbol used as both the token and the POS tag of the artificial root entry
# that dep_sample_generator places at index 0 of every sample.
ROOT = "*"
# One token of a dependency sample: its index in the sentence, the token text,
# its POS tag and the index of its head. Being a namedtuple, it also has
# helper methods like _asdict().
DepSample = namedtuple('DepSample', 'idx, token, pos, head')

In [2]:
# Parse one raw line of the labeled file into a DepSample as a quick sanity check.
# The tabs are spelled as explicit "\t" escapes so the separators stay visible and
# cannot be silently converted to spaces by an editor.
example_line = "1\tPierre\t_\tNNP\t_\t_\t2\tNAME\t_\t_"
example_fields = example_line.split('\t')
# Columns used: 0 = token index, 1 = token, 3 = POS tag, 6 = head index.
# Unlike dep_sample_generator, idx and head are left as strings here.
example = DepSample(example_fields[0], example_fields[1], example_fields[3], example_fields[6])

In [3]:
def dep_sample_generator(path_to_file):
    """
    This function lazily reads a labeled file and generates one sample per sentence.
    Every non-blank line is split on tabs: column 0 is the token index, column 1
    the token, column 3 the POS tag and column 6 the head index. A blank line
    closes the current sentence. Each sample starts with an artificial ROOT entry
    (idx 0), followed by the tokens in file order.
    Blank lines that do not close a sentence (leading blank lines, runs of blank
    lines) are ignored, so a sample containing only ROOT is never generated.
    :param: path_to_file: string to the location of the file to read from (str)
    :return: sample (list of DepSample)
    """
    assert os.path.isfile(path_to_file), "File does not exist"
    # DepSample is an immutable namedtuple, so one root entry can be shared by all samples.
    root = DepSample(0, ROOT, ROOT, 0)
    with open(path_to_file) as fp:
        sample = [root]
        for line in fp:
            stripped = line.rstrip()
            if stripped:
                ls = stripped.split('\t')
                sample.append(DepSample(int(ls[0]), ls[1], ls[3], int(ls[6])))
            elif len(sample) > 1:
                # Blank line after at least one token: the sentence is complete.
                yield sample
                sample = [root]
        # The file may end without a trailing blank line.
        if len(sample) > 1:
            yield sample
            

In [4]:
# Labeled dataset that is split into train/validation below and used to build
# the word histogram and feature dictionaries.
path_to_file = './data/train.labeled'

In [5]:
def _iter_raw_samples(path_to_file):
    """Yield every sample of the file as a list of its raw lines, skipping empty samples."""
    with open(path_to_file) as fp:
        sample = []
        for line in fp:
            if line.rstrip():
                sample.append(line)
            elif sample:
                yield sample
                sample = []
        # The last sample may not be followed by a blank line.
        if sample:
            yield sample


def split_train_validation(path_to_file, valid_amount=0.2):
    """
    This function takes a train dataset and splits it to training
    and validation sets according to `valid_amount`.
    The first samples of the file go to the training file and the remaining
    ones to the validation file; every written sample is followed by exactly
    one blank line. Blank lines that do not close a sample are ignored, so no
    empty sample ends up in either output file.
    :param: path_to_file: path to file containing the dataset (str)
    :param: valid_amount: fraction of the samples to take for validation (float)
    :return: train_file_path: path to file containing the training samples (str)
    :return: valid_file_path: path to file containing the validation samples (str)
    """
    assert os.path.isfile(path_to_file), "File does not exist"
    path_train_file = path_to_file + ".train.labeled"
    path_valid_file = path_to_file + ".valid.labeled"
    # First pass: count the samples so the split point is known up front.
    total_samples = sum(1 for _ in _iter_raw_samples(path_to_file))
    print("total samples ", total_samples)
    num_validation = int(valid_amount * total_samples)
    num_training = total_samples - num_validation
    # Second pass: route each sample to its file. `with` closes both outputs
    # even if reading or writing fails half-way.
    with open(path_train_file, 'w') as t_file, open(path_valid_file, 'w') as v_file:
        for sample_i, sample in enumerate(_iter_raw_samples(path_to_file)):
            out_file = t_file if sample_i < num_training else v_file
            for line in sample:
                # The very last line of the input may lack its newline.
                out_file.write(line if line.endswith('\n') else line + '\n')
            out_file.write('\n')
    print("num training: ", num_training, " saved @ ", path_train_file)
    print("num validation: ", num_validation, " saved @ ", path_valid_file)
    return path_train_file, path_valid_file
           

In [6]:
# Hold out the last 20% of the samples for validation; the two output files are
# written as `path_to_file` + ".train.labeled" / ".valid.labeled".
split_train_validation(path_to_file, valid_amount=0.2)

total samples  5000
num training:  4000  saved @  ./data/train.labeled.train.labeled
num validation:  1000  saved @  ./data/train.labeled.valid.labeled


In [7]:
# Paths of the two files written by split_train_validation for './data/train.labeled'.
path_to_train_file = './data/train.labeled.train.labeled'
# NOTE(review): the capital "V" is inconsistent with `path_to_train_file`; the name is
# kept because other cells may refer to it.
path_to_Valid_file = './data/train.labeled.valid.labeled'

In [8]:
# Walk over the training split and print sample number 100 as a sanity check.
samp_gen = dep_sample_generator(path_to_file=path_to_train_file)
for s_i, s in enumerate(samp_gen):
    if s_i == 100:
        print(s)
# The loop is not broken early, so once it finishes `s` is bound to the last
# sample of the training split. Disabled early-exit variant kept for reference:
#     if s_i > 0:
#         break
# print(s)

[DepSample(idx=0, token='*', pos='*', head=0), DepSample(idx=1, token='Alan', pos='NNP', head=2), DepSample(idx=2, token='Spoon', pos='NNP', head=9), DepSample(idx=3, token=',', pos=',', head=2), DepSample(idx=4, token='recently', pos='RB', head=5), DepSample(idx=5, token='named', pos='VBN', head=2), DepSample(idx=6, token='Newsweek', pos='NNP', head=7), DepSample(idx=7, token='president', pos='NN', head=5), DepSample(idx=8, token=',', pos=',', head=2), DepSample(idx=9, token='said', pos='VBD', head=0), DepSample(idx=10, token='Newsweek', pos='NNP', head=13), DepSample(idx=11, token="'s", pos='POS', head=10), DepSample(idx=12, token='ad', pos='NN', head=13), DepSample(idx=13, token='rates', pos='NNS', head=14), DepSample(idx=14, token='would', pos='MD', head=9), DepSample(idx=15, token='increase', pos='VB', head=14), DepSample(idx=16, token='5', pos='CD', head=17), DepSample(idx=17, token='%', pos='NN', head=15), DepSample(idx=18, token='in', pos='IN', head=15), DepSample(idx=19, token

## Dependency Sample

Number of tokens in a sample (not counting the artificial ROOT entry at index 0) = `sample[-1].idx`

POS tags (the first one is ROOT's `*`) = `[s.pos for s in sample]`

In [9]:
# `s` is the sample left bound by the loop over the training split.
# The idx of the last entry is the number of real tokens, since ROOT has idx 0.
sample_len = s[-1].idx
# The list starts with ROOT's tag ("*"), followed by one tag per token.
s_tags = [l.pos for l in s]
print("sample length: ", sample_len)
print("POS tags: ", s_tags)

sample length:  4
POS tags:  ['*', 'WP', 'VBZ', 'JJ', '.']


## Features
### Unigram
* (head_word, head_pos)
* (head_word)
* (head_pos)
* (child_word, child_pos)
* (child_word)
* (child_pos)

### Bigram
* (head_word, head_pos, child_word, child_pos)
* (head_pos, child_word, child_pos)
* (head_word, child_word, child_pos)
* (head_word, head_pos, child_pos)
* (head_word, head_pos, child_word)
* (head_word, child_word)
* (head_pos, child_pos)

In [84]:
def generate_word_hist_dict(path_to_file, save_to_file=False):
    """
    This function counts how often every token appears in the dataset.
    Entries whose token equals ROOT are not counted.
    :param: path_to_file: path to location of the dataset (str)
    :param: save_to_file: whether or not to pickle the histogram to
                          `path_to_file` + ".word.hist" (bool)
    :return: word_hist: OrderedDict word->word_count, highest count first
    """
    counts = {}
    for sentence in dep_sample_generator(path_to_file):
        for entry in sentence:
            word = entry.token
            if word != ROOT:
                counts[word] = counts.get(word, 0) + 1
    # Negated count as the key gives descending order; the sort is stable, so
    # equal counts keep the iteration order of `counts`.
    ranked = list(counts.items())
    ranked.sort(key=lambda pair: -pair[1])
    word_hist = OrderedDict(ranked)
    if save_to_file:
        path = path_to_file + ".word.hist"
        with open(path, 'wb') as fp:
            pickle.dump(word_hist, fp)
        print("word histogram dictionary saved @ ", path)
    return word_hist

"""
UNIGRAMS
"""

def generate_hw_hp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function assigns a running index to every head unigram feature seen in the dataset:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    The last two keys are plain strings (not 1-tuples), so a head word and a
    head POS tag that are spelled the same share one index.
    Tokens whose head is ROOT, or whose head word appears fewer than
    `word_threshold` times according to `word_hist`, contribute no features.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the head word for its features to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
                          `path_to_file` + ".hw_hp.dict" (bool)
    :param: word_hist: dictionary of words histogram in the dataset, computed if missing (dict)
    :return: hw_hp_feat_dict: OrderedDict feature->index, in index order
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    # Insertion order equals index order because the next index is always the current size.
    hw_hp_feat_dict = OrderedDict()
    for sentence in dep_sample_generator(path_to_file):
        for entry in sentence:
            # ignore ROOT
            if entry.token == ROOT:
                continue
            parent = sentence[entry.head]
            parent_count = word_hist.get(parent.token)
            if parent_count and parent_count < word_threshold:
                continue
            if parent.token == ROOT:
                continue
            for key in ((parent.token, parent.pos), parent.token, parent.pos):
                if key not in hw_hp_feat_dict:
                    hw_hp_feat_dict[key] = len(hw_hp_feat_dict)
    print("total (head_word, head_pos), (head_word), (head_pos) features: ", len(hw_hp_feat_dict))
    if save_to_file:
        path = path_to_file + ".hw_hp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(hw_hp_feat_dict, fp)
        print("saved (head_word, head_pos), (head_word), (head_pos) features dictionary @ ", path)
    return hw_hp_feat_dict


def extract_hw_hp_feat_indices(sample, hw_hp_dict):
    """
    This function extracts the indices (in the feature vector) of the unigrams features:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    For every non-ROOT token, the three features of its head are looked up in
    `hw_hp_dict`; features missing from the dictionary are skipped.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: hw_hp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        head = sample[s.head]
        for feat in ((head.token, head.pos), head.token, head.pos):
            idx = hw_hp_dict.get(feat)
            # Compare against None explicitly: index 0 is a valid feature index
            # and must not be treated as "missing".
            if idx is None:
                continue
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def generate_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function assigns a running index to every child unigram feature seen in the dataset:
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    The last two keys are plain strings (not 1-tuples), so a word and a POS tag
    that are spelled the same share one index.
    ROOT entries, and tokens that appear fewer than `word_threshold` times
    according to `word_hist`, contribute no features.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the word for its features to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
                          `path_to_file` + ".cw_cp.dict" (bool)
    :param: word_hist: dictionary of words histogram in the dataset, computed if missing (dict)
    :return: cw_cp_feat_dict: OrderedDict feature->index, in index order
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    # Insertion order equals index order because the next index is always the current size.
    cw_cp_feat_dict = OrderedDict()
    for sentence in dep_sample_generator(path_to_file):
        for entry in sentence:
            if entry.token == ROOT:
                continue
            occurrences = word_hist.get(entry.token)
            if occurrences and occurrences < word_threshold:
                continue
            for key in ((entry.token, entry.pos), entry.token, entry.pos):
                if key not in cw_cp_feat_dict:
                    cw_cp_feat_dict[key] = len(cw_cp_feat_dict)
    print("total (child_word, child_pos), (child_word), (child_pos) features: ", len(cw_cp_feat_dict))
    if save_to_file:
        path = path_to_file + ".cw_cp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(cw_cp_feat_dict, fp)
        print("saved (child_word, child_pos), (child_word), (child_pos) features dictionary @ ", path)
    return cw_cp_feat_dict


def extract_cw_cp_feat_indices(sample, cw_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the unigrams features:
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    For every non-ROOT token, its three features are looked up in `cw_cp_dict`;
    features missing from the dictionary are skipped.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: cw_cp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        for feat in ((s.token, s.pos), s.token, s.pos):
            idx = cw_cp_dict.get(feat)
            # Compare against None explicitly: index 0 is a valid feature index
            # and must not be treated as "missing".
            if idx is None:
                continue
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def generate_unigram_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function builds both unigram feature dictionaries for a given dataset.
    The head dictionary indexes:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    and the child dictionary indexes:
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Each dictionary has its own index range starting at 0.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: forwarded to both dictionary generators (int)
    :param: save_to_file: whether or not to save the dictionaries on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset, computed if missing (dict)
    :return: (hw_hp_dict, cw_cp_dict): the head and child dictionaries feature->index (tuple of dict)
    """
    hist = word_hist if word_hist else generate_word_hist_dict(path_to_file)
    # Both generators receive exactly the same options and the same histogram.
    shared_options = dict(word_threshold=word_threshold, save_to_file=save_to_file, word_hist=hist)
    hw_hp_dict = generate_hw_hp_feat_dict(path_to_file, **shared_options)
    cw_cp_dict = generate_cw_cp_feat_dict(path_to_file, **shared_options)
    print("total unigrams features: ", len(hw_hp_dict) + len(cw_cp_dict))
    return hw_hp_dict, cw_cp_dict


def extract_unigram_feat_indices(sample, unigram_dict):
    """
    This function extracts the indices (in the feature vector) of the unigrams features:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Head feature indices are used as they are; child feature indices are
    shifted by the size of the head dictionary so the two ranges do not overlap.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: unigram_dict: the pair (hw_hp_dict, cw_cp_dict) of index dictionaries (tuple of dict)
    :return: feat_indices_dict: OrderedDict idx->count, sorted by idx
    """
    hw_hp_dict, cw_cp_dict = unigram_dict[0], unigram_dict[1]
    cw_cp_offset = len(hw_hp_dict)
    # Use the `sample` argument, not whatever global happens to be named `s`.
    unigram_indices = dict(extract_hw_hp_feat_indices(sample, hw_hp_dict))
    for idx, count in extract_cw_cp_feat_indices(sample, cw_cp_dict).items():
        unigram_indices[cw_cp_offset + idx] = count
    return OrderedDict(sorted(unigram_indices.items(), key=lambda t: t[0]))

"""
TRIGRAMS
"""

def _build_head_child_feat_dict(path_to_file, make_feat, label, file_suffix,
                                word_threshold, save_to_file, word_hist):
    """
    Shared implementation of the head/child feature dictionary generators below.
    Walks over every non-ROOT token of the dataset, builds one feature from the
    token and its head with `make_feat(head, child)` and assigns a running index
    to every feature that was not seen before.
    :param: path_to_file: path to location of the dataset (str)
    :param: make_feat: callable (head DepSample, child DepSample) -> feature tuple
    :param: label: feature description used in the printed messages (str)
    :param: file_suffix: suffix appended to `path_to_file` when saving (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset, computed if missing (dict)
    :return: feat_dict: OrderedDict feature->index, in index order
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    # Insertion order equals index order because the next index is always the current size.
    feat_dict = OrderedDict()
    for sample in dep_sample_generator(path_to_file):
        for child in sample:
            if child.token == ROOT:
                continue
            child_count = word_hist.get(child.token)
            if child_count and child_count < word_threshold:
                continue
            feat = make_feat(sample[child.head], child)
            if feat not in feat_dict:
                feat_dict[feat] = len(feat_dict)
    print("total " + label + " features: ", len(feat_dict))
    if save_to_file:
        path = path_to_file + file_suffix
        with open(path, 'wb') as fp:
            pickle.dump(feat_dict, fp)
        print("saved " + label + " features dictionary @ ", path)
    return feat_dict


def generate_hw_hp_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that for every features, an index is given.
    The following features are generated for a given dataset:
    * (head_word, head_pos, child_word, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hw_hp_cw_cp_feat_dict: dictionary feature->index (dict)
    """
    return _build_head_child_feat_dict(
        path_to_file, lambda head, child: (head.token, head.pos, child.token, child.pos),
        "(head_word, head_pos, child_word, child_pos)", ".hw_hp_cw_cp.dict",
        word_threshold, save_to_file, word_hist)


def generate_hp_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that for every features, an index is given.
    The following features are generated for a given dataset:
    * (head_pos, child_word, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hp_cw_cp_feat_dict: dictionary feature->index (dict)
    """
    return _build_head_child_feat_dict(
        path_to_file, lambda head, child: (head.pos, child.token, child.pos),
        "(head_pos, child_word, child_pos)", ".hp_cw_cp.dict",
        word_threshold, save_to_file, word_hist)


def generate_hw_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that for every features, an index is given.
    The following features are generated for a given dataset:
    * (head_word, child_word, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hw_cw_cp_feat_dict: dictionary feature->index (dict)
    """
    return _build_head_child_feat_dict(
        path_to_file, lambda head, child: (head.token, child.token, child.pos),
        "(head_word, child_word, child_pos)", ".hw_cw_cp.dict",
        word_threshold, save_to_file, word_hist)


def generate_hw_hp_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that for every features, an index is given.
    The following features are generated for a given dataset:
    * (head_word, head_pos, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hw_hp_cp_feat_dict: dictionary feature->index (dict)
    """
    return _build_head_child_feat_dict(
        path_to_file, lambda head, child: (head.token, head.pos, child.pos),
        "(head_word, head_pos, child_pos)", ".hw_hp_cp.dict",
        word_threshold, save_to_file, word_hist)


def generate_hw_hp_cw_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that for every features, an index is given.
    The following features are generated for a given dataset:
    * (head_word, head_pos, child_word)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hw_hp_cw_feat_dict: dictionary feature->index (dict)
    """
    return _build_head_child_feat_dict(
        path_to_file, lambda head, child: (head.token, head.pos, child.token),
        "(head_word, head_pos, child_word)", ".hw_hp_cw.dict",
        word_threshold, save_to_file, word_hist)


def generate_hw_cw_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that for every features, an index is given.
    The following features are generated for a given dataset:
    * (head_word, child_word)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hw_cw_feat_dict: dictionary feature->index (dict)
    """
    return _build_head_child_feat_dict(
        path_to_file, lambda head, child: (head.token, child.token),
        "(head_word, child_word)", ".hw_cw.dict",
        word_threshold, save_to_file, word_hist)


def generate_hp_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that for every features, an index is given.
    The following features are generated for a given dataset:
    * (head_pos, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count of the child word for its feature to be kept (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset (dict)
    :return: hp_cp_feat_dict: dictionary feature->index (dict)
    """
    return _build_head_child_feat_dict(
        path_to_file, lambda head, child: (head.pos, child.pos),
        "(head_pos, child_pos)", ".hp_cp.dict",
        word_threshold, save_to_file, word_hist)


def generate_trigram_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function builds all seven head/child feature dictionaries for a given dataset,
    one dictionary (with its own index range) per feature type:
    * (head_word, head_pos, child_word, child_pos)
    * (head_pos, child_word, child_pos)
    * (head_word, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_word, head_pos, child_word)
    * (head_word, child_word)
    * (head_pos, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: forwarded to every dictionary generator (int)
    :param: save_to_file: whether or not to save the dictionaries on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset, computed if missing (dict)
    :return: tuple of the seven dictionaries feature->index, in the order listed above
    """
    hist = word_hist if word_hist else generate_word_hist_dict(path_to_file)
    # Same order as the feature list in the docstring and as the returned tuple.
    builders = (
        generate_hw_hp_cw_cp_feat_dict,
        generate_hp_cw_cp_feat_dict,
        generate_hw_cw_cp_feat_dict,
        generate_hw_hp_cp_feat_dict,
        generate_hw_hp_cw_feat_dict,
        generate_hw_cw_feat_dict,
        generate_hp_cp_feat_dict,
    )
    feat_dicts = tuple(
        build(path_to_file, word_threshold=word_threshold, save_to_file=save_to_file, word_hist=hist)
        for build in builders
    )
    print("total trigrams features: ", sum(len(feat_dict) for feat_dict in feat_dicts))
    return feat_dicts


def generate_trigram_feat_dict_minimal(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function builds a reduced set of three head/child feature dictionaries for a
    given dataset, one dictionary (with its own index range) per feature type:
    * (head_pos, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_pos, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: forwarded to every dictionary generator (int)
    :param: save_to_file: whether or not to save the dictionaries on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset, computed if missing (dict)
    :return: tuple of the three dictionaries feature->index, in the order listed above
    """
    hist = word_hist if word_hist else generate_word_hist_dict(path_to_file)
    # Same order as the feature list in the docstring and as the returned tuple.
    builders = (
        generate_hp_cw_cp_feat_dict,
        generate_hw_hp_cp_feat_dict,
        generate_hp_cp_feat_dict,
    )
    feat_dicts = tuple(
        build(path_to_file, word_threshold=word_threshold, save_to_file=save_to_file, word_hist=hist)
        for build in builders
    )
    print("total trigrams features: ", sum(len(feat_dict) for feat_dict in feat_dicts))
    return feat_dicts

In [9]:
# Token histogram of the full labeled file; its length is the number of distinct tokens.
word_hist = generate_word_hist_dict(path_to_file)
print(len(word_hist))
# print(word_hist)

14162


In [86]:
# Feature-dictionary experiments. Only the minimal set at the bottom is built;
# the disabled calls are kept for reference.
# NOTE(review): other cells read `unigram_feat_dict` and `hp_cp_dict`, which are only
# assigned by the disabled lines below - on a fresh kernel those names are undefined
# unless the corresponding lines are re-enabled.
# unigram_feat_dict = generate_unigram_feat_dict(path_to_file)
# hw_hp_dict = generate_hw_hp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None)
# cw_cp_dict = generate_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None)
# hw_hp_cw_cp_dict = generate_hw_hp_cw_cp_feat_dict(path_to_file, word_threshold=2)
# hp_cw_cp_dict = generate_hp_cw_cp_feat_dict(path_to_file, word_threshold=0)
# hw_cw_cp_dict = generate_hw_cw_cp_feat_dict(path_to_file, word_threshold=4)
# hw_hp_cp_dict = generate_hw_hp_cp_feat_dict(path_to_file, word_threshold=2)
# hw_hp_cw_dict = generate_hw_hp_cw_feat_dict(path_to_file, word_threshold=1)
# hw_cw_dict = generate_hw_cw_feat_dict(path_to_file, word_threshold=1)
# hp_cp_dict = generate_hp_cp_feat_dict(path_to_file, word_threshold=1)
# trigram_feat_dict = generate_trigram_feat_dict(path_to_file, word_threshold=0)
trigram_feat_dict_minimal = generate_trigram_feat_dict_minimal(path_to_file)

total (head_pos, child_word, child_pos) features:  31314
total (head_word, head_pos, child_pos) features:  33936
total (head_pos, child_pos) features:  749
total trigrams features:  65999


In [80]:
# Inspect the (head_pos, child_pos) dictionary.
# NOTE(review): in the visible notebook `hp_cp_dict` is only assigned by a commented-out
# call, so this cell depends on state from an earlier kernel session.
hp_cp_dict

OrderedDict([(('NNP', 'NNP'), 0),
             (('MD', 'NNP'), 1),
             (('NNP', ','), 2),
             (('NNS', 'CD'), 3),
             (('JJ', 'NNS'), 4),
             (('NNP', 'JJ'), 5),
             (('*', 'MD'), 6),
             (('MD', 'VB'), 7),
             (('NN', 'DT'), 8),
             (('VB', 'NN'), 9),
             (('VB', 'IN'), 10),
             (('NN', 'JJ'), 11),
             (('IN', 'NN'), 12),
             (('VB', 'NNP'), 13),
             (('NNP', 'CD'), 14),
             (('MD', '.'), 15),
             (('VBZ', 'NNP'), 16),
             (('*', 'VBZ'), 17),
             (('VBZ', 'NN'), 18),
             (('NN', 'IN'), 19),
             (('IN', 'NNP'), 20),
             (('NN', 'NNP'), 21),
             (('NN', 'VBG'), 22),
             (('NNP', 'NN'), 23),
             (('VBZ', '.'), 24),
             (('VBD', 'NNP'), 25),
             (('JJ', 'CC'), 26),
             (('CC', 'NN'), 27),
             (('*', 'VBD'), 28),
             (('VBD', 'VBN'), 29),
   

In [37]:
# Show the sample currently bound to `s` (presumably the loop variable left over
# from iterating the training split - verify on a fresh kernel).
s

[DepSample(idx=0, token='*', pos='*', head=0),
 DepSample(idx=1, token='What', pos='WP', head=2),
 DepSample(idx=2, token="'s", pos='VBZ', head=0),
 DepSample(idx=3, token='next', pos='JJ', head=2),
 DepSample(idx=4, token='?', pos='.', head=2)]

In [51]:
# Extract the head, child and combined unigram feature indices of sample `s`.
# NOTE(review): `unigram_feat_dict` is only assigned by a commented-out line in the
# visible notebook; it must be the (hw_hp_dict, cw_cp_dict) pair returned by
# generate_unigram_feat_dict for this cell to run.
hw_hp_ind = extract_hw_hp_feat_indices(s, unigram_feat_dict[0])
cw_cp_ind = extract_cw_cp_feat_indices(s, unigram_feat_dict[1])
unigrams_ind = extract_unigram_feat_indices(s, unigram_feat_dict)
print(hw_hp_ind)
print(cw_cp_ind)
print(unigrams_ind)

{432: 3, 433: 3, 27: 3}
{2079: 1, 2080: 1, 318: 1, 730: 1, 246: 1, 47: 1, 1301: 1, 1302: 1, 15: 1, 4766: 1, 4767: 1, 42: 1}
OrderedDict([(27, 3), (432, 3), (433, 3), (18911, 1), (18938, 1), (18943, 1), (19142, 1), (19214, 1), (19626, 1), (20197, 1), (20198, 1), (20975, 1), (20976, 1), (23662, 1), (23663, 1)])
