In [1]:
import time
import os
import pickle
import numpy as np
from collections import OrderedDict
from collections import namedtuple
from random import shuffle
import copy
# Symbol used as both the token and the POS tag of the artificial root entry.
ROOT = "*"
# One row of a dependency sample: token index, token, POS tag and head index.
# Being a namedtuple it also offers helper methods like _asdict().
DepSample = namedtuple('DepSample', 'idx, token, pos, head')

In [2]:
example = "1	Pierre	_	NNP	_	_	2	NAME	_	_"
# Split the raw line on tabs and keep columns 0, 1, 3 and 6 as idx, token, pos
# and head (all left as strings in this hand-built example).
example = example.split('\t')
example = DepSample(example[0], example[1], example[3], example[6])

In [8]:
def dep_sample_generator(path_to_file):
    """
    Lazily read a tab-separated dependency file and yield one sample per sentence.

    Sentences are separated by blank lines. Every yielded sample is a list that
    starts with the artificial root entry ``DepSample(0, ROOT, ROOT, 0)`` and is
    followed by one ``DepSample`` per line, in file order. Column 0 is the token
    index (converted to int), column 1 the token, column 3 the POS tag and
    column 6 the head. The head is converted to int when possible and kept as the
    raw string otherwise.

    Leading, trailing or repeated blank lines never produce root-only samples.

    :param: path_to_file: string to the location of the file to read from (str)
    :return: sample (list of DepSample)
    """
    assert os.path.isfile(path_to_file), "File does not exist"
    root = DepSample(0, ROOT, ROOT, 0)
    with open(path_to_file) as fp:
        sample = [root]
        for line in fp:
            stripped = line.rstrip()
            if not stripped:
                # A blank line closes the current sentence. Only emit it when
                # something besides the root was collected, so that runs of
                # blank lines do not create empty samples.
                if len(sample) > 1:
                    yield sample
                    sample = [root]
                continue
            ls = stripped.split('\t')
            try:
                head = int(ls[6])
            except ValueError:
                # non-numeric head column: keep the raw string
                head = ls[6]
            sample.append(DepSample(int(ls[0]), ls[1], ls[3], head))
        # the last sentence is not necessarily followed by a blank line
        if len(sample) > 1:
            yield sample
            

In [4]:
# Labelled dataset that gets split into training and validation parts below.
path_to_file = './data/train.labeled'

In [5]:
def _iter_raw_samples(path_to_file):
    """Yield every non-empty sample of the file as a list of its raw text lines."""
    with open(path_to_file) as fp:
        sample = []
        for line in fp:
            if line.rstrip():
                sample.append(line)
            elif sample:
                # blank line closes a sample; repeated blank lines are ignored
                yield sample
                sample = []
        if sample:
            yield sample


def split_train_validation(path_to_file, valid_amount=0.2):
    """
    This function takes a train dataset and splits it into training
    and validation sets according to `valid_amount`.

    The first samples of the file go to `path_to_file + ".train.labeled"` and
    the remaining ones to `path_to_file + ".valid.labeled"`; the order of the
    samples is preserved (no shuffling). Every written sample is followed by
    exactly one blank line.

    :param: path_to_file: path to file containing the dataset (str)
    :param: valid_amount: fraction of samples to take for validation (float)
    :return: train_file_path: path to file containing the training samples (str)
    :return: valid_file_path: path to file containing the validation samples (str)
    """
    assert os.path.isfile(path_to_file), "File does not exist"
    path_train_file = path_to_file + ".train.labeled"
    path_valid_file = path_to_file + ".valid.labeled"
    # First pass: count samples with the same boundary rule used for writing,
    # so the reported numbers always match the produced files.
    total_samples = sum(1 for _ in _iter_raw_samples(path_to_file))
    print("total samples ", total_samples)
    num_validation = int(valid_amount * total_samples)
    num_training = total_samples - num_validation
    # Second pass: stream the samples into the two output files. The context
    # manager closes both files even if writing fails half-way.
    with open(path_train_file, 'w') as t_file, open(path_valid_file, 'w') as v_file:
        for s_i, sample in enumerate(_iter_raw_samples(path_to_file)):
            out_file = t_file if s_i < num_training else v_file
            for line in sample:
                # the very last line of the input may lack its newline
                out_file.write(line if line.endswith('\n') else line + '\n')
            out_file.write('\n')
    print("num training: ", num_training, " saved @ ", path_train_file)
    print("num validation: ", num_validation, " saved @ ", path_valid_file)
    return path_train_file, path_valid_file
           

In [6]:
# Keep 20% of the samples for validation; the two output files are written next to the input file.
split_train_validation(path_to_file, valid_amount=0.2)

total samples  5000
num training:  4000  saved @  ./data/train.labeled.train.labeled
num validation:  1000  saved @  ./data/train.labeled.valid.labeled


In [7]:
# Locations of the two files written by split_train_validation (see its printed output).
path_to_train_file = './data/train.labeled.train.labeled'
path_to_Valid_file = './data/train.labeled.valid.labeled'

In [8]:
# Iterate over every training sample and print the one at position 100 as a spot check.
# After the loop `s` still refers to the last sample yielded from the file.
samp_gen = dep_sample_generator(path_to_file=path_to_train_file)
for s_i, s in enumerate(samp_gen):
    if s_i == 100:
        print(s)
#     if s_i > 0:
#         break
# print(s)

[DepSample(idx=0, token='*', pos='*', head=0), DepSample(idx=1, token='Alan', pos='NNP', head=2), DepSample(idx=2, token='Spoon', pos='NNP', head=9), DepSample(idx=3, token=',', pos=',', head=2), DepSample(idx=4, token='recently', pos='RB', head=5), DepSample(idx=5, token='named', pos='VBN', head=2), DepSample(idx=6, token='Newsweek', pos='NNP', head=7), DepSample(idx=7, token='president', pos='NN', head=5), DepSample(idx=8, token=',', pos=',', head=2), DepSample(idx=9, token='said', pos='VBD', head=0), DepSample(idx=10, token='Newsweek', pos='NNP', head=13), DepSample(idx=11, token="'s", pos='POS', head=10), DepSample(idx=12, token='ad', pos='NN', head=13), DepSample(idx=13, token='rates', pos='NNS', head=14), DepSample(idx=14, token='would', pos='MD', head=9), DepSample(idx=15, token='increase', pos='VB', head=14), DepSample(idx=16, token='5', pos='CD', head=17), DepSample(idx=17, token='%', pos='NN', head=15), DepSample(idx=18, token='in', pos='IN', head=15), DepSample(idx=19, token

## Dependency Sample

number of tokens in a sample = `sample[-1].idx`

POS tags = `[s.pos for s in sample]`

In [9]:
# `s` is the sample left over from the previous loop (the last one in the file).
sample_len = s[-1].idx  # idx of the final entry, i.e. the number of tokens in the sample
s_tags = [l.pos for l in s]
s_heads = [l.head for l in s]
print("sample length: ", sample_len)
print("POS tags: ", s_tags)
print("Heads: ", s_heads)
# Look up the head of entry 0; entry 0 is the root, whose head index is 0 (itself).
s[s_heads[0]]

sample length:  4
POS tags:  ['*', 'WP', 'VBZ', 'JJ', '.']
Heads:  [0, 2, 0, 2, 2]


DepSample(idx=0, token='*', pos='*', head=0)

## Features
### Unigram
* (head_word, head_pos)
* (head_word)
* (head_pos)
* (child_word, child_pos)
* (child_word)
* (child_pos)

### Bigram
* (head_word, head_pos, child_word, child_pos)
* (head_pos, child_word, child_pos)
* (head_word, child_word, child_pos)
* (head_word, head_pos, child_pos)
* (head_word, head_pos, child_word)
* (head_word, child_word)
* (head_pos, child_pos)

In [12]:
def generate_word_hist_dict(path_to_file, save_to_file=False):
    """
    Count how often every token occurs in the dataset; ROOT entries are not counted.
    :param: path_to_file: path to location of the dataset (str)
    :param: save_to_file: whether or not to pickle the histogram to
            `path_to_file + ".word.hist"` (bool)
    :return: word_hist: OrderedDict word->word_count, most frequent word first
    """
    counts = {}
    for sample in dep_sample_generator(path_to_file):
        for entry in sample:
            if entry.token != ROOT:
                counts[entry.token] = counts.get(entry.token, 0) + 1
    # descending by count; the stable sort keeps first-seen order among ties
    ranked = sorted(counts.items(), key=lambda kv: -kv[1])
    word_hist = OrderedDict(ranked)
    if save_to_file:
        path = path_to_file + ".word.hist"
        with open(path, 'wb') as fp:
            pickle.dump(word_hist, fp)
        print("word histogram dictionary saved @ ", path)
    return word_hist

"""
UNIGRAMS
"""

def generate_hw_hp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    Scan the dataset and give every head-side unigram feature the next free index,
    in order of first appearance. For each non-ROOT entry the features of its head are:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    Entries whose head is ROOT are skipped, as are entries whose head word has a
    count in `word_hist` that is below `word_threshold`.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count a head word needs for its features to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
            `path_to_file + ".hw_hp.dict"` (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated when empty/None (dict)
    :return: hw_hp_feat_dict: OrderedDict feature->index (dict)
    """
    word_hist = word_hist or generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sample in dep_sample_generator(path_to_file):
        for entry in sample:
            # ignore ROOT
            if entry.token == ROOT:
                continue
            head = sample[entry.head]
            head_count = word_hist.get(head.token)
            if head_count and head_count < word_threshold:
                continue
            if head.token == ROOT:
                continue
            # NOTE(review): the word-only and pos-only features are plain strings,
            # so a token identical to a POS tag (e.g. ',') shares a single index
            # -- confirm this is intended.
            for feat in ((head.token, head.pos), head.token, head.pos):
                if feat not in feat_to_idx:
                    feat_to_idx[feat] = len(feat_to_idx)
    print("total (head_word, head_pos), (head_word), (head_pos) features: ", len(feat_to_idx))
    hw_hp_feat_dict = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        path = path_to_file + ".hw_hp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(hw_hp_feat_dict, fp)
        print("saved (head_word, head_pos), (head_word), (head_pos) features dictionary @ ", path)
    return hw_hp_feat_dict


def extract_hw_hp_feat_indices(sample, hw_hp_dict):
    """
    This function extracts the indices (in the feature vector) of the unigram features
    of the head of every non-ROOT entry in the sample:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    Features missing from `hw_hp_dict` are ignored. The feature stored at index 0 is
    handled like any other (the lookup is compared against None, not tested for truth).
    :param: sample: the sample to extract features from; heads must be indices
            into the sample (list of DepSample)
    :param: hw_hp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        head = sample[s.head]
        for feat in ((head.token, head.pos), head.token, head.pos):
            idx = hw_hp_dict.get(feat)
            # `is not None` on purpose: index 0 is valid but falsy
            if idx is not None:
                feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hw_hp_feat_indices_pair(head, child, hw_hp_dict):
    """
    This function extracts the indices (in the feature vector) of the unigram features
    of a single (head, child) pair:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    Features missing from `hw_hp_dict` are ignored; the feature stored at index 0 is
    returned like any other.
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample, not used by these head-only features (DepSample)
    :param: hw_hp_dict: the dictionary of indices (dict)
    :return: feat_indices_list: list of indices
    """
    feat_indices = []
    for feat in ((head.token, head.pos), head.token, head.pos):
        idx = hw_hp_dict.get(feat)
        # `is not None` on purpose: index 0 is valid but falsy
        if idx is not None:
            feat_indices.append(idx)
    return feat_indices


def generate_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    Scan the dataset and give every child-side unigram feature the next free index,
    in order of first appearance. For each non-ROOT entry the features are:
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Entries whose word has a count in `word_hist` below `word_threshold` are skipped.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count a word needs for its features to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
            `path_to_file + ".cw_cp.dict"` (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated when empty/None (dict)
    :return: cw_cp_feat_dict: OrderedDict feature->index (dict)
    """
    word_hist = word_hist or generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sample in dep_sample_generator(path_to_file):
        for child in sample:
            if child.token == ROOT:
                continue
            occurrences = word_hist.get(child.token)
            if occurrences and occurrences < word_threshold:
                continue
            # NOTE(review): the word-only and pos-only features are plain strings,
            # so a token identical to a POS tag (e.g. ',') shares a single index
            # -- confirm this is intended.
            for feat in ((child.token, child.pos), child.token, child.pos):
                if feat not in feat_to_idx:
                    feat_to_idx[feat] = len(feat_to_idx)
    print("total (child_word, child_pos), (child_word), (child_pos) features: ", len(feat_to_idx))
    cw_cp_feat_dict = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        path = path_to_file + ".cw_cp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(cw_cp_feat_dict, fp)
        print("saved (child_word, child_pos), (child_word), (child_pos) features dictionary @ ", path)
    return cw_cp_feat_dict


def extract_cw_cp_feat_indices(sample, cw_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the unigram features
    of every non-ROOT entry in the sample:
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Features missing from `cw_cp_dict` are ignored. The feature stored at index 0 is
    handled like any other (the lookup is compared against None, not tested for truth).
    :param: sample: the sample to extract features from (list of DepSample)
    :param: cw_cp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        for feat in ((s.token, s.pos), s.token, s.pos):
            idx = cw_cp_dict.get(feat)
            # `is not None` on purpose: index 0 is valid but falsy
            if idx is not None:
                feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_cw_cp_feat_indices_pair(head, child, cw_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the unigram features
    of a single (head, child) pair:
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Features missing from `cw_cp_dict` are ignored; the feature stored at index 0 is
    returned like any other.
    :param: head: head DepSample, not used by these child-only features (DepSample)
    :param: child: child DepSample (DepSample)
    :param: cw_cp_dict: the dictionary of indices (dict)
    :return: feat_indices_list: list of indices
    """
    feat_indices = []
    for feat in ((child.token, child.pos), child.token, child.pos):
        idx = cw_cp_dict.get(feat)
        # `is not None` on purpose: index 0 is valid but falsy
        if idx is not None:
            feat_indices.append(idx)
    return feat_indices


def generate_unigram_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    Build both unigram feature dictionaries for a dataset by delegating to
    `generate_hw_hp_feat_dict` and `generate_cw_cp_feat_dict`:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Each dictionary keeps its own indices starting at 0.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: forwarded to both generators (int)
    :param: save_to_file: whether or not to save the dictionaries on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated when empty/None (dict)
    :return: (hw_hp_dict, cw_cp_dict): pair of dictionaries feature->index (tuple of dict)
    """
    # compute the histogram once so both generators share it
    word_hist = word_hist or generate_word_hist_dict(path_to_file)
    shared_kwargs = dict(word_threshold=word_threshold,
                         save_to_file=save_to_file,
                         word_hist=word_hist)
    hw_hp_dict = generate_hw_hp_feat_dict(path_to_file, **shared_kwargs)
    cw_cp_dict = generate_cw_cp_feat_dict(path_to_file, **shared_kwargs)
    total = len(hw_hp_dict) + len(cw_cp_dict)
    print("total unigrams features: ", total)
    return hw_hp_dict, cw_cp_dict


def extract_unigram_feat_indices_pair(head, child, unigram_dict):
    """
    Collect the unigram feature indices of one (head, child) pair in a single index space:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Head-side indices are used as they are; child-side indices are shifted by the
    size of the head-side dictionary so the two ranges do not overlap.
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: unigram_dict: the (hw_hp_dict, cw_cp_dict) pair of index dictionaries
    :return: feat_indices_list: sorted list of indices
    """
    hw_hp_dict, cw_cp_dict = unigram_dict[0], unigram_dict[1]
    offset = len(hw_hp_dict)
    head_part = extract_hw_hp_feat_indices_pair(head, child, hw_hp_dict)
    child_part = extract_cw_cp_feat_indices_pair(head, child, cw_cp_dict)
    combined = list(head_part) + [offset + i for i in child_part]
    return sorted(combined)


def extract_unigram_feat_indices(sample, unigram_dict):
    """
    Count the unigram features of a whole sample in a single index space:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    Head-side indices are used as they are; child-side indices are shifted by the
    size of the head-side dictionary so the two ranges do not overlap.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: unigram_dict: the (hw_hp_dict, cw_cp_dict) pair of index dictionaries
    :return: feat_indices_dict: OrderedDict idx->count, sorted by index
    """
    hw_hp_dict, cw_cp_dict = unigram_dict[0], unigram_dict[1]
    offset = len(hw_hp_dict)
    merged = dict(extract_hw_hp_feat_indices(sample, hw_hp_dict))
    child_counts = extract_cw_cp_feat_indices(sample, cw_cp_dict)
    for idx, count in child_counts.items():
        merged[offset + idx] = count
    return OrderedDict(sorted(merged.items(), key=lambda kv: kv[0]))

In [32]:
"""
BIGRAMS
"""

def generate_hw_hp_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    Scan the dataset and give every distinct bigram feature the next free index,
    in order of first appearance:
    * (head_word, head_pos, child_word, child_pos)
    ROOT entries are never used as children. Children whose word has a count in
    `word_hist` below `word_threshold` are skipped.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count a child word needs for its feature to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
            `path_to_file + ".hw_hp_cw_cp.dict"` (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated when empty/None (dict)
    :return: hw_hp_cw_cp_feat_dict: OrderedDict feature->index (dict)
    """
    word_hist = word_hist or generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sample in dep_sample_generator(path_to_file):
        for child in sample:
            if child.token == ROOT:
                continue
            occurrences = word_hist.get(child.token)
            if occurrences and occurrences < word_threshold:
                continue
            head = sample[child.head]
            feat = (head.token, head.pos, child.token, child.pos)
            if feat not in feat_to_idx:
                feat_to_idx[feat] = len(feat_to_idx)
    print("total (head_word, head_pos, child_word, child_pos) features: ", len(feat_to_idx))
    hw_hp_cw_cp_feat_dict = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        path = path_to_file + ".hw_hp_cw_cp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(hw_hp_cw_cp_feat_dict, fp)
        print("saved (head_word, head_pos, child_word, child_pos) features dictionary @ ", path)
    return hw_hp_cw_cp_feat_dict


def extract_hw_hp_cw_cp_feat_indices(sample, hw_hp_cw_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the bigram feature
    of every non-ROOT entry and its head:
    * (head_word, head_pos, child_word, child_pos)
    Features missing from the dictionary are ignored. The feature stored at index 0 is
    handled like any other (the lookup is compared against None, not tested for truth).
    :param: sample: the sample to extract features from; heads must be indices
            into the sample (list of DepSample)
    :param: hw_hp_cw_cp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        head = sample[s.head]
        idx = hw_hp_cw_cp_dict.get((head.token, head.pos, s.token, s.pos))
        # `is not None` on purpose: index 0 is valid but falsy
        if idx is not None:
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hw_hp_cw_cp_feat_indices_pair(head, child, hw_hp_cw_cp_dict):
    """
    This function extracts the index (in the feature vector) of the bigram feature
    of a single (head, child) pair:
    * (head_word, head_pos, child_word, child_pos)
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: hw_hp_cw_cp_dict: the dictionary of indices (dict)
    :return: feat_idx: one-element list with the index of the feature, or an empty
             list when the feature is not in the dictionary (list)
    """
    idx = hw_hp_cw_cp_dict.get((head.token, head.pos, child.token, child.pos))
    # `is None` on purpose: index 0 is a valid feature index but is falsy
    return [] if idx is None else [idx]


def generate_hp_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    Scan the dataset and give every distinct bigram feature the next free index,
    in order of first appearance:
    * (head_pos, child_word, child_pos)
    ROOT entries are never used as children. Children whose word has a count in
    `word_hist` below `word_threshold` are skipped.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count a child word needs for its feature to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
            `path_to_file + ".hp_cw_cp.dict"` (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated when empty/None (dict)
    :return: hp_cw_cp_feat_dict: OrderedDict feature->index (dict)
    """
    word_hist = word_hist or generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sample in dep_sample_generator(path_to_file):
        for child in sample:
            if child.token == ROOT:
                continue
            occurrences = word_hist.get(child.token)
            if occurrences and occurrences < word_threshold:
                continue
            feat = (sample[child.head].pos, child.token, child.pos)
            if feat not in feat_to_idx:
                feat_to_idx[feat] = len(feat_to_idx)
    print("total (head_pos, child_word, child_pos) features: ", len(feat_to_idx))
    hp_cw_cp_feat_dict = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        path = path_to_file + ".hp_cw_cp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(hp_cw_cp_feat_dict, fp)
        print("saved (head_pos, child_word, child_pos) features dictionary @ ", path)
    return hp_cw_cp_feat_dict


def extract_hp_cw_cp_feat_indices(sample, hp_cw_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the bigram feature
    of every non-ROOT entry and its head:
    * (head_pos, child_word, child_pos)
    Features missing from the dictionary are ignored. The feature stored at index 0 is
    handled like any other (the lookup is compared against None, not tested for truth).
    :param: sample: the sample to extract features from; heads must be indices
            into the sample (list of DepSample)
    :param: hp_cw_cp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        idx = hp_cw_cp_dict.get((sample[s.head].pos, s.token, s.pos))
        # `is not None` on purpose: index 0 is valid but falsy
        if idx is not None:
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hp_cw_cp_feat_indices_pair(head, child, hp_cw_cp_dict):
    """
    This function extracts the index (in the feature vector) of the bigram feature
    of a single (head, child) pair:
    * (head_pos, child_word, child_pos)
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: hp_cw_cp_dict: the dictionary of indices (dict)
    :return: feat_idx: one-element list with the index of the feature, or an empty
             list when the feature is not in the dictionary (list)
    """
    idx = hp_cw_cp_dict.get((head.pos, child.token, child.pos))
    # `is None` on purpose: index 0 is a valid feature index but is falsy
    return [] if idx is None else [idx]


def generate_hw_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    Scan the dataset and give every distinct bigram feature the next free index,
    in order of first appearance:
    * (head_word, child_word, child_pos)
    ROOT entries are never used as children. Children whose word has a count in
    `word_hist` below `word_threshold` are skipped.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count a child word needs for its feature to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
            `path_to_file + ".hw_cw_cp.dict"` (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated when empty/None (dict)
    :return: hw_cw_cp_feat_dict: OrderedDict feature->index (dict)
    """
    word_hist = word_hist or generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sample in dep_sample_generator(path_to_file):
        for child in sample:
            if child.token == ROOT:
                continue
            occurrences = word_hist.get(child.token)
            if occurrences and occurrences < word_threshold:
                continue
            feat = (sample[child.head].token, child.token, child.pos)
            if feat not in feat_to_idx:
                feat_to_idx[feat] = len(feat_to_idx)
    print("total (head_word, child_word, child_pos) features: ", len(feat_to_idx))
    hw_cw_cp_feat_dict = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        path = path_to_file + ".hw_cw_cp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(hw_cw_cp_feat_dict, fp)
        print("saved (head_word, child_word, child_pos) features dictionary @ ", path)
    return hw_cw_cp_feat_dict


def extract_hw_cw_cp_feat_indices(sample, hw_cw_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the bigram feature
    of every non-ROOT entry and its head:
    * (head_word, child_word, child_pos)
    Features missing from the dictionary are ignored. The feature stored at index 0 is
    handled like any other (the lookup is compared against None, not tested for truth).
    :param: sample: the sample to extract features from; heads must be indices
            into the sample (list of DepSample)
    :param: hw_cw_cp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        idx = hw_cw_cp_dict.get((sample[s.head].token, s.token, s.pos))
        # `is not None` on purpose: index 0 is valid but falsy
        if idx is not None:
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hw_cw_cp_feat_indices_pair(head, child, hw_cw_cp_dict):
    """
    This function extracts the index (in the feature vector) of the bigram feature
    of a single (head, child) pair:
    * (head_word, child_word, child_pos)
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: hw_cw_cp_dict: the dictionary of indices (dict)
    :return: feat_idx: one-element list with the index of the feature, or an empty
             list when the feature is not in the dictionary (list)
    """
    idx = hw_cw_cp_dict.get((head.token, child.token, child.pos))
    # `is None` on purpose: index 0 is a valid feature index but is falsy
    return [] if idx is None else [idx]


def generate_hw_hp_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    Scan the dataset and give every distinct bigram feature the next free index,
    in order of first appearance:
    * (head_word, head_pos, child_pos)
    ROOT entries are never used as children. Children whose word has a count in
    `word_hist` below `word_threshold` are skipped.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal count a child word needs for its feature to be kept (int)
    :param: save_to_file: whether or not to pickle the dictionary to
            `path_to_file + ".hw_hp_cp.dict"` (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated when empty/None (dict)
    :return: hw_hp_cp_feat_dict: OrderedDict feature->index (dict)
    """
    word_hist = word_hist or generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sample in dep_sample_generator(path_to_file):
        for child in sample:
            if child.token == ROOT:
                continue
            occurrences = word_hist.get(child.token)
            if occurrences and occurrences < word_threshold:
                continue
            head = sample[child.head]
            feat = (head.token, head.pos, child.pos)
            if feat not in feat_to_idx:
                feat_to_idx[feat] = len(feat_to_idx)
    print("total (head_word, head_pos, child_pos) features: ", len(feat_to_idx))
    hw_hp_cp_feat_dict = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        path = path_to_file + ".hw_hp_cp.dict"
        with open(path, 'wb') as fp:
            pickle.dump(hw_hp_cp_feat_dict, fp)
        print("saved (head_word, head_pos, child_pos) features dictionary @ ", path)
    return hw_hp_cp_feat_dict


def extract_hw_hp_cp_feat_indices(sample, hw_hp_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the bigram feature
    of every non-ROOT entry and its head:
    * (head_word, head_pos, child_pos)
    Features missing from the dictionary are ignored. The feature stored at index 0 is
    handled like any other (the lookup is compared against None, not tested for truth).
    :param: sample: the sample to extract features from; heads must be indices
            into the sample (list of DepSample)
    :param: hw_hp_cp_dict: the dictionary of indices (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        head = sample[s.head]
        idx = hw_hp_cp_dict.get((head.token, head.pos, s.pos))
        # `is not None` on purpose: index 0 is valid but falsy
        if idx is not None:
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hw_hp_cp_feat_indices_pair(head, child, hw_hp_cp_dict):
    """
    This function extracts the index (in the feature vector) of the bigram feature
    of a single (head, child) pair:
    * (head_word, head_pos, child_pos)
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: hw_hp_cp_dict: the dictionary of indices (dict)
    :return: feat_idx: one-element list with the index of the feature, or an empty
             list when the feature is not in the dictionary (list)
    """
    idx = hw_hp_cp_dict.get((head.token, head.pos, child.pos))
    # `is None` on purpose: index 0 is a valid feature index but is falsy
    return [] if idx is None else [idx]

    
def generate_hw_hp_cw_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function builds a features dictionary that maps every feature seen in the
    dataset to a running index (in order of first appearance).
    The following features are generated for a given dataset:
    * (head_word, head_pos, child_word)
    A token is skipped when it is the root, or when its count in `word_hist`
    is non-zero and smaller than `word_threshold`.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal number of appearances of the child word
                for its feature to be considered (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated
                from the dataset when not given (dict)
    :return: hw_hp_cw_feat_dict: dictionary feature->index (OrderedDict)
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sentence in dep_sample_generator(path_to_file):
        for tok in sentence:
            if tok.token == ROOT:
                continue
            occurrences = word_hist.get(tok.token)
            if occurrences and occurrences < word_threshold:
                continue
            parent = sentence[tok.head]
            key = (parent.token, parent.pos, tok.token)
            if key not in feat_to_idx:
                # the next free index always equals the number of known features
                feat_to_idx[key] = len(feat_to_idx)
    print("total (head_word, head_pos, child_word) features: ", len(feat_to_idx))
    ordered_feats = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        dump_path = path_to_file + ".hw_hp_cw.dict"
        with open(dump_path, 'wb') as out_file:
            pickle.dump(ordered_feats, out_file)
        print("saved (head_word, head_pos, child_word) features dictionary @ ", dump_path)
    return ordered_feats


def extract_hw_hp_cw_feat_indices(sample, hw_hp_cw_dict):
    """
    This function extracts the indices (in the feature vector) of the bigrams features:
    * (head_word, head_pos, child_word)
    Every non-root token contributes the feature built from its head's word,
    its head's POS tag and its own word. Features that are missing from the
    dictionary are ignored.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: hw_hp_cw_dict: the dictionary of indices, feature->index (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        head = sample[s.head]
        idx = hw_hp_cw_dict.get((head.token, head.pos, s.token))
        # Compare with None explicitly: index 0 is a valid feature index and
        # a plain truthiness test would silently drop it.
        if idx is not None:
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hw_hp_cw_feat_indices_pair(head, child, hw_hp_cw_dict):
    """
    This function extracts the index (in the feature vector) of the bigram feature
    of a single candidate edge head -> child:
    * (head_word, head_pos, child_word)
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: hw_hp_cw_dict: the dictionary of indices, feature->index (dict)
    :return: feat_idx: one-element list with the feature index, or an empty
                list when the feature is not in the dictionary (list)
    """
    idx = hw_hp_cw_dict.get((head.token, head.pos, child.token))
    # `is None` instead of truthiness, so the feature stored at index 0 is kept
    return [] if idx is None else [idx]


def generate_hw_cw_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function builds a features dictionary that maps every feature seen in the
    dataset to a running index (in order of first appearance).
    The following features are generated for a given dataset:
    * (head_word, child_word)
    A token is skipped when it is the root, or when its count in `word_hist`
    is non-zero and smaller than `word_threshold`.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal number of appearances of the child word
                for its feature to be considered (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated
                from the dataset when not given (dict)
    :return: hw_cw_feat_dict: dictionary feature->index (OrderedDict)
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sentence in dep_sample_generator(path_to_file):
        for tok in sentence:
            if tok.token == ROOT:
                continue
            occurrences = word_hist.get(tok.token)
            if occurrences and occurrences < word_threshold:
                continue
            key = (sentence[tok.head].token, tok.token)
            if key not in feat_to_idx:
                # the next free index always equals the number of known features
                feat_to_idx[key] = len(feat_to_idx)
    print("total (head_word, child_word) features: ", len(feat_to_idx))
    ordered_feats = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        dump_path = path_to_file + ".hw_cw.dict"
        with open(dump_path, 'wb') as out_file:
            pickle.dump(ordered_feats, out_file)
        print("saved (head_word, child_word) features dictionary @ ", dump_path)
    return ordered_feats


def extract_hw_cw_feat_indices(sample, hw_cw_dict):
    """
    This function extracts the indices (in the feature vector) of the bigrams features:
    * (head_word, child_word)
    Every non-root token contributes the feature built from its head's word
    and its own word. Features that are missing from the dictionary are ignored.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: hw_cw_dict: the dictionary of indices, feature->index (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        idx = hw_cw_dict.get((sample[s.head].token, s.token))
        # Compare with None explicitly: index 0 is a valid feature index and
        # a plain truthiness test would silently drop it.
        if idx is not None:
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hw_cw_feat_indices_pair(head, child, hw_cw_dict):
    """
    This function extracts the index (in the feature vector) of the bigram feature
    of a single candidate edge head -> child:
    * (head_word, child_word)
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: hw_cw_dict: the dictionary of indices, feature->index (dict)
    :return: feat_idx: one-element list with the feature index, or an empty
                list when the feature is not in the dictionary (list)
    """
    idx = hw_cw_dict.get((head.token, child.token))
    # `is None` instead of truthiness, so the feature stored at index 0 is kept
    return [] if idx is None else [idx]


def generate_hp_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function builds a features dictionary that maps every feature seen in the
    dataset to a running index (in order of first appearance).
    The following features are generated for a given dataset:
    * (head_pos, child_pos)
    A token is skipped when it is the root, or when its count in `word_hist`
    is non-zero and smaller than `word_threshold` (the filter looks at the
    child word even though the feature itself holds only POS tags).
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal number of appearances of the child word
                for its feature to be considered (int)
    :param: save_to_file: whether or not to save the dictionary on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated
                from the dataset when not given (dict)
    :return: hp_cp_feat_dict: dictionary feature->index (OrderedDict)
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    feat_to_idx = {}
    for sentence in dep_sample_generator(path_to_file):
        for tok in sentence:
            if tok.token == ROOT:
                continue
            occurrences = word_hist.get(tok.token)
            if occurrences and occurrences < word_threshold:
                continue
            key = (sentence[tok.head].pos, tok.pos)
            if key not in feat_to_idx:
                # the next free index always equals the number of known features
                feat_to_idx[key] = len(feat_to_idx)
    print("total (head_pos, child_pos) features: ", len(feat_to_idx))
    ordered_feats = OrderedDict(sorted(feat_to_idx.items(), key=lambda kv: kv[1]))
    if save_to_file:
        dump_path = path_to_file + ".hp_cp.dict"
        with open(dump_path, 'wb') as out_file:
            pickle.dump(ordered_feats, out_file)
        print("saved (head_pos, child_pos) features dictionary @ ", dump_path)
    return ordered_feats


def extract_hp_cp_feat_indices(sample, hp_cp_dict):
    """
    This function extracts the indices (in the feature vector) of the bigrams features:
    * (head_pos, child_pos)
    Every non-root token contributes the feature built from its head's POS tag
    and its own POS tag. Features that are missing from the dictionary are ignored.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: hp_cp_dict: the dictionary of indices, feature->index (dict)
    :return: feat_indices_dict: dictionary idx->count
    """
    feat_indices = {}
    for s in sample:
        if s.token == ROOT:
            continue
        idx = hp_cp_dict.get((sample[s.head].pos, s.pos))
        # Compare with None explicitly: index 0 is a valid feature index and
        # a plain truthiness test would silently drop it.
        if idx is not None:
            feat_indices[idx] = feat_indices.get(idx, 0) + 1
    return feat_indices


def extract_hp_cp_feat_indices_pair(head, child, hp_cp_dict):
    """
    This function extracts the index (in the feature vector) of the bigram feature
    of a single candidate edge head -> child:
    * (head_pos, child_pos)
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: hp_cp_dict: the dictionary of indices, feature->index (dict)
    :return: feat_idx: one-element list with the feature index, or an empty
                list when the feature is not in the dictionary (list)
    """
    idx = hp_cp_dict.get((head.pos, child.pos))
    # `is None` instead of truthiness, so the feature stored at index 0 is kept
    return [] if idx is None else [idx]


def generate_bigram_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates all the bigram features dictionaries, such that in every
    dictionary each feature is given an index.
    The following features are generated for a given dataset (in this order):
    * (head_word, head_pos, child_word, child_pos)
    * (head_pos, child_word, child_pos)
    * (head_word, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_word, head_pos, child_word)
    * (head_word, child_word)
    * (head_pos, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: if to consider a feature with word that appears less than
                                that in the dataset (int)
    :param: save_to_file: whether or not to save the dictionaries on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated
                                once here when not given (dict)
    :return: bigram_feat_dict: the seven dictionaries feature->index, in the
                                order listed above (tuple of dicts)
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    # one builder per feature family; the order fixes the layout of the feature vector
    builders = (
        generate_hw_hp_cw_cp_feat_dict,
        generate_hp_cw_cp_feat_dict,
        generate_hw_cw_cp_feat_dict,
        generate_hw_hp_cp_feat_dict,
        generate_hw_hp_cw_feat_dict,
        generate_hw_cw_feat_dict,
        generate_hp_cp_feat_dict,
    )
    feat_dicts = tuple(
        build(path_to_file, word_threshold=word_threshold,
              save_to_file=save_to_file, word_hist=word_hist)
        for build in builders
    )
    print("total bigrams features: ", sum(len(feat_dict) for feat_dict in feat_dicts))
    return feat_dicts


def update_dict(current_dict, dict_to_add, current_num_features):
    """
    This function merges `dict_to_add` into `current_dict` in place, shifting
    every key of `dict_to_add` by `current_num_features` on the way.
    An already existing (shifted) key is overwritten. Nothing is returned.
    :param: current_dict: dictionary to update, index->value (dict)
    :param: dict_to_add: dictionary whose shifted entries are added (dict)
    :param: current_num_features: offset added to every key of `dict_to_add` (int)
    """
    current_dict.update(
        (current_num_features + idx, value) for idx, value in dict_to_add.items()
    )
        
def update_list(current_list, list_to_add, current_num_features):
    """
    This function appends the indices of `list_to_add`, each shifted by
    `current_num_features`, to the indices of `current_list`.
    :param: current_list: first list of indices (list)
    :param: list_to_add: second list of indices, relative to their own dictionary (list)
    :param: current_num_features: offset added to every index of `list_to_add` (int)
    :return: a new combined list, or `current_list` itself when there is
                nothing to add (list)
    """
    if not list_to_add:
        return current_list
    shifted = np.array(list_to_add) + current_num_features
    return current_list + shifted.tolist()


def extract_bigram_feat_indices(sample, bigram_dict):
    """
    This function extracts the indices (in the feature vector) of the bigrams features:
    * (head_word, head_pos, child_word, child_pos)
    * (head_pos, child_word, child_pos)
    * (head_word, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_word, head_pos, child_word)
    * (head_word, child_word)
    * (head_pos, child_pos)
    The indices of every family are shifted by the total size of the
    dictionaries that precede it, so all families share one index space.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: bigram_dict: the seven dictionaries of indices, in the order above
                (tuple of dicts)
    :return: feat_indices_dict: dictionary idx->count, sorted by idx (OrderedDict)
    """
    (hw_hp_cw_cp_dict, hp_cw_cp_dict, hw_cw_cp_dict, hw_hp_cp_dict,
     hw_hp_cw_dict, hw_cw_dict, hp_cp_dict) = bigram_dict

    # the first family is not shifted, so its counts seed the merged dictionary
    bigram_indices = dict(extract_hw_hp_cw_cp_feat_indices(sample, hw_hp_cw_cp_dict))
    offset = len(hw_hp_cw_cp_dict)

    remaining_families = (
        (extract_hp_cw_cp_feat_indices, hp_cw_cp_dict),
        (extract_hw_cw_cp_feat_indices, hw_cw_cp_dict),
        (extract_hw_hp_cp_feat_indices, hw_hp_cp_dict),
        (extract_hw_hp_cw_feat_indices, hw_hp_cw_dict),
        (extract_hw_cw_feat_indices, hw_cw_dict),
        (extract_hp_cp_feat_indices, hp_cp_dict),
    )
    for extractor, feat_dict in remaining_families:
        update_dict(bigram_indices, extractor(sample, feat_dict), offset)
        offset += len(feat_dict)

    return OrderedDict(sorted(bigram_indices.items(), key=lambda kv: kv[0]))


def extract_bigram_feat_indices_pair(head, child, bigram_dict):
    """
    This function extracts the indices (in the feature vector) of the bigrams features
    of a single candidate edge head -> child:
    * (head_word, head_pos, child_word, child_pos)
    * (head_pos, child_word, child_pos)
    * (head_word, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_word, head_pos, child_word)
    * (head_word, child_word)
    * (head_pos, child_pos)
    The indices of every family are shifted by the total size of the
    dictionaries that precede it, so all families share one index space.
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: bigram_dict: the seven dictionaries of indices, in the order above
                (tuple of dicts)
    :return: feat_indices: sorted list of the active feature indices (list)
    """
    (hw_hp_cw_cp_dict, hp_cw_cp_dict, hw_cw_cp_dict, hw_hp_cp_dict,
     hw_hp_cw_dict, hw_cw_dict, hp_cp_dict) = bigram_dict

    # the first family is not shifted, so its indices seed the merged list
    bigram_indices = list(extract_hw_hp_cw_cp_feat_indices_pair(head, child, hw_hp_cw_cp_dict))
    offset = len(hw_hp_cw_cp_dict)

    remaining_families = (
        (extract_hp_cw_cp_feat_indices_pair, hp_cw_cp_dict),
        (extract_hw_cw_cp_feat_indices_pair, hw_cw_cp_dict),
        (extract_hw_hp_cp_feat_indices_pair, hw_hp_cp_dict),
        (extract_hw_hp_cw_feat_indices_pair, hw_hp_cw_dict),
        (extract_hw_cw_feat_indices_pair, hw_cw_dict),
        (extract_hp_cp_feat_indices_pair, hp_cp_dict),
    )
    for extractor, feat_dict in remaining_families:
        bigram_indices = update_list(bigram_indices, extractor(head, child, feat_dict), offset)
        offset += len(feat_dict)

    return sorted(bigram_indices)


def generate_bigram_feat_dict_minimal(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates the reduced set of bigram features dictionaries, such that
    in every dictionary each feature is given an index.
    The following features are generated for a given dataset (in this order):
    * (head_pos, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_pos, child_pos)
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: if to consider a feature with word that appears less than
                that in the dataset (int)
    :param: save_to_file: whether or not to save the dictionaries on the disk (bool)
    :param: word_hist: dictionary of words histogram in the dataset; generated
                once here when not given (dict)
    :return: bigram_feat_dict: the three dictionaries feature->index, in the
                order listed above (tuple of dicts)
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    # one builder per feature family; the order fixes the layout of the feature vector
    builders = (
        generate_hp_cw_cp_feat_dict,
        generate_hw_hp_cp_feat_dict,
        generate_hp_cp_feat_dict,
    )
    feat_dicts = tuple(
        build(path_to_file, word_threshold=word_threshold,
              save_to_file=save_to_file, word_hist=word_hist)
        for build in builders
    )
    print("total bigrams features: ", sum(len(feat_dict) for feat_dict in feat_dicts))
    return feat_dicts


def extract_bigram_feat_indices_minimal(sample, bigram_dict):
    """
    This function extracts the indices (in the feature vector) of the reduced
    set of bigrams features:
    * (head_pos, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_pos, child_pos)
    The indices of every family are shifted by the total size of the
    dictionaries that precede it, so all families share one index space.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: bigram_dict: the three dictionaries of indices, in the order above
                (tuple of dicts)
    :return: feat_indices_dict: dictionary idx->count, sorted by idx (OrderedDict)
    """
    hp_cw_cp_dict, hw_hp_cp_dict, hp_cp_dict = bigram_dict

    # the first family is not shifted, so its counts seed the merged dictionary
    merged = dict(extract_hp_cw_cp_feat_indices(sample, hp_cw_cp_dict))
    offset = len(hp_cw_cp_dict)

    for extractor, feat_dict in ((extract_hw_hp_cp_feat_indices, hw_hp_cp_dict),
                                 (extract_hp_cp_feat_indices, hp_cp_dict)):
        update_dict(merged, extractor(sample, feat_dict), offset)
        offset += len(feat_dict)

    return OrderedDict(sorted(merged.items(), key=lambda kv: kv[0]))


def extract_bigram_feat_indices_minimal_pair(head, child, bigram_dict):
    """
    This function extracts the indices (in the feature vector) of the reduced
    set of bigrams features of a single candidate edge head -> child:
    * (head_pos, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_pos, child_pos)
    The indices of every family are shifted by the total size of the
    dictionaries that precede it, so all families share one index space.
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: bigram_dict: the three dictionaries of indices, in the order above
                (tuple of dicts)
    :return: feat_indices: sorted list of the active feature indices (list)
    """
    hp_cw_cp_dict, hw_hp_cp_dict, hp_cp_dict = bigram_dict

    # the first family is not shifted, so its indices seed the merged list
    merged = list(extract_hp_cw_cp_feat_indices_pair(head, child, hp_cw_cp_dict))
    offset = len(hp_cw_cp_dict)

    for extractor, feat_dict in ((extract_hw_hp_cp_feat_indices_pair, hw_hp_cp_dict),
                                 (extract_hp_cp_feat_indices_pair, hp_cp_dict)):
        merged = update_list(merged, extractor(head, child, feat_dict), offset)
        offset += len(feat_dict)

    return sorted(merged)

In [46]:
"""
UNIGRAMS + BIGRAMS
"""
def extract_unigram_bigram_feat_indices(sample, dicts, minimal=False):
    """
    This function extracts the indices (in the feature vector) of the unigram
    features followed by the bigram features:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    * (head_word, head_pos, child_word, child_pos)
    * (head_pos, child_word, child_pos)
    * (head_word, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_word, head_pos, child_word)
    * (head_word, child_word)
    * (head_pos, child_pos)
    With `minimal=True` only the reduced bigram set is used. The bigram indices
    are shifted by the total size of the unigram dictionaries.
    :param: sample: the sample to extract features from (list of DepSample)
    :param: dicts: the dictionaries of indices [unigram_dicts, bigrams_dicts] (list)
    :param: minimal: whether or not to use the minimal version (bool)
    :return: feat_indices_dict: dictionary idx->count, sorted by idx (OrderedDict)
    """
    unigram_dicts, bigram_dicts = dicts[0], dicts[1]
    combined = copy.deepcopy(extract_unigram_feat_indices(sample, unigram_dicts))
    bigram_extractor = extract_bigram_feat_indices_minimal if minimal else extract_bigram_feat_indices
    bigram_counts = bigram_extractor(sample, bigram_dicts)
    unigram_total = sum(len(feat_dict) for feat_dict in unigram_dicts)
    update_dict(combined, bigram_counts, unigram_total)
    return OrderedDict(sorted(combined.items(), key=lambda kv: kv[0]))


def extract_unigram_bigram_feat_indices_pair(head, child, dicts, minimal=False):
    """
    This function extracts the indices (in the feature vector) of the unigram
    features followed by the bigram features of a single candidate edge head -> child:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    * (head_word, head_pos, child_word, child_pos)
    * (head_pos, child_word, child_pos)
    * (head_word, child_word, child_pos)
    * (head_word, head_pos, child_pos)
    * (head_word, head_pos, child_word)
    * (head_word, child_word)
    * (head_pos, child_pos)
    With `minimal=True` only the reduced bigram set is used. The bigram indices
    are shifted by the total size of the unigram dictionaries.
    :param: head: head DepSample (DepSample)
    :param: child: child DepSample (DepSample)
    :param: dicts: the dictionaries of indices [unigram_dicts, bigrams_dicts] (list)
    :param: minimal: whether or not to use the minimal version (bool)
    :return: feat_indices: sorted list of the active feature indices (list)
    """
    unigram_dicts, bigram_dicts = dicts[0], dicts[1]
    combined = copy.deepcopy(extract_unigram_feat_indices_pair(head, child, unigram_dicts))
    bigram_extractor = (extract_bigram_feat_indices_minimal_pair if minimal
                        else extract_bigram_feat_indices_pair)
    bigram_inds = bigram_extractor(head, child, bigram_dicts)
    unigram_total = sum(len(feat_dict) for feat_dict in unigram_dicts)
    combined = update_list(combined, bigram_inds, unigram_total)
    return sorted(combined)

In [13]:
word_hist = generate_word_hist_dict(path_to_file)
print(len(word_hist))
# print(word_hist)

14162


In [13]:
# Build the feature dictionaries used by the demo cells below: the unigram
# dictionaries, the full bigram set and the reduced ("minimal") bigram set.
# The commented-out calls build single bigram families with various thresholds.
unigram_feat_dict = generate_unigram_feat_dict(path_to_file)
# hw_hp_dict = generate_hw_hp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None)
# cw_cp_dict = generate_cw_cp_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None)
# hw_hp_cw_cp_dict = generate_hw_hp_cw_cp_feat_dict(path_to_file, word_threshold=2)
# hp_cw_cp_dict = generate_hp_cw_cp_feat_dict(path_to_file, word_threshold=0)
# hw_cw_cp_dict = generate_hw_cw_cp_feat_dict(path_to_file, word_threshold=4)
# hw_hp_cp_dict = generate_hw_hp_cp_feat_dict(path_to_file, word_threshold=2)
# hw_hp_cw_dict = generate_hw_hp_cw_feat_dict(path_to_file, word_threshold=1)
# hw_cw_dict = generate_hw_cw_feat_dict(path_to_file, word_threshold=1)
# hp_cp_dict = generate_hp_cp_feat_dict(path_to_file, word_threshold=1)
bigram_feat_dict = generate_bigram_feat_dict(path_to_file, word_threshold=0)
bigram_feat_dict_minimal = generate_bigram_feat_dict_minimal(path_to_file)

total (head_word, head_pos), (head_word), (head_pos) features:  18896
total (child_word, child_pos), (child_word), (child_pos) features:  30104
total unigrams features:  49000
total (head_word, head_pos, child_word, child_pos) features:  71232
total (head_pos, child_word, child_pos) features:  31314
total (head_word, child_word, child_pos) features:  70401
total (head_word, head_pos, child_pos) features:  33936
total (head_word, head_pos, child_word) features:  70679
total (head_word, child_word) features:  69819
total (head_pos, child_pos) features:  749
total bigrams features:  348130
total (head_pos, child_word, child_pos) features:  31314
total (head_word, head_pos, child_pos) features:  33936
total (head_pos, child_pos) features:  749
total bigrams features:  65999


In [33]:
extract_bigram_feat_indices_minimal_pair(s[2], s[4], bigram_feat_dict_minimal)

[4174, 35034, 65274]

In [38]:
s_feat_dict = extract_unigram_bigram_feat_indices(s, [unigram_feat_dict, bigram_feat_dict_minimal],
                                                   minimal=True)

In [39]:
s_feat_dict

OrderedDict([(27, 3),
             (432, 3),
             (433, 3),
             (18911, 1),
             (18938, 1),
             (18943, 1),
             (19142, 1),
             (19214, 1),
             (19626, 1),
             (20197, 1),
             (20198, 1),
             (20975, 1),
             (20976, 1),
             (23662, 1),
             (23663, 1),
             (50439, 1),
             (52608, 1),
             (53174, 1),
             (64983, 1),
             (80332, 1),
             (84034, 1),
             (84081, 1),
             (84245, 1),
             (114267, 1),
             (114274, 1),
             (114302, 1),
             (114511, 1)])

In [19]:
# Demo: extract the head-side unigram counts (first unigram dictionary), the
# child-side unigram counts (second unigram dictionary) and the combined result.
# NOTE(review): `s` is a sample defined in a cell outside this section -
# run that cell first.
hw_hp_ind = extract_hw_hp_feat_indices(s, unigram_feat_dict[0])
cw_cp_ind = extract_cw_cp_feat_indices(s, unigram_feat_dict[1])
unigrams_ind = extract_unigram_feat_indices(s, unigram_feat_dict)
print(hw_hp_ind)
print(cw_cp_ind)
print(unigrams_ind)

{432: 3, 433: 3, 27: 3}
{2079: 1, 2080: 1, 318: 1, 730: 1, 246: 1, 47: 1, 1301: 1, 1302: 1, 15: 1, 4766: 1, 4767: 1, 42: 1}
OrderedDict([(27, 3), (432, 3), (433, 3), (18911, 1), (18938, 1), (18943, 1), (19142, 1), (19214, 1), (19626, 1), (20197, 1), (20198, 1), (20975, 1), (20976, 1), (23662, 1), (23663, 1)])


In [47]:
# Usage example: how to extract all features for a (head, child) pair,
# first with the minimal bigram dictionaries, then with the full set.
# NOTE(review): `s` is a sample defined in a cell outside this section.
all_features_inds = extract_unigram_bigram_feat_indices_pair(s[2], s[4],
                                                             [unigram_feat_dict, bigram_feat_dict_minimal],
                                                             minimal=True)
print(all_features_inds)

all_features_inds = extract_unigram_bigram_feat_indices_pair(s[2], s[4],
                                                             [unigram_feat_dict, bigram_feat_dict],
                                                             minimal=False)
print(all_features_inds)

[27, 432, 433, 18938, 23662, 23663, 53174, 84034, 114274]
[27, 432, 433, 18938, 23662, 23663, 59431, 124406, 161944, 225667, 266285, 336930, 396405]


In [29]:
def sample_to_successors(sample):
    """
    This function converts a sample given as a list of DepSample into the
    Graph successors form: a map between every head and the list of its children.
    The root entry of the sample itself is skipped; children are listed in the
    order they appear in the sample.
    :param: sample: the original sample (list of DepSample)
    :return: succ_rep: dictionary head->list_of_children (dict)
    """
    succ_rep = {}
    for node in sample:
        if node.token != ROOT:
            # create the children list on first sight of this head, then extend it
            succ_rep.setdefault(node.head, []).append(node.idx)
    return succ_rep


def sample_to_full_successors(N):
    """
    This function builds the Graph successors form of a FULLY CONNECTED graph
    over a sentence: every node 0..N (0 is the root) is mapped to all the
    nodes 1..N except itself, so the root is never anyone's child.
    :param: N: length of the sentence (int)
    :return: succ_rep: dictionary head->list_of_children (dict)
    """
    return {
        head: [child for child in range(1, N + 1) if child != head]
        for head in range(N + 1)
    }


def successors_to_sample(sample_no_head, succ_rep):
    """
    This function converts a successors representation back to a list of DepSample:
    every child listed in `succ_rep` is copied from `sample_no_head` with its
    head filled in, a root entry is added, and the result is ordered by idx.
    :param: sample_no_head: list of DepSample where s.head=None
    :param: succ_rep: dictionary head->list_of_children (dict)
    :return: sample_with_head (list of DepSamples)
    """
    rebuilt = [DepSample(0, ROOT, ROOT, 0)]
    for head, children in succ_rep.items():
        for child in children:
            source = sample_no_head[child]
            rebuilt.append(DepSample(source.idx, source.token, source.pos, head))
    rebuilt.sort(key=lambda node: node.idx)
    return rebuilt

In [26]:
# Round-trip check: convert the sample to its successors map and back; the
# reconstructed sample printed last should match the original printed first.
# NOTE(review): `s` is a sample defined in a cell outside this section.
succ = sample_to_successors(s)
recon_s = successors_to_sample(s, succ)
print(s)
print(succ)
print(recon_s)

[DepSample(idx=0, token='*', pos='*', head=0), DepSample(idx=1, token='What', pos='WP', head=2), DepSample(idx=2, token="'s", pos='VBZ', head=0), DepSample(idx=3, token='next', pos='JJ', head=2), DepSample(idx=4, token='?', pos='.', head=2)]
{2: [1, 3, 4], 0: [2]}
[DepSample(idx=0, token='*', pos='*', head=0), DepSample(idx=1, token='What', pos='WP', head=2), DepSample(idx=2, token="'s", pos='VBZ', head=0), DepSample(idx=3, token='next', pos='JJ', head=2), DepSample(idx=4, token='?', pos='.', head=2)]


In [30]:
sample_to_full_successors(8)

{0: [1, 2, 3, 4, 5, 6, 7, 8],
 1: [2, 3, 4, 5, 6, 7, 8],
 2: [1, 3, 4, 5, 6, 7, 8],
 3: [1, 2, 4, 5, 6, 7, 8],
 4: [1, 2, 3, 5, 6, 7, 8],
 5: [1, 2, 3, 4, 6, 7, 8],
 6: [1, 2, 3, 4, 5, 7, 8],
 7: [1, 2, 3, 4, 5, 6, 8],
 8: [1, 2, 3, 4, 5, 6, 7]}

In [48]:
class DepOptimizer:
    """
    This helper class holds the weights of a parser model and given a sentence,
    calculates scores for edges.
    """
    def __init__(self, w, sample, path_to_train_file=None, dicts=None, feature_extractor=None, minimal=True):
        """
        Initialize an optimizer.
        :param: w: weights (list)
        :param: sample: current sample (list of DepSample)
        :param: path_to_train_file: training file used to build the feature
                dictionaries when `dicts` is None; defaults to './data/train.labeled' (str)
        :param: dicts: dictionaries of features (list of dicts); when None they
                are generated from `path_to_train_file`
        :param: feature_extractor: function to extract feature for (head, child),
                called as feature_extractor(head, child, dicts, minimal);
                defaults to extract_unigram_bigram_feat_indices_pair
        :param: minimal: whether or not to use the minimal version of the features
        """
        self.w = w
        self.sample = sample
        self.minimal = minimal
        if path_to_train_file is None:
            self.path_to_train_file = './data/train.labeled'
        else:
            self.path_to_train_file = path_to_train_file
        if dicts is None:
            # Build the dictionaries from the file this optimizer was given,
            # not from an unrelated notebook-level variable.
            if minimal:
                bigram_builder = generate_bigram_feat_dict_minimal
            else:
                bigram_builder = generate_bigram_feat_dict
            self.dicts = [generate_unigram_feat_dict(self.path_to_train_file),
                          bigram_builder(self.path_to_train_file)]
        else:
            # Caller-supplied dictionaries must be stored, otherwise get_score
            # fails with AttributeError on self.dicts.
            self.dicts = dicts
        if feature_extractor is None:
            self.feature_extractor = extract_unigram_bigram_feat_indices_pair
        else:
            self.feature_extractor = feature_extractor

    def get_score(self, head_int, child_int):
        """
        Calculates a score for an edge between `head_int` to `child_int`:
        the sum of the weights at the feature indices returned by the
        feature extractor for that (head, child) pair.
        :param: head_int: head node id (int)
        :param: child_int: child node id (int)
        :return: score: score for the edge (float)
        """
        features_inds = self.feature_extractor(self.sample[head_int], self.sample[child_int], self.dicts, self.minimal)
        # asarray avoids a copy when the weights are already an ndarray.
        w = np.asarray(self.w)
        return np.sum(w[features_inds])

    def update_weights(self, w):
        """
        Updates the optimizer current weights.
        :param: w: weights (list)
        """
        self.w = w

    def update_sample(self, sample):
        """
        Updates current sample.
        :param: sample: new current sample (list of DepSample)
        """
        self.sample = sample

In [19]:
# Feature indices extracted for the pair (s[1], s[2]) with the unigram dictionary.
# NOTE(review): presumably the argument order is (head, child) - confirm against
# the definition of extract_unigram_feat_indices_pair.
extract_unigram_feat_indices_pair(s[1], s[2], unigram_feat_dict)

[8481, 11466, 11467, 18943, 19142, 19626]

In [28]:
# Scratch check: adding a scalar to an array offsets every element, and
# tolist() turns the result back into a plain Python list.
(np.array([5, 2]) + 3).tolist()

[8, 5]

In [3]:
# Unlabeled file that is later passed to generate_labeled_file.
path_to_comp_file = './data/comp.unlabeled'

In [25]:
def sample_to_lines(sample, ref_lines):
    """
    Produces output lines for a sample: every original line is kept as is,
    except for the head column (index 6), which is replaced by the head of
    the matching entry in `sample`.
    :param: sample: list of DepSamples whose first entry is ROOT (list)
    :param: ref_lines: list of original lines, one per non-ROOT entry (list of str)
    :returns: res_lines: tab-separated lines without trailing newline (list of str)
    """
    def with_head(raw_line, head):
        columns = raw_line.rstrip().split('\t')
        columns[6] = str(head)
        return '\t'.join(columns)

    # Position 0 is ROOT and has no line of its own, so entry k maps to line k-1.
    return [with_head(ref_lines[position - 1], entry.head)
            for position, entry in enumerate(sample) if position]
        

def generate_labeled_file(path_to_unlabeled_file):
    """
    This function writes the samples of an unlabeled file to a new file, in
    the same format as the original file, with the head column (index 6) of
    every token line taken from the sample.
    The output is written to `path_to_unlabeled_file + '.labeled'`.
    NOTE: inference is not wired in yet (the `infer` call is commented out),
    so each token keeps the head value that was read from the input.
    :param: path_to_unlabeled_file: path to location of the file (str)
    """
    root = DepSample(0, ROOT, ROOT, 0)
    path_to_labeled = path_to_unlabeled_file + '.labeled'

    def write_sample(fw, sample, lines):
        # Writes the lines of one sample followed by the blank separator line.
        # infered_sample = self.infer(sample)
        infered_sample = sample
        for l in sample_to_lines(infered_sample, lines):
            fw.write(l)
            fw.write('\n')
        fw.write('\n')

    # Open the input first so a missing input does not leave an empty output file.
    with open(path_to_unlabeled_file) as fr, open(path_to_labeled, 'w') as fw:
        sample = [root]
        lines = []
        for line in fr:
            if not line.rstrip():
                # end of sample
                write_sample(fw, sample, lines)
                sample = [root]
                lines = []
            else:
                lines.append(line)
                ls = line.rstrip().split('\t')
                try:
                    head = int(ls[6])
                except ValueError:
                    # unlabeled files carry a non-numeric placeholder here
                    head = ls[6]
                sample.append(DepSample(int(ls[0]), ls[1], ls[3], head))
        if len(sample) > 1:
            # The file did not end with a blank line: flush the last sample
            # instead of silently dropping it.
            write_sample(fw, sample, lines)
    print("finished generating labeled file of ", path_to_unlabeled_file, " @ ", path_to_labeled)
                
    

In [26]:
# Writes './data/comp.unlabeled.labeled' next to the input file (see output below).
generate_labeled_file(path_to_comp_file)

finished generating labeled file of  ./data/comp.unlabeled  @  ./data/comp.unlabeled.labeled


In [7]:
# Scratch check of the head-parsing fallback: int() raises ValueError on the
# "_" placeholder, so the raw string is kept instead.
try:
    a = int("_")
except ValueError:
    a = "_"
    print(a)

_
