In [17]:
import time
import os
import pickle
import numpy as np
from collections import OrderedDict
from collections import namedtuple
from random import shuffle
# named tuple has methods like _asdict()
# Placeholder string used as both the token and the POS tag of the artificial
# root entry that dep_sample_generator puts at index 0 of every sample.
ROOT = "*"

In [2]:
# One entry of a sample: its counter in the sentence, the token text, the POS
# tag, and the index (within the sample list) of its head entry.
DepSample = namedtuple('DepSample', ['idx', 'token', 'pos', 'head'])

In [3]:
# Demo: build a DepSample from one raw tab-separated line, keeping columns
# 0, 1, 3 and 6. The fields are left as strings here (no int conversion).
example = [col for i, col in enumerate("1	Pierre	_	NNP	_	_	2	NAME	_	_".split('\t')) if i in (0, 1, 3, 6)]
example = DepSample(*example)

In [35]:
def dep_sample_generator(path_to_file):
    """
    Lazily read a tab-separated dependency file and yield one sample at a time.

    Samples are separated by blank (whitespace-only) lines. Every yielded sample
    is a list that starts with an artificial root entry (idx 0, head 0, token and
    pos equal to ROOT), followed by the sample's tokens in file order.
    Columns used from each line (0-based): 0 -> idx, 1 -> token, 3 -> pos, 6 -> head;
    idx and head are converted to int.

    Leading, repeated or trailing blank lines never produce a sample: a sample is
    only yielded once it holds at least one real token.

    Because this is a generator, the file-existence check runs when the first
    item is requested, not when the function is called.
    :param: path_to_file: string to the location of the file to read from (str)
    :return: sample (list of DepSample)
    """
    assert os.path.isfile(path_to_file), "File does not exist"
    root = DepSample(0, ROOT, ROOT, 0)
    with open(path_to_file) as fp:
        sample = [root]
        for line in fp:
            stripped = line.rstrip()
            if stripped:
                ls = stripped.split('\t')
                sample.append(DepSample(int(ls[0]), ls[1], ls[3], int(ls[6])))
            elif len(sample) > 1:
                # a blank line closes the current sample; skip it when nothing
                # but the root has been collected (consecutive blank lines)
                yield sample
                sample = [root]
        # the last sample may not be followed by a blank line
        if len(sample) > 1:
            yield sample
            

In [5]:
# Labeled dataset used by the cells below (path is relative to the working directory).
path_to_file = './data/train.labeled'

In [39]:
def _iter_raw_samples(path_to_file):
    """Yield every non-empty sample of the file as the list of its raw text lines."""
    with open(path_to_file) as fp:
        sample = []
        for line in fp:
            if line.rstrip():
                sample.append(line)
            elif sample:
                # blank line closes a sample; runs of blank lines are ignored
                yield sample
                sample = []
        # the last sample may not be followed by a blank line
        if sample:
            yield sample


def split_train_validation(path_to_file, valid_amount=0.2):
    """
    This function takes a train dataset and splits it into training
    and validation sets according to `valid_amount`.

    The split is positional (no shuffling): the first samples of the file go to
    `<path_to_file>.train.labeled` and the remaining ones to
    `<path_to_file>.valid.labeled`. Both files are overwritten if they exist.
    Samples are written as their original lines followed by one blank line;
    empty samples (caused by repeated or trailing blank lines) are neither
    counted nor written.
    :param: path_to_file: path to file containing the dataset (str)
    :param: valid_amount: fraction of the samples to take for validation, e.g. 0.2 (float)
    :return: train_file_path: path to file containing the training samples (str)
    :return: valid_file_path: path to file containing the validation samples (str)
    """
    assert os.path.isfile(path_to_file), "File does not exist"
    path_train_file = path_to_file + ".train.labeled"
    path_valid_file = path_to_file + ".valid.labeled"
    # first pass: count samples with the same grouping used for writing, so the
    # reported numbers always match what ends up in the two files
    total_samples = sum(1 for _ in _iter_raw_samples(path_to_file))
    print("total samples ", total_samples)
    num_validation = int(valid_amount * total_samples)
    num_training = total_samples - num_validation
    # second pass: route each sample to its output file; `with` closes both
    # files even if reading or writing fails
    with open(path_train_file, 'w') as t_file, open(path_valid_file, 'w') as v_file:
        for sample_i, sample in enumerate(_iter_raw_samples(path_to_file)):
            out_file = t_file if sample_i < num_training else v_file
            out_file.writelines(sample)
            if not sample[-1].endswith('\n'):
                # last line of the source file had no newline terminator
                out_file.write('\n')
            out_file.write('\n')
    print("num training: ", num_training, " saved @ ", path_train_file)
    print("num validation: ", num_validation, " saved @ ", path_valid_file)
    return path_train_file, path_valid_file
           

In [40]:
# Writes path_to_file + ".train.labeled" and path_to_file + ".valid.labeled".
split_train_validation(path_to_file, valid_amount=0.2)

total samples  5000
num training:  4000  saved @  ./data/train.labeled.train.labeled
num validation:  1000  saved @  ./data/train.labeled.valid.labeled


In [41]:
# Locations of the two files produced by the split of './data/train.labeled'.
# NOTE(review): `path_to_Valid_file` has a capital V, unlike the other path names.
path_to_train_file = './data/train.labeled.train.labeled'
path_to_Valid_file = './data/train.labeled.valid.labeled'

In [44]:
# Print the sample at position 100 of the training split. The loop is not
# stopped early, so afterwards `s` is bound to the last sample of the file.
samp_gen = dep_sample_generator(path_to_file=path_to_train_file)
for s_i, s in enumerate(samp_gen):
    if s_i == 100:
        print(s)

[DepSample(idx=0, token='*', pos='*', head=0), DepSample(idx=1, token='Alan', pos='NNP', head=2), DepSample(idx=2, token='Spoon', pos='NNP', head=9), DepSample(idx=3, token=',', pos=',', head=2), DepSample(idx=4, token='recently', pos='RB', head=5), DepSample(idx=5, token='named', pos='VBN', head=2), DepSample(idx=6, token='Newsweek', pos='NNP', head=7), DepSample(idx=7, token='president', pos='NN', head=5), DepSample(idx=8, token=',', pos=',', head=2), DepSample(idx=9, token='said', pos='VBD', head=0), DepSample(idx=10, token='Newsweek', pos='NNP', head=13), DepSample(idx=11, token="'s", pos='POS', head=10), DepSample(idx=12, token='ad', pos='NN', head=13), DepSample(idx=13, token='rates', pos='NNS', head=14), DepSample(idx=14, token='would', pos='MD', head=9), DepSample(idx=15, token='increase', pos='VB', head=14), DepSample(idx=16, token='5', pos='CD', head=17), DepSample(idx=17, token='%', pos='NN', head=15), DepSample(idx=18, token='in', pos='IN', head=15), DepSample(idx=19, token

## Dependency Sample

number of tokens in a sample (not counting the artificial ROOT entry at index 0) = `sample[-1].idx`

POS tags (the first element is the ROOT placeholder `*`) = `[s.pos for s in sample]`

In [7]:
# `s` is not defined in this cell: it is whatever sample a previously executed
# cell left bound, so the printed values depend on execution order.
sample_len = s[-1].idx  # idx of the last entry
s_tags = [l.pos for l in s]  # includes the POS of the root entry at index 0
print("sample length: ", sample_len)
print("POS tags: ", s_tags)

sample length:  13
POS tags:  ['*', 'NNP', 'NNP', 'VBZ', 'NN', 'IN', 'NNP', 'NNP', ',', 'DT', 'NNP', 'VBG', 'NN', '.']


## Features
### Unigram
* (head_word, head_pos)
* (head_word)
* (head_pos)
* (child_word, child_pos)
* (child_word)
* (child_pos)

### Bigram
* (head_word, head_pos, child_word, child_pos)
* (head_pos, child_word, child_pos)
* (head_word, child_word, child_pos)
* (head_word, head_pos, child_pos)
* (head_word, head_pos, child_word)
* (head_word, child_word)
* (head_pos, child_pos)

In [8]:
def generate_word_hist_dict(path_to_file, save_to_file=False):
    """
    Count how often every token occurs in the dataset.

    Entries whose token equals ROOT are not counted. The result is ordered from
    the most to the least frequent token; tokens with equal counts keep the
    order in which they were first seen.
    :param: path_to_file: path to location of the dataset (str)
    :param: save_to_file: when True, pickle the histogram to
                          `<path_to_file>.word.hist` (bool)
    :return: word_hist: OrderedDict word->word_count
    """
    counts = {}
    for sample in dep_sample_generator(path_to_file):
        for entry in sample:
            if entry.token != ROOT:
                counts[entry.token] = counts.get(entry.token, 0) + 1
    # descending by count; the sort is stable, so ties stay in first-seen order
    by_frequency = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    word_hist = OrderedDict(by_frequency)
    if save_to_file:
        path = path_to_file + ".word.hist"
        with open(path, 'wb') as fp:
            pickle.dump(word_hist, fp)
        print("word histogram dictionary saved @ ", path)
    return word_hist

def sample_to_unigrams(sample):
    """
    Build the unigram features of a sample (list of DepSample).

    For every entry whose token is not ROOT, six features are appended, in
    this order:
    * (head_word, head_pos)
    * head_word
    * head_pos
    * (child_word, child_pos)
    * child_word
    * child_pos
    The head is looked up as `sample[entry.head]`. The two-element features are
    tuples; the single-element ones are plain strings (not 1-tuples).
    :param: sample: list of sample components (list of DepSample)
    :return: flat list of the sample's unigram features (tuples and strings)
    """
    unigrams = []
    for child in sample:
        if child.token != ROOT:
            head = sample[child.head]
            unigrams += [
                (head.token, head.pos),
                head.token,
                head.pos,
                (child.token, child.pos),
                child.token,
                child.pos,
            ]
    return unigrams
    
    
def generate_unigram_feat_dict(path_to_file, word_threshold=0, save_to_file=False, word_hist=None):
    """
    This function generates a features dictionary, such that every feature gets an index.
    Indices are assigned in order of first appearance, starting from 0.
    The following features are generated for a given dataset:
    * (head_word, head_pos)
    * (head_word)
    * (head_pos)
    * (child_word, child_pos)
    * (child_word)
    * (child_pos)
    A feature is dropped when any of its components is present in `word_hist`
    with a count below `word_threshold`. Components that are missing from the
    histogram are never filtered.
    NOTE(review): components are looked up by string only, so a POS tag that is
    spelled like a token (e.g. ',') is thresholded by that token's count.
    :param: path_to_file: path to location of the dataset (str)
    :param: word_threshold: minimal number of appearances a word needs for its features to be kept (int)
    :param: save_to_file: when True, pickle the dictionary to `<path_to_file>.unigrams.dict` (bool)
    :param: word_hist: dictionary of words histogram in the dataset; computed from
                       the dataset when missing or empty (dict)
    :return: unigram_feat_dict: OrderedDict feature->index
    """
    if not word_hist:
        word_hist = generate_word_hist_dict(path_to_file)
    # insertion order equals index order, so no re-sorting is needed at the end
    unigram_feat_dict = OrderedDict()
    for sample in dep_sample_generator(path_to_file):
        for feat in sample_to_unigrams(sample):
            if feat in unigram_feat_dict:
                continue
            # Single-component features arrive as bare strings. Wrap them so the
            # whole word is checked; iterating the string itself would test its
            # individual characters against the histogram.
            components = (feat,) if isinstance(feat, str) else feat
            too_rare = False
            for word in components:
                count = word_hist.get(word)
                if count and count < word_threshold:
                    too_rare = True
                    break
            if too_rare:
                continue
            unigram_feat_dict[feat] = len(unigram_feat_dict)
    print("total unigrams features: ", len(unigram_feat_dict))
    if save_to_file:
        path = path_to_file + ".unigrams.dict"
        with open(path, 'wb') as fp:
            pickle.dump(unigram_feat_dict, fp)
        print("saved unigrams features dictionary @ ", path)
    return unigram_feat_dict
    

In [9]:
# Token histogram of the full labeled dataset; print only the number of distinct tokens.
word_hist = generate_word_hist_dict(path_to_file)
print(len(word_hist))

14162


In [15]:
# Default word_threshold=0, so no feature is filtered out by word frequency.
unigram_feat_dict = generate_unigram_feat_dict(path_to_file)

total unigrams features:  30106


In [16]:
# Display the feature -> index mapping (this renders the entire dictionary).
unigram_feat_dict

OrderedDict([(('Vinken', 'NNP'), 0),
             ('Vinken', 1),
             ('NNP', 2),
             (('Pierre', 'NNP'), 3),
             ('Pierre', 4),
             (('will', 'MD'), 5),
             ('will', 6),
             ('MD', 7),
             ((',', ','), 8),
             (',', 9),
             (('years', 'NNS'), 10),
             ('years', 11),
             ('NNS', 12),
             (('61', 'CD'), 13),
             ('61', 14),
             ('CD', 15),
             (('old', 'JJ'), 16),
             ('old', 17),
             ('JJ', 18),
             (('*', '*'), 19),
             ('*', 20),
             (('join', 'VB'), 21),
             ('join', 22),
             ('VB', 23),
             (('board', 'NN'), 24),
             ('board', 25),
             ('NN', 26),
             (('the', 'DT'), 27),
             ('the', 28),
             ('DT', 29),
             (('as', 'IN'), 30),
             ('as', 31),
             ('IN', 32),
             (('director', 'NN'), 33),
           

In [19]:
# Scratch check of random.shuffle: it reorders the list in place. No seed is
# set, so the displayed order differs between runs.
a = [1,2,3]
shuffle(a)
a

[2, 1, 3]