In [1]:
import numpy as np
import torch
# Get the unique words in a rcv1 file as a dictionary; number those words
def get_dict_from_file(filename):
    f = open(filename,'r')
    line_num = 0
    article_num = 0
    word_num = 0
    words = {}
    for line in f:
        if(line[0]!='.'): # the last line is 171541
            for word in line.split():
                if word not in words:
                    words[word] = word_num
                    word_num+=1
        line_num+=1
    print(len(words))
    return words
# Get the unique labels from the rcv1 raw label file as dictionary; number those labels
def get_dict_from_label_file(filename):
    f = open(filename,'r')
    word_num = 0
    words = {}
    for line in f:
        label = line.split()[0]
        if label not in words:
            # the +4 is necessary because MrMP deletes label 0-3 (it thinks they are special characters)
            words[label] = word_num+4 
            word_num+=1
    # inserts special characters expected by dictionary
    words['<blank>'] = 0
    words['<unk>'] = 1
    words['<s>'] = 2
    words['</s>'] = 3
    print(len(words))
    return words
# rcv1 dataset has numbered articles. The goal is to have a dictionary where the key is the article
# number and the value is a list of label numbers for that article. 
# num_of_label is a dictionary mapping label name to its number
def get_article_labels_dict(filename):
    article_labels = {}
    f = open(filename,'r')
    num_of_label = get_dict_from_label_file(filename)
    for line in f:
        label = line.split()[0]
        article_num = line.split()[1]
        if article_num not in article_labels:
            article_labels[article_num] = [num_of_label[label]]
        else:
            article_labels[article_num].append(num_of_label[label])
    return article_labels
    
    

In [2]:
# creates the feature sequence, which is a list containing sublists, one sublist for each article.
# Each sublist contains a sequence of words, using numbers corresponding to their dictionary value
# as specified by the dictionary "words". A label sequence is constructed
# in a similar manner. Both feature sequence and label sequence are returned. 
# article_labels_dict is a dictionary mapping article numbers to a list of labels
def get_sequences_from_file(filename, words,article_labels_dict):
    article_num = -1
    feature_sequence = []
    label_sequence = []
    current = []
    
    f = open(filename,'r')
    for line in f:
        if line[0:2] == ".I": # each article will begin with ".I 2286\n", where 2286 is the article num
            article_num += 1
            label_sequence.append(article_labels_dict[line[3:].strip()])
            if (article_num!=0):
                feature_sequence.append(current)
                current = []
        elif (line[0]!='.'): # the last line is 171541
            for word in line.split():
                # sometimes the test file may have words that do not appear in the dictionary
                # because they are not in training. We ignore these
                if word in words: 
                    current.append(words[word])
        
    feature_sequence.append(current)
    return feature_sequence, label_sequence


In [3]:
import argparse
data = {}
data['settings'] = argparse.Namespace()
data['dict'] = {}
data['dict']['src'] = get_dict_from_file('lyrl2004_tokens_train.dat')
data['dict']['tgt'] = get_dict_from_label_file('rcv1-v2.topics.qrels')
article_labels_dict = get_article_labels_dict('rcv1-v2.topics.qrels')
# features
train_features, train_labels = get_sequences_from_file('lyrl2004_tokens_train.dat', data['dict']['src'], article_labels_dict)
test_features,test_labels = get_sequences_from_file('lyrl2004_tokens_test_pt0.dat', data['dict']['src'], article_labels_dict)



# select random instances for validation

valid_index = np.random.choice(len(train_features), int((len(train_features)+len(test_features))*0.1), replace=False)
valid_from_src = np.array(train_features,dtype=object)[valid_index].tolist()
valid_from_tgt = np.array(train_labels,dtype=object)[valid_index].tolist()
train_features = np.delete(train_features,valid_index,0).tolist()
train_labels = np.delete(train_labels,valid_index,0).tolist()

data['train'] = {}
data['train']['src'] = train_features
data['train']['tgt'] = train_labels
data['valid'] = {}
data['valid']['src'] = valid_from_src
data['valid']['tgt'] = valid_from_tgt
data['test'] = {}
data['test']['src'] = test_features
data['test']['tgt'] = test_labels


47152
107
107


  return array(a, dtype, copy=False, order=order)


In [4]:
torch.save(data,'train_valid_test.pt')

In [79]:
import numpy as np
def max_of_jagged(array):
    max(max(array,key=max))
temp = [[1,2],[3,4,5],[6,7,8,9]]
temp = np.delete(temp,[0,1],0).tolist()
temp

[[6, 7, 8, 9]]

In [24]:
article_labels['2286          \n']

KeyError: '2286          \n'