In [74]:
import json
from os import path
from nltk.tokenize import word_tokenize as tokenize
import nltk
import itertools
import numpy as np

In [75]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [76]:
WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz '
VOCAB_SIZE = 1200
UNK = 'unk'
limit = {
    'max_descriptions' : 400,
    'min_descriptions' : 0,
    'max_headings' : 20,
    'min_headings' : 0,
}

In [77]:
def load_raw_data(filename):
    #################################
    #   Loads raw data from file    #
    #################################

    with open(filename, 'r') as fp:
        raw_data = json.load(fp)

    print('Loaded {:,} articles from {}'.format(len(raw_data), filename))
    return raw_data

In [78]:
raw_data = load_raw_data('raw_data.json')

Loaded 21 articles from raw_data.json


In [79]:
raw_data[1]

{'abstract': 'crimean parliament adopts new constitution .',
 'article': "the crimean parliament adopted a new compromise constitution wednesday , effectively ending a bitter three-year secession bid from the black sea peninsula 's ukrainian overlords .   deputies in the ##-seat chamber voted by ## votes to six in favor of the new charter , which shies away from separatist rhetoric and describes the southern region as an `` autonomous republic constituting an integral part of ukraine .   '' the constitution added that relations between the `` crimean republic '' and ukraine would be determined by their respective constitutions .   crimea , which was transferred from russia to ukraine in #### as a gift from former soviet leader nikita khrushchev , is populated predominantly by ethnic russians who have repeatedly demanded more autonomy from kiev and greater ties to moscow .   but a secession drive was scuttled after ukrainian authorities ousted the pro-russian president yury meshkov in m

In [80]:
import config

In [81]:
def tokenize_sentence(sentence):
    ######################################
    #   Splits article into sentences    #
    ######################################

    return ' '.join(list(tokenize(sentence)))

def article_is_complete(article):
    ###########################################################
    #   Checks if article has both heading and description    #
    ###########################################################

    if ('abstract' not in article) or ('article' not in article):
        return False
    if (article['abstract'] is None) or (article['article'] is None):
        return False

    return True

def tokenize_articles(raw_data):
    #########################################################################
    #   Tokenizes raw data and creates list of headings and descriptions    #
    #########################################################################

    headings, descriptions = [], []
    num_articles = len(raw_data)

    for i, a in enumerate(raw_data):
        if article_is_complete(a):
            headings.append(tokenize_sentence(a['abstract']))
            descriptions.append(tokenize_sentence(a['article']))
        if i % config.print_freq == 0:
            print('Tokenized {:,} / {:,} articles'.format(i, num_articles))

    return (headings, descriptions)


In [82]:
#tokenize articles and separate into headings and descriptions
headings, descriptions = tokenize_articles(raw_data)

Tokenized 0 / 21 articles
Tokenized 5 / 21 articles
Tokenized 10 / 21 articles
Tokenized 15 / 21 articles
Tokenized 20 / 21 articles


In [83]:
def filter(line, whitelist):
    ##############################################################
    #   Filters out all characters which are not in whitelist    #
    ##############################################################

    return ''.join([ch for ch in line if ch in whitelist])


def filter_length(headings, descriptions):
    ######################################################################
    #   Filters based on heading and description length defined above    #
    ######################################################################

    if len(headings) != len(descriptions):
        raise Exception('Number of headings does not match number of descriptions!')

    filtered_headings, filtered_descriptions = [], []

    for i in range(0, len(headings)):
        heading_length = len(headings[i].split(' '))
        description_length = len(descriptions[i].split(' '))

        if description_length >= limit['min_descriptions'] and description_length <= limit['max_descriptions']:
            if heading_length >= limit['min_headings'] and heading_length <= limit['max_headings']:
                filtered_headings.append(headings[i])
                filtered_descriptions.append(descriptions[i])

    print ('Length of filtered headings: {:,}'.format(len(filtered_headings)))
    print ('Length of filtered descriptions: {:,}'.format(len(filtered_descriptions)))

    return (filtered_headings, filtered_descriptions)


In [84]:
headings[12]

'czechs outdrink germans in beer-guzzling stakes .'

In [85]:
#keep only whitelisted characters and articles satisfying the length limits
headings = [filter(heading, WHITELIST) for heading in headings]
descriptions = [filter(sentence, WHITELIST) for sentence in descriptions]
headings, descriptions = filter_length(headings, descriptions)

Length of filtered headings: 17
Length of filtered descriptions: 17


In [86]:
headings

['sri lanka closes schools as war escalates ',
 'crimean parliament adopts new constitution ',
 'arafat doubts gaza attack will affect redeployment ',
 'six voted us athletics honor ',
 'argentine to extradite nazi ss captain to italy ',
 'injury worry for sampras ',
 'croatians vote in three constituencies ',
 'serb army accuses police of pulling back from front lines ',
 'us firm bids    billion pounds for seeboard ',
 'us agency to approve new antiaids drugs ',
 'gerry adams returning to us for peace process support ',
 'closure of territories to be partially lifted tuesday ',
 'czechs outdrink germans in beerguzzling stakes ',
 'opposition activists paralyse calcutta ',
 'abb to sell electrical explosionproof equipment business ',
 'moslem rights group wants narcotic khat legalised ',
 'brazilian families to get reparations for military regime killings ']

In [87]:
def index_data(tokenized_sentences, vocab_size):
    #####################################################
    #   Forms vocab, and idx2word and word2idx dicts    #
    #####################################################

    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    vocab = freq_dist.most_common(vocab_size)
    print ('Vocab length: {:,}'.format(len(vocab)))

    idx2word = ['_'] + [UNK] + [x[0] for x in vocab]
    word2idx = dict([(w, i) for i, w in enumerate(idx2word)])

    return (idx2word, word2idx, freq_dist)

In [88]:
#convert list of sentences into list of list of words
word_tokenized_headings = [word_list.split(' ') for word_list in headings]
word_tokenized_descriptions = [word_list.split(' ') for word_list in descriptions]

#indexing
idx2word, word2idx, freq_dist = index_data(word_tokenized_headings + word_tokenized_descriptions, VOCAB_SIZE)

Vocab length: 1,200


In [91]:
word2idx

{'_': 0,
 'unk': 1,
 '': 2,
 'the': 3,
 'of': 4,
 'to': 5,
 'in': 6,
 'a': 7,
 'and': 8,
 'for': 9,
 'said': 10,
 'on': 11,
 'that': 12,
 's': 13,
 'by': 14,
 'was': 15,
 'as': 16,
 'an': 17,
 'from': 18,
 'with': 19,
 'is': 20,
 'be': 21,
 'he': 22,
 'will': 23,
 'us': 24,
 'are': 25,
 'has': 26,
 'against': 27,
 'which': 28,
 'lrb': 29,
 'rrb': 30,
 'new': 31,
 'government': 32,
 'minister': 33,
 'who': 34,
 'we': 35,
 'but': 36,
 'west': 37,
 'at': 38,
 'week': 39,
 'police': 40,
 'wednesday': 41,
 'have': 42,
 'their': 43,
 'it': 44,
 'not': 45,
 'sunday': 46,
 'percent': 47,
 'army': 48,
 'after': 49,
 'were': 50,
 'last': 51,
 'this': 52,
 'its': 53,
 'priebke': 54,
 'his': 55,
 'company': 56,
 'drugs': 57,
 'peace': 58,
 'abb': 59,
 'here': 60,
 'leader': 61,
 'had': 62,
 'party': 63,
 'over': 64,
 'constitution': 65,
 'attack': 66,
 'serb': 67,
 'khat': 68,
 'military': 69,
 'if': 70,
 'up': 71,
 'expected': 72,
 'monday': 73,
 'also': 74,
 'congress': 75,
 'parliament': 76,
 '