In [1]:
import html
import json
import os
import re

from itertools import chain
from random import randint

from gensim.models.word2vec import LineSentence, Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
from IPython.display import display, HTML

def browser_alert(message):
    """Show ``message`` in a JavaScript ``alert`` dialog in the notebook's browser tab.

    Args:
        message: Text to display. It is converted with ``str()`` and encoded as a
            JavaScript string literal, so quotes, backslashes and newlines in the
            text cannot break out of the generated script.
    """
    # json.dumps produces a double-quoted literal that is also valid JavaScript;
    # "<" is additionally escaped so the text can never close the <script> tag.
    js_message = json.dumps(str(message)).replace("<", "\\u003c")
    display(HTML('<script type="text/javascript">alert(' + js_message + ');</script>'))
    
def browser_notify(message):
    """Raise a desktop notification titled "Jupyter Notification" with ``message`` as its body.

    Args:
        message: Body text. It is converted with ``str()`` and encoded as a
            JavaScript string literal, so quotes, backslashes and newlines in the
            text cannot break out of the generated script.
    """
    # NOTE(review): browsers normally show notifications only after the page has
    # been granted permission; nothing here requests it — confirm for your setup.
    # json.dumps produces a double-quoted literal that is also valid JavaScript;
    # "<" is additionally escaped so the text can never close the <script> tag.
    js_message = json.dumps(str(message)).replace("<", "\\u003c")
    display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:' + js_message + '});</script>'))

# Input file cleaning and formatting

In [None]:
def clean_str(string):
    """Normalise one sentence for embedding training.

    HTML entities are decoded first; every character other than ASCII letters,
    digits, ``!``, ``?``, ``'`` and a backtick is then replaced by a space, runs
    of spaces are collapsed to one, and the result is lower-cased with leading
    and trailing spaces removed.
    """
    decoded = html.unescape(string)
    allowed_only = re.sub(r"[^A-Za-z0-9!?\'\`]", " ", decoded)
    single_spaced = re.sub(r"[ ]+", " ", allowed_only)
    return single_spaced.lower().strip()

In [None]:
def get_sentences_from_line(line):
    """Split one line of an article file into a list of sentences.

    A line whose first two characters are "--", or that is empty once
    surrounding whitespace is stripped, produces an empty list. Any other line
    is stripped and handed to ``sent_tokenize``.
    """
    stripped_line = line.strip()
    skip_line = line[:2] == "--" or not stripped_line
    if skip_line:
        return []
    return list(sent_tokenize(stripped_line))

In [None]:
def get_article_sentences(article_file_path):
    """Lazily yield every sentence of the article stored at ``article_file_path``.

    The file is read line by line in text mode; each line is split with
    ``get_sentences_from_line`` and the resulting sentences are yielded in order.
    """
    with open(article_file_path) as article_file:
        for raw_line in article_file:
            yield from get_sentences_from_line(raw_line)

In [None]:
# sample_article_path = "/home/v2john/Projects/financial-news-dataset/20061020_20131126_bloomberg_news/2009-01-02/rust-through-transparency"
# for article_sentence in get_article_sentences(sample_article_path):
#     print(clean_str(article_sentence))

In [None]:
# Root directory that is walked recursively for article files to convert into sentences.
corpora_path = "/home/v2john/Projects/financial-news-dataset/ReutersNews106521/"

In [None]:
# Text file that receives the cleaned sentences, one per line (opened in 'w' mode, so it is overwritten).
consolidated_output_path = "/home/v2john/Projects/financial-news-dataset/reuters_sentences.txt"

In [None]:
# Walk the corpus directory and append every cleaned sentence of every article
# to the consolidated output file, one sentence per line.
with open(consolidated_output_path, 'w') as consolidated_output_file:
    for path, _subdirs, files in os.walk(corpora_path):
        for name in files:
            full_path = os.path.join(path, name)
            # Skip any file whose path has a component starting with a dot.
            if "/." in full_path:
                continue
            try:
                for article_sentence in get_article_sentences(full_path):
                    consolidated_output_file.write(clean_str(article_sentence) + "\n")
            except Exception as e:
                # Best effort: one unreadable article must not abort the whole
                # run, but report why it failed so the file can be inspected.
                # Sentences already written for this article stay in the output.
                print(full_path, repr(e))

In [None]:
# Signal in the browser that the sentence-consolidation cell has finished.
browser_notify("Sentences generated")

# Learning word embeddings

In [3]:
# Sentence corpora read from the one-sentence-per-line text files.
reuters_sentences = LineSentence('/home/v2john/Projects/financial-news-dataset/reuters_sentences.txt')
# NOTE(review): no visible cell writes bloomberg_sentences.txt — presumably it was
# produced by re-running the consolidation cell with Bloomberg paths; verify.
bloomberg_sentences = LineSentence('/home/v2john/Projects/financial-news-dataset/bloomberg_sentences.txt')
# Location the trained Word2Vec model is saved to / restored from.
w2v_model_path = "/home/v2john/Projects/financial-word-embedder/models/w2v_model"

In [None]:
class _RestartableCorpora(object):
    """Present several sentence corpora as one iterable that can be iterated repeatedly.

    Word2Vec iterates over its ``sentences`` argument more than once (a
    vocabulary scan followed by the training passes). A bare
    ``itertools.chain(...)`` object is exhausted after the first pass, which
    would leave the training passes with no data; this wrapper builds a fresh
    chain on every ``iter()`` call instead.
    """

    def __init__(self, *corpora):
        self.corpora = corpora

    def __iter__(self):
        # chain.from_iterable calls iter() on each corpus anew, so each pass
        # starts again from the first sentence of the first corpus.
        return chain.from_iterable(self.corpora)


model = Word2Vec(_RestartableCorpora(reuters_sentences, bloomberg_sentences), size=400, window=5, min_count=25, workers=8)

In [4]:
# save model
# model.save(w2v_model_path)

# restore model
# NOTE(review): with the save line commented out, running this cell replaces any
# freshly trained `model` with whatever is stored at w2v_model_path.
model = Word2Vec.load(w2v_model_path)

In [5]:
# NOTE(review): this fires once the model cell has run, whether the model was
# trained in this session or loaded from disk.
browser_notify("Word embeddings training complete")

In [6]:
# model.wv['shareholder']

In [24]:
# model.similar_by_word("stock", topn=10, restrict_vocab=None)

In [25]:
# Maps each vocabulary word to the list of its most similar words.
word_graph = dict()
# Number of nearest neighbours requested per word (passed as topn when the graph is built).
k = 10

In [26]:
# Build the nearest-neighbour graph: every vocabulary word points to its k most
# similar words, in the order the model returns them.
count = 0
for word in model.wv.vocab.keys():
    # Query through model.wv, the same object the vocabulary is read from; the
    # model-level similar_by_word is only a deprecated forwarder to it.
    word_graph[word] = [
        similar_word
        for similar_word, _similarity in model.wv.similar_by_word(word, topn=k, restrict_vocab=None)
    ]
    # Running total of words processed so far.
    count += 1

In [27]:
# Signal in the browser that the graph-building loop has finished.
browser_notify("Word graph created")

In [28]:
# Number of words that have a neighbour list in the graph.
len(word_graph)

82286

In [29]:
# Words from which the random walks are started.
seed_words = ['shrink', 'drop', 'fall', 'plunge', 'slump']
# Accumulates, across all walks, how many times each word was visited.
walk_dict = dict()
# Number of steps taken in a single walk.
random_walk_length = 100
# Number of separate walks started from each seed word.
walk_iterations = 100
# NOTE(review): no random seed is set in the visible cells, so the walk results
# will differ from run to run.

In [30]:
def perform_random_walk(word_graph, word, iterations, walk_dict):
    """Take one random walk through ``word_graph`` and tally the words it visits.

    At every step one neighbour of the current word is picked uniformly at
    random and becomes the new current word. The pick is made over the
    neighbour list's actual length, so lists of any size are handled. If the
    current word has an empty neighbour list the walk ends early.

    Args:
        word_graph: Mapping from a word to the list of its neighbour words.
        word: Word the walk starts from. It is only tallied if the walk
            returns to it.
        iterations: Maximum number of steps to take.
        walk_dict: Mapping from word to visit count; updated in place once the
            walk has finished.

    Raises:
        KeyError: If the current word has no entry in ``word_graph``. Nothing
            is tallied in that case.
    """
    visited_words = []
    current_word = word
    for _step in range(iterations):
        neighbours = word_graph[current_word]
        if not neighbours:
            # Dead end: nothing to step to, so stop instead of indexing into
            # an empty list.
            break
        current_word = neighbours[randint(0, len(neighbours) - 1)]
        visited_words.append(current_word)

    # Tally only after the walk completed, so a failed walk leaves the counts untouched.
    for visited_word in visited_words:
        walk_dict[visited_word] = walk_dict.get(visited_word, 0) + 1

In [31]:
# Start walk_iterations independent walks from every seed word; all walks share
# the same tally in walk_dict.
for seed_word in seed_words:
    for walk_number in range(walk_iterations):
        perform_random_walk(
            word_graph=word_graph,
            word=seed_word,
            iterations=random_walk_length,
            walk_dict=walk_dict,
        )

In [32]:
# The ten most frequently visited words across all walks, most visited first.
sorted(walk_dict, key=walk_dict.get, reverse=True)[:10]

["zealand's",
 'frontier',
 'believers',
 'ubisoft',
 'compagnia',
 'zambo',
 'mtg',
 'okinnander',
 "efsf's",
 'qedra']

In [33]:
# Signal in the browser that the random walks and ranking have finished.
browser_notify("Words identified")

In [34]:
# Visit count recorded for one specific word, for inspection.
walk_dict["zealand's"]

20