In [None]:
import html
import json
import os
import re
from itertools import chain

from gensim.models.word2vec import LineSentence, Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
from IPython.display import display, HTML

def browser_alert(message):
    """Show ``message`` in a JavaScript ``alert`` dialog in the notebook's browser tab.

    The dialog is produced by displaying an HTML ``<script>`` snippet as the
    cell's output.

    Args:
        message: Text to show. It is converted with ``str`` and then encoded
            as a JavaScript string literal, so quotes, backslashes and line
            breaks in the text can no longer break (or inject into) the
            generated script.
    """
    # json.dumps yields a quoted, fully escaped literal that is also valid
    # JavaScript. "</" is additionally split so the text can never terminate
    # the surrounding <script> element early.
    js_literal = json.dumps(str(message)).replace("</", "<\\/")
    display(HTML('<script type="text/javascript">alert(' + js_literal + ');</script>'))
    
def browser_notify(message):
    """Raise a browser ``Notification`` titled "Jupyter Notification".

    The notification is created by displaying an HTML ``<script>`` snippet as
    the cell's output; ``message`` becomes the notification body.

    Args:
        message: Body text. It is converted with ``str`` and then encoded as a
            JavaScript string literal, so quotes, backslashes and line breaks
            in the text can no longer break (or inject into) the generated
            script.

    NOTE(review): the script constructs ``new Notification(...)`` directly and
    never requests permission; presumably the page must already have been
    granted notification permission for anything to appear -- confirm.
    """
    # json.dumps yields a quoted, fully escaped literal that is also valid
    # JavaScript. "</" is additionally split so the text can never terminate
    # the surrounding <script> element early.
    body_literal = json.dumps(str(message)).replace("</", "<\\/")
    display(HTML(
        '<script type="text/javascript">var notification=new Notification('
        '"Jupyter Notification",'
        '{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",'
        'body:' + body_literal + '});</script>'
    ))

# Input file cleaning and formatting

In [None]:
def clean_str(string):
    """Normalise a piece of raw text for the sentence corpus.

    Steps, in order: decode HTML entities, replace every character other than
    ASCII letters, digits, ``!``, ``?``, ``'`` and `````  with a space,
    squeeze runs of spaces into one, trim the ends and lower-case the result.

    Args:
        string: Text to normalise.

    Returns:
        The cleaned, lower-cased text.
    """
    disallowed_char = re.compile(r"[^A-Za-z0-9!?\'\`]")
    space_run = re.compile(r"[ ]+")

    # Entities are decoded first so that e.g. "&amp;" is treated as "&"
    # (and dropped) rather than leaving the letters "amp" behind.
    spaced_out = disallowed_char.sub(" ", html.unescape(string))
    return space_run.sub(" ", spaced_out).strip().lower()

In [None]:
def get_sentences_from_line(line):
    """Split one line of an article file into sentences.

    Args:
        line: A single line of text.

    Returns:
        A list of sentences produced by ``sent_tokenize`` on the stripped
        line. The list is empty when the line begins with ``--`` or contains
        only whitespace.
    """
    stripped_line = line.strip()

    # Lines opening with "--" are ignored outright; presumably these are
    # metadata/header lines in the news dataset -- TODO confirm.
    if line[:2] == "--" or not stripped_line:
        return []

    return list(sent_tokenize(stripped_line))

In [None]:
def get_article_sentences(article_file_path):
    """Lazily yield the sentences of one article file.

    The file is read line by line; each line is handed to
    ``get_sentences_from_line`` and the resulting sentences are yielded in
    order.

    Args:
        article_file_path: Path of the article text file to read.

    Yields:
        One sentence string at a time.
    """
    with open(article_file_path) as article_file:
        for raw_line in article_file:
            yield from get_sentences_from_line(raw_line)

In [None]:
# sample_article_path = "/home/v2john/Projects/financial-news-dataset/20061020_20131126_bloomberg_news/2009-01-02/rust-through-transparency"
# for article_sentence in get_article_sentences(sample_article_path):
#     print(clean_str(article_sentence))

In [None]:
# Root directory of the article corpus; the consolidation cell walks it
# recursively with os.walk and reads every file found beneath it.
corpora_path = "/home/v2john/Projects/financial-news-dataset/ReutersNews106521/"

In [None]:
# Text file that receives the cleaned sentences, one per line. It is opened
# with mode 'w', so any existing content is overwritten.
consolidated_output_path = "/home/v2john/Projects/financial-news-dataset/reuters_sentences.txt"

In [None]:
# Walk the corpus tree and write every cleaned sentence on its own line.
# Mode 'w' truncates any previous output file.
with open(consolidated_output_path, 'w') as consolidated_output_file:
    for path, subdirs, files in os.walk(corpora_path):
        # Prune hidden directories in place so os.walk never descends into
        # them. Checking names relative to the walk (instead of searching the
        # full path for "/.") keeps working when corpora_path itself contains
        # a hidden component.
        subdirs[:] = [subdir for subdir in subdirs if not subdir.startswith(".")]

        for name in files:
            if name.startswith("."):
                continue  # hidden file

            full_path = os.path.join(path, name)
            try:
                for article_sentence in get_article_sentences(full_path):
                    consolidated_output_file.write(clean_str(article_sentence) + "\n")
            except Exception as e:
                # Best effort: one unreadable article must not abort the whole
                # run, but report *why* it failed, not only where. Sentences
                # written before the failure stay in the output file.
                print("{}: {}: {}".format(full_path, type(e).__name__, e))

In [None]:
# Raise a browser notification once the sentence file has been written.
browser_notify("Sentences generated")

# Learning word embeddings

In [None]:
# One LineSentence stream per pre-generated sentence file (Reuters first,
# Bloomberg second).
reuters_sentences, bloomberg_sentences = (
    LineSentence(sentence_file)
    for sentence_file in (
        '/home/v2john/Projects/financial-news-dataset/reuters_sentences.txt',
        '/home/v2john/Projects/financial-news-dataset/bloomberg_sentences.txt',
    )
)

In [None]:
class _RestartableCorpora(object):
    """Present several sentence corpora as one iterable that can be re-iterated.

    ``itertools.chain(...)`` returns a one-shot iterator. ``Word2Vec`` walks
    its ``sentences`` argument several times (once to build the vocabulary,
    then once per training epoch), so a bare ``chain`` is exhausted after the
    vocabulary scan and the training passes see no data. This wrapper builds
    a fresh chain on every ``iter()`` call instead.
    """

    def __init__(self, *corpora):
        # Each corpus must itself be re-iterable.
        self._corpora = corpora

    def __iter__(self):
        return chain.from_iterable(self._corpora)


# Train on both corpora back to back.
model = Word2Vec(_RestartableCorpora(reuters_sentences, bloomberg_sentences), size=400, window=5, min_count=25, workers=8)

In [None]:
# Raise a browser notification once Word2Vec training has returned.
browser_notify("Word embeddings training complete")

In [None]:
# model.wv['shareholder']

In [None]:
# Query through the model's word vectors (`model.wv`): the model-level
# `similar_by_word` shortcut is deprecated in gensim 3.x and removed in 4.0.
model.wv.similar_by_word("stock", topn=10, restrict_vocab=None)