In [None]:
import re
import html
import os
import pickle

from random import randint
from itertools import chain
from math import log

from numpy import zeros
from numpy.linalg import svd

from scipy.spatial.distance import cosine

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams

from gensim.models.word2vec import LineSentence, Word2Vec

from sklearn.utils.extmath import randomized_svd

In [None]:
from IPython.display import display, HTML

def browser_alert(message):
    """Pop up a JavaScript ``alert`` dialog in the notebook front end.

    Args:
        message: Text to show in the dialog.

    The message is serialised with ``json.dumps`` so that quotes, backslashes
    and newlines become a valid JavaScript string literal instead of breaking
    the generated script. ``</`` is additionally escaped so the text cannot
    close the surrounding ``<script>`` element early.
    """
    import json  # local import keeps this cell self-contained

    js_message = json.dumps(message).replace("</", "<\\/")
    display(HTML('<script type="text/javascript">alert(' + js_message + ');</script>'))
    
def browser_notify(message):
    """Raise a desktop notification titled "Jupyter Notification" from the browser.

    Args:
        message: Text used as the notification body.

    The message is serialised with ``json.dumps`` so that quotes, backslashes
    and newlines become a valid JavaScript string literal instead of breaking
    the generated script. ``</`` is additionally escaped so the text cannot
    close the surrounding ``<script>`` element early.
    """
    import json  # local import keeps this cell self-contained

    js_message = json.dumps(message).replace("</", "<\\/")
    display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:' + js_message + '});</script>'))

# Input file cleaning and formatting

In [None]:
def clean_str(string):
    """Normalise a raw sentence before it is written to the sentence corpus.

    HTML entities are decoded; every character other than an ASCII letter,
    digit, ``!``, ``?``, apostrophe or backtick is replaced by a space; runs
    of spaces are collapsed to one; the result is stripped and lower-cased.
    """
    decoded = html.unescape(string)
    only_allowed = re.sub(r"[^A-Za-z0-9!?\'\`]", " ", decoded)
    single_spaced = re.sub(r"[ ]+", " ", only_allowed)
    return single_spaced.strip().lower()

In [None]:
def get_sentences_from_line(line):
    """Split one line of an article file into a list of sentences.

    A line whose first two characters are ``--`` (presumably a metadata
    header of the dataset - confirm) or that is empty once stripped produces
    an empty list. Any other line is stripped and split with ``sent_tokenize``.
    """
    if line.startswith("--") or not line.strip():
        return []

    return list(sent_tokenize(line.strip()))

In [None]:
def get_article_sentences(article_file_path):
    """Lazily yield every sentence of the article at ``article_file_path``.

    The file is opened in text mode with the default encoding and read line
    by line; each line is expanded with ``get_sentences_from_line`` and the
    resulting sentences are yielded in order.
    """
    with open(article_file_path) as article_file:
        for raw_line in article_file:
            yield from get_sentences_from_line(raw_line)

In [None]:
# sample_article_path = "/home/v2john/financial-news-dataset/20061020_20131126_bloomberg_news/2009-01-02/rust-through-transparency"
# for article_sentence in get_article_sentences(sample_article_path):
#     print(clean_str(article_sentence))

In [None]:
corpora_path = "/home/v2john/financial-news-dataset/20061020_20131126_bloomberg_news/"

In [None]:
consolidated_output_path = "/home/v2john/financial-news-dataset/bloomberg_sentences.txt"

In [None]:
# Walk the corpus tree and write every cleaned sentence, one per line, to the
# consolidated output file.
with open(consolidated_output_path, 'w') as consolidated_output_file:
    for path, subdirs, files in os.walk(corpora_path):
        for name in files:
            full_path = os.path.join(path, name)
            # Skip hidden files and anything located inside a hidden directory.
            if "/." in full_path:
                continue
            try:
                for article_sentence in get_article_sentences(full_path):
                    consolidated_output_file.write(clean_str(article_sentence) + "\n")
            except Exception as e:
                # Best effort: one unreadable article must not abort the whole
                # run, but report *why* it was skipped instead of only its path.
                # Sentences yielded before the failure have already been written.
                print(full_path + ": " + repr(e))

In [None]:
browser_notify("Sentences generated")

# Learning Word2Vec word embeddings

In [None]:
# Sentence corpora streamed lazily from disk by gensim's LineSentence, plus the
# location of the persisted Word2Vec model.
# NOTE(review): these paths live under /home/v2john/Projects/..., whereas the
# sentence file written earlier in this notebook goes to
# /home/v2john/financial-news-dataset/ - confirm which location is current.
reuters_sentences = LineSentence('/home/v2john/Projects/financial-news-dataset/reuters_sentences.txt')
bloomberg_sentences = LineSentence('/home/v2john/Projects/financial-news-dataset/bloomberg_sentences.txt')
w2v_model_path = "/home/v2john/Projects/financial-word-embedder/models/w2v_model"

In [None]:
class _RestartableCorpus(object):
    """Present several sentence iterables as one corpus that can be re-iterated.

    Word2Vec iterates over its input more than once (a vocabulary scan followed
    by the training passes). A bare ``itertools.chain`` object is exhausted
    after the first pass, so later passes would see no sentences; this wrapper
    builds a fresh chain every time ``__iter__`` is called.
    """

    def __init__(self, *corpora):
        self.corpora = corpora

    def __iter__(self):
        return chain.from_iterable(self.corpora)


# Train 400-dimensional embeddings on the combined Reuters + Bloomberg sentences.
model = Word2Vec(_RestartableCorpus(reuters_sentences, bloomberg_sentences), size=400, window=5, min_count=25, workers=8)

In [None]:
# Save the current model to w2v_model_path (disabled; uncomment to persist).
# model.save(w2v_model_path)

# Restore the model stored at w2v_model_path; this replaces whatever `model`
# currently refers to.
model = Word2Vec.load(w2v_model_path)

In [None]:
browser_notify("Word embeddings training complete")

In [None]:
# model.wv['shareholder']

In [None]:
# model.similar_by_word("stock", topn=10, restrict_vocab=None)

In [None]:
# word_graph: word -> list of its most similar words (filled in the next cell).
word_graph = dict()
# k: number of nearest neighbours requested per word when building the graph.
k = 10

In [None]:
# Build the nearest-neighbour graph: every vocabulary word maps to the list of
# its k most similar words (similarity scores are dropped).
count = 0
for word in model.wv.vocab.keys():
    # Query through model.wv, the same keyed-vectors object whose vocabulary is
    # being iterated, rather than the deprecated method on the model itself.
    similar_word_tuples = model.wv.similar_by_word(word, topn=k, restrict_vocab=None)
    word_graph[word] = [similar_word_tuple[0] for similar_word_tuple in similar_word_tuples]
    count += 1

In [None]:
browser_notify("Word graph created")

In [None]:
len(word_graph)

In [None]:
# Random-walk configuration.
# Alternative seed set (downward-movement verbs), kept for experimentation:
# seed_words = ['shrink', 'drop', 'fall', 'plunge', 'slump']
seed_words = ['surge', 'rise', 'jump', 'gain']
# walk_dict: word -> number of times it was visited across all walks.
walk_dict = dict()
# Number of steps taken in a single walk.
random_walk_length = 100
# Number of walks started from each seed word.
walk_iterations = 100

In [None]:
def perform_random_walk(word_graph, word, iterations, walk_dict):
    """Walk ``iterations`` steps through ``word_graph`` and tally the visits.

    Starting from ``word``, each step moves to a uniformly chosen neighbour of
    the current word. Once the walk has finished, every visited word's counter
    in ``walk_dict`` is incremented (the start word itself is only counted if
    the walk returns to it).

    Args:
        word_graph: Mapping from a word to the list of its neighbour words.
        word: Word at which the walk starts; must be a key of ``word_graph``.
        iterations: Number of steps to take.
        walk_dict: Mapping from word to visit count, updated in place.

    Returns:
        None.

    Raises:
        KeyError: If the current word has no entry in ``word_graph``.
        ValueError: If the current word has an empty neighbour list.
    """
    words_chosen = list()
    for _step in range(iterations):
        word_choices = word_graph[word]
        # Index by the actual neighbour count instead of a notebook-global `k`:
        # identical draws when every list has k entries, and no failure when a
        # list is shorter or `k` has been changed since the graph was built.
        word_choice = word_choices[randint(0, len(word_choices) - 1)]

        words_chosen.append(word_choice)
        word = word_choice

    # Tally only after the walk has completed, so a walk that fails midway
    # leaves walk_dict untouched.
    for word_choice in words_chosen:
        walk_dict[word_choice] = walk_dict.get(word_choice, 0) + 1

In [None]:
# Start walk_iterations independent walks of random_walk_length steps from each
# seed word; all visits accumulate in the shared walk_dict.
for word in seed_words:
    for iteration in range(walk_iterations):
        perform_random_walk(word_graph, word, random_walk_length, walk_dict)

In [None]:
sorted(walk_dict, key=walk_dict.get, reverse=True)[:10]

In [None]:
browser_notify("Words identified")

In [None]:
# walk_dict["zealand's"]

# Learn SVD Word Embeddings

In [None]:
# Configuration and accumulators for the co-occurrence / SVD embeddings.
sentences_path = "/home/v2john/financial-news-dataset/all_sentences.txt"
# Number of following tokens treated as context of a token.
context_width = 2
# Set of tokens kept for the embedding matrix (populated in a later cell).
vocabulary = set()
# Total number of tokens seen in the corpus.
corpus_term_frequency = 0
# token -> number of occurrences in the corpus.
term_frequencies = dict()

In [None]:
# First pass over the corpus: count how often each token occurs and how many
# tokens there are in total. The counts are added to term_frequencies and
# corpus_term_frequency, which are initialised in the configuration cell, so
# re-run that cell before re-running this one.
with open(sentences_path) as sentences_file:
    for line in sentences_file:
        tokens = word_tokenize(line)

        for token in tokens:
            term_frequencies[token] = term_frequencies.get(token, 0) + 1

        corpus_term_frequency += len(tokens)

In [None]:
# Keep only the tokens that occur at least 100 times in the corpus.
vocabulary = {k for k,v in term_frequencies.items() if v >= 100}

In [None]:
print(corpus_term_frequency)
print(len(vocabulary))
# print(term_frequencies)

In [None]:
browser_notify("Corpus built")

In [None]:
# Dense square matrix with one row and one column per vocabulary word. Later
# cells first accumulate co-occurrence counts in it and then overwrite those
# counts with NPMI scores.
npmi_matrix = zeros((len(vocabulary), len(vocabulary)))
# Fix an ordering of the vocabulary so words can be addressed by index.
vocab_list = list(vocabulary)
print(len(vocab_list))

In [None]:
# Map every vocabulary word to its index in vocab_list, which is also its
# row/column index in the matrix. `counter` ends up as the number of words.
vocab_pos_dict = {word: position for position, word in enumerate(vocab_list)}
counter = len(vocab_list)

In [None]:
# Second pass over the corpus: count co-occurrences of vocabulary words. Every
# token is paired with each of the next `context_width` tokens on the same
# line, and the count is recorded symmetrically.
with open(sentences_path) as sentences_file:
    for line in sentences_file:
        tokens = word_tokenize(line)
        num_tokens = len(tokens)

        for i in range(num_tokens):
            current_word = tokens[i]
            if current_word not in vocabulary:
                continue
            current_pos = vocab_pos_dict[current_word]

            # Start at i + 1 so a token is never counted as its own context
            # (starting at i added 2 to the diagonal for every occurrence), and
            # clamp the window to the line length instead of relying on an
            # IndexError to end the loop.
            for j in range(i + 1, min(i + context_width + 1, num_tokens)):
                context_word = tokens[j]
                if context_word in vocabulary:
                    context_pos = vocab_pos_dict[context_word]
                    npmi_matrix[current_pos][context_pos] += 1
                    npmi_matrix[context_pos][current_pos] += 1

In [None]:
# Overwrite each co-occurrence count in place with its normalised pointwise
# mutual information: log(p(x, y) / (p(x) * p(y))) / -log(p(x, y)), where all
# probabilities are counts divided by the corpus token count. Cells with a
# zero probability get 0.0. Because the counts are overwritten, this cell must
# not be re-run without rebuilding the count matrix first.
vocab_size = len(vocabulary)
for i in range(vocab_size):
    # The row word's probability does not depend on j, so compute it once.
    term1_prob = term_frequencies[vocab_list[i]] / corpus_term_frequency

    for j in range(vocab_size):
        joint_prob = npmi_matrix[i][j] / corpus_term_frequency
        term2_prob = term_frequencies[vocab_list[j]] / corpus_term_frequency

        if joint_prob > 0 and term1_prob > 0 and term2_prob > 0:
            pointwise_mi = log(joint_prob / (term1_prob * term2_prob))
            npmi_matrix[i][j] = pointwise_mi / -log(joint_prob)
        else:
            npmi_matrix[i][j] = 0.0

In [None]:
# Truncated SVD of the NPMI matrix with 300 components; the rows of `u` are
# used as the word vectors in the cells below.
# random_state is pinned so the embeddings are reproducible across kernel
# restarts. NOTE(review): older scikit-learn releases are believed to have
# used 0 as the default seed; newer ones draw a fresh seed on every call.
u, sigma, vt = randomized_svd(npmi_matrix, n_components=300, random_state=0)

## Learning related words

In [None]:
# Pickle locations used by the (currently commented-out) save/restore cells:
# the SVD word vectors, the vocabulary list, and the nearest-neighbour dict.
svd_word_embeddings_path = "/home/v2john/svd_word_embeddings.pkl"
vocab_path = "/home/v2john/financial_vocab.pkl"
similarity_dict_path = "/home/v2john/similarity_dict.pkl"

In [None]:
# # Save vectors
# with open(svd_word_embeddings_path, 'wb') as svd_word_embeddings_file:
#     pickle.dump(u, svd_word_embeddings_file)
    
# with open(vocab_path, 'wb') as vocab_file:
#     pickle.dump(vocab_list, vocab_file)

# browser_notify("Persisted to disk")

In [None]:
# # Restore
# with open(svd_word_embeddings_path, 'rb') as svd_word_embeddings_file:
#     u = pickle.load(svd_word_embeddings_file)
    
# with open(vocab_path, 'rb') as vocab_file:
#     vocab_list = pickle.load(vocab_file)

# browser_notify("Restored from disk")

In [None]:
u.shape

In [None]:
browser_notify("Embeddings learnt")

In [None]:
# For every vocabulary word, find the k words whose SVD vectors are closest to
# its own vector and store them, nearest first, in global_similarity_dict.
k = 10
global_similarity_dict = dict()
num_words = len(vocab_list)

for i in range(num_words):
    print("Word " + str(i + 1) + " of " + str(num_words))
    word_distance_dict = dict()
    for j in range(num_words):
        if j == i:
            continue
        # scipy's cosine() is a *distance* (1 - cosine similarity):
        # smaller values mean more similar vectors.
        word_distance_dict[vocab_list[j]] = cosine(u[i], u[j])

    # Sort ascending so the nearest words come first (sorting descending kept
    # the k most dissimilar words), and honour k instead of a hard-coded 10.
    global_similarity_dict[vocab_list[i]] = \
        list(map(lambda x: x[0], sorted(word_distance_dict.items(), key=lambda x: x[1])[:k]))

In [None]:
# # Save vectors

# with open(similarity_dict_path, 'wb') as similarity_dict_file:
#     pickle.dump(global_similarity_dict, similarity_dict_file)

In [None]:
# seed_words = ['shrink', 'drop', 'fall', 'plunge', 'slump']
# seed_words = ['surge', 'rise', 'jump', 'gain']
# Seed words for the random walk over the SVD-based similarity graph.
# NOTE(review): each seed is looked up as a key of the graph, so it presumably
# has to be part of the vocabulary - confirm before running.
seed_words = ['random', 'garbage']
# seed_words = ['dire']
# walk_dict: word -> number of times it was visited across all walks.
walk_dict = dict()
# Number of steps taken in a single walk.
random_walk_length = 1000
# Number of walks started from each seed word.
walk_iterations = 10000

In [None]:
def perform_random_walk(word_graph, word, iterations, walk_dict):
    """Walk ``iterations`` steps through ``word_graph`` and tally the visits.

    This re-defines the function of the same name declared earlier in the
    notebook. Starting from ``word``, each step moves to a uniformly chosen
    neighbour of the current word. Once the walk has finished, every visited
    word's counter in ``walk_dict`` is incremented (the start word itself is
    only counted if the walk returns to it).

    Args:
        word_graph: Mapping from a word to the list of its neighbour words.
        word: Word at which the walk starts; must be a key of ``word_graph``.
        iterations: Number of steps to take.
        walk_dict: Mapping from word to visit count, updated in place.

    Returns:
        None.

    Raises:
        KeyError: If the current word has no entry in ``word_graph``.
        ValueError: If the current word has an empty neighbour list.
    """
    words_chosen = list()
    for _step in range(iterations):
        word_choices = word_graph[word]
        # Index by the actual neighbour count instead of a notebook-global `k`:
        # identical draws when every list has k entries, and no failure when a
        # list is shorter or `k` has been changed since the graph was built.
        word_choice = word_choices[randint(0, len(word_choices) - 1)]

        words_chosen.append(word_choice)
        word = word_choice

    # Tally only after the walk has completed, so a walk that fails midway
    # leaves walk_dict untouched.
    for word_choice in words_chosen:
        walk_dict[word_choice] = walk_dict.get(word_choice, 0) + 1

In [None]:
# Start walk_iterations independent walks of random_walk_length steps from each
# seed word over the SVD-based similarity graph; all visits accumulate in the
# shared walk_dict.
for word in seed_words:
    for iteration in range(walk_iterations):
        perform_random_walk(global_similarity_dict, word, random_walk_length, walk_dict)

In [None]:
sorted(walk_dict, key=walk_dict.get, reverse=True)[:10]