# Word2vec with SFCRs

In [1]:
import numpy as np
import os
import pickle

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Willem Jan
[nltk_data]     Willemse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Willem Jan
[nltk_data]     Willemse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import warnings
warnings.filterwarnings(action = 'ignore', category = UserWarning, module = 'gensim')
import gensim

## Read SFCRs

In [4]:
language = 'EN'
local_path = '../SFCR_data/'
# Build the path once so the existence check and the open() cannot drift apart.
sfcr_file = local_path + 'SFCRs_' + language + '.dat'
if not os.path.isfile(sfcr_file):
    # Stop here: every later cell needs `documents`, and continuing would only
    # surface as a NameError far away from the real cause.
    raise FileNotFoundError("Files not found: " + sfcr_file)
# NOTE: pickle.load can execute arbitrary code - only load .dat files from a trusted source.
with open(sfcr_file, 'rb') as fp:
    documents = pickle.load(fp)

## Read Text8 / wiki

In [5]:
from multiprocessing import cpu_count
import gensim.downloader as api

# Point the gensim downloader at the project-relative gensim-data directory.
gensim.downloader.BASE_DIR = '../../10_central_data/gensim-data'

dataset = api.load("text8")
#dataset = api.load("wiki-english-20171001")

# Materialise the corpus as a list so the loader object can be released below.
pages_wiki = list(dataset)

del dataset

In [6]:
len(pages_wiki)

1701

## Read legislation

In [7]:
import re
da_path = '../../../../../10_central_data/legislation/'
DA = dict()
# language code -> [article keyword, heading layout used by retrieve_article]
art_dict= dict({'EN': ['Article',   'pre']})
# Load the file for the selected language so that the key DA[language] and the
# text stored under it always refer to the same language. The context manager
# closes the handle even if reading or decoding fails.
with open(da_path + "Delegated_Acts_" + language + ".txt", "rb") as da_file:
    DA[language] = da_file.read().decode('utf-8')
def retrieve_article(language, article_num):
    """Return the text between the heading of one article and the next.

    The heading keyword and layout come from ``art_dict[language]``:
    ``'pre'`` is "<keyword> <n>", ``'post'`` is "<n> <keyword>" and
    ``'postdot'`` is "<n>. <keyword>". The text is searched in
    ``DA[language]``; runs of whitespace in the result are collapsed to
    single spaces.

    Parameters
    ----------
    language : key into ``art_dict`` and ``DA``.
    article_num : int, number of the article to extract.

    Raises
    ------
    ValueError
        If the layout method is unknown, or if no text is found between
        article ``article_num`` and ``article_num + 1``.
    """
    keyword, method = art_dict[language][0], art_dict[language][1]
    kw = re.escape(keyword)
    this_num, next_num = str(article_num), str(article_num + 1)

    # The digit look-arounds stop a number from matching as part of a longer
    # one (e.g. "Article 2" inside "Article 25", or "1 Article" inside
    # "11 Article"), which would cut the article short or start it too early.
    if method == 'pre':
        pattern = kw + ' ' + this_num + ' (.*?)' + kw + ' ' + next_num + r'(?!\d)'
    elif method == 'post':
        pattern = (r'(?<!\d)' + this_num + ' ' + kw + '(.*?)'
                   + r'(?<!\d)' + next_num + ' ' + kw)
    elif method == 'postdot':
        # The dot is escaped so it matches a literal "." only.
        pattern = (r'(?<!\d)' + this_num + r'\. ' + kw + '(.*?)'
                   + r'(?<!\d)' + next_num + r'\. ' + kw)
    else:
        raise ValueError("Unknown article layout method: " + repr(method))

    found = re.compile(pattern, re.DOTALL).search(DA[language])
    if found is None:
        raise ValueError("Article " + this_num + " not found for language " + repr(language))
    return ' '.join(found.group(1).split())

In [8]:
# Text of articles 1 through 380 (inclusive), in article order.
da_text = [retrieve_article('EN', article_num) for article_num in range(1, 381)]

## Preprocess with NLTK and Gensim

In [9]:
stop_words = nltk.corpus.stopwords.words('english')
def remove_stopwords(texts):
    """Tokenise each document and drop the words listed in ``stop_words``.

    Each element of ``texts`` is tokenised with
    ``gensim.utils.simple_preprocess``. A document that is already a list or
    tuple of tokens is joined with spaces first, so the tokenizer does not
    receive the Python list repr (brackets, quotes, commas). Any other
    object is converted with ``str()``.

    Returns one list of remaining tokens per input document, in input order.
    """
    # Set membership is O(1); the module-level stop word collection is a list.
    stop_set = set(stop_words)
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            text = " ".join(map(str, doc))
        else:
            text = str(doc)
        cleaned.append([word for word in gensim.utils.simple_preprocess(text)
                        if word not in stop_set])
    return cleaned

In [10]:
import spacy

# Initialize spacy 'en' model
nlp = spacy.load('en', disable = ['parser', 'ner'])
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised texts, keeping only tokens with an allowed POS tag.

    Each element of ``texts`` is joined with single spaces and passed to the
    module-level pipeline ``nlp``. From the result, the ``lemma_`` of every
    token whose ``pos_`` is in ``allowed_postags`` is kept.

    Returns one list of lemmas per input text, in input order.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

In [11]:
#sentence_wiki = []
#for page in pages_wiki:
#    for item in page['section_texts']:
#        sent_list = nltk.tokenize.sent_tokenize(item)
#        sentences_wiki.extend(sent_list)

sentences_wiki = pages_wiki

In [12]:
# Split every SFCR document into sentences and flatten into one list.
sentences_sfcr = [
    sentence
    for document in documents
    for sentence in nltk.tokenize.sent_tokenize(document)
]

In [13]:
# Split every Delegated Acts article into sentences and flatten into one list.
sentences_da = [
    sentence
    for article_text in da_text
    for sentence in nltk.tokenize.sent_tokenize(article_text)
]

In [14]:
#data_lemmatized = lemmatization(sentences_wiki, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [15]:
#data_lemmatized

In [16]:
sentences_sfcr = remove_stopwords(sentences_sfcr)
sentences_da = remove_stopwords(sentences_da)
sentences_wiki = remove_stopwords(sentences_wiki)

In [17]:
# Report the size of each corpus: number of sentences and total token count.
print("Number of sfcr documents: " + str(len(documents)))
for corpus_label, corpus_sentences in (("sfcr", sentences_sfcr),
                                       ("da", sentences_da),
                                       ("wiki", sentences_wiki)):
    if corpus_label != "sfcr":
        # Blank separator between corpora (print adds a second newline).
        print("\n")
    print("Number of " + corpus_label + " sentences: " + str(len(corpus_sentences)))
    print("Number of words: " + str(sum(len(tokens) for tokens in corpus_sentences)))

Number of sfcr documents: 395
Number of sfcr sentences: 287632
Number of words: 5236154


Number of da sentences: 27883
Number of words: 668126


Number of wiki sentences: 1701
Number of words: 10753778


In [None]:
sentences_sfcr[1000]

## Preliminary analysis

In [None]:
# Fit the phrase models on all three corpora together.
sentences_total = sentences_sfcr + sentences_da + sentences_wiki

# Higher threshold -> fewer phrases.
bigram = gensim.models.Phrases(sentences_total, min_count=10, threshold=5)

# The second model is fitted on the bigram-transformed corpus.
# NOTE(review): later cells index `trigram[...]` directly on untransformed
# sentences; confirm whether `trigram[bigram[...]]` was intended there.
trigram = gensim.models.Phrases(bigram[sentences_total], min_count=10, threshold=10)

In [None]:
list(trigram[['technical', 'provisions'], ['risk', 'margins'], ['external', 'auditor']])

## Start modelling

In [None]:
from gensim.models.word2vec import Word2Vec

In [None]:
model = Word2Vec(trigram[sentences_wiki], 
                 size = 300, 
                 window = 10, 
                 min_count = 10, 
                 workers = cpu_count())

In [None]:
model.wv.most_similar('provision')

In [None]:
model.wv.most_similar('compliance')

In [None]:
model.wv.most_similar('breach')

In [None]:
local_path = '../nlp_data/'

model.save(local_path + 'text8model')

## Training with SFCR data

In [None]:
len(sentences_sfcr)

In [None]:
# Load from the same location the model was saved to (local_path + 'text8model').
model = Word2Vec.load(local_path + 'text8model')

In [None]:
model.build_vocab(trigram[sentences_sfcr], update = True)

model.train(trigram[sentences_sfcr],
            total_examples = model.corpus_count,
            epochs = model.epochs)

In [None]:
model.wv.most_similar('deferred_taxes')

In [None]:
model.wv.most_similar('compliance')

In [None]:
model.wv.most_similar('technical_provisions')

In [None]:
model.wv.most_similar('breach')

In [None]:
model.wv.words_closer_than('technical_provisions', 'risk_margin')

## Training with legislation text

In [None]:
# Apply the same phrase transform used for the SFCR update and for the DA word
# list, so multi-word terms enter the vocabulary as joined tokens.
da_phrased = trigram[sentences_da]
model.build_vocab(da_phrased, update = True)
model.train(da_phrased,
            total_examples = model.corpus_count,
            epochs = model.epochs)

In [None]:
model.wv.words_closer_than('remuneration', 'bonus')

## Top k words

In [None]:
# first get a list of all words
all_words_sfcr = [word for item in sentences_sfcr for word in item]
# use nltk fdist to get a frequency distribution of all words
fdist_sfcr = nltk.FreqDist(all_words_sfcr)
print("Number of unique words: " +str(len(fdist_sfcr)))

In [None]:
# choose k and visually inspect the bottom 10 words of the top k
k = 10000
top_k_words_sfcr = fdist_sfcr.most_common(k)

# define a function only to keep words in the top k words
top_k_words_sfcr,_ = zip(*fdist_sfcr.most_common(k))

## Visualizations

In [None]:
# Collect the embedding vector of every top-k DA word that the model knows.
# NOTE(review): `top_k_words_da` is first assigned in the later "DA words"
# section, so on a fresh kernel this cell raises NameError unless that section
# is run first - consider moving this cell below it.
embeddings = []
for word in top_k_words_da:
    # Skip words without a vector; looking them up would fail.
    if word in model.wv.vocab:
        embeddings.append(model.wv[word])

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
%matplotlib inline

In [None]:
tsne_2d = TSNE(perplexity = 40, 
               n_components = 2, 
               init = 'pca', 
               n_iter = 5000, 
               learning_rate = 20,
               random_state = 0)

embeddings_2d = tsne_2d.fit_transform(embeddings)

def tsne_plot_2d(label, embeddings, words = [], a = 1):
    """Scatter-plot 2-D points and annotate each one with its word.

    Parameters
    ----------
    label : legend label for the scatter series.
    embeddings : array indexed as ``embeddings[:, 0]`` (x) and
        ``embeddings[:, 1]`` (y).
    words : iterable of annotations; the i-th word is placed at the i-th point.
    a : alpha (opacity) passed to the scatter plot.
    """
    plt.figure(figsize=(16, 9))
    # A single colour sampled from the rainbow colormap, used for all points.
    single_colour = cm.rainbow(np.linspace(0, 1, 1))
    xs = embeddings[:, 0]
    ys = embeddings[:, 1]
    plt.scatter(xs, ys, c=single_colour, alpha=a, label=label)

    annotation_style = dict(xytext=(5, 2), textcoords='offset points',
                            ha='right', va='bottom', size=10)
    for idx, word in enumerate(words):
        plt.annotate(word, xy=(xs[idx], ys[idx]), **annotation_style)

    plt.legend(loc=4)
    plt.grid(True)
    plt.show()

tsne_plot_2d('SFCR', embeddings_2d, words = top_k_words_da, a = 0.1)

## DA words

In [None]:
all_words_da = [word for item in trigram[sentences_da] for word in item if word in model.wv.vocab]
fdist_da = nltk.FreqDist(all_words_da)
print("Number of unique words: " +str(len(fdist_da)))

In [None]:
k = 4543

# keep only the words (not their counts) of the k most frequent tokens;
# yields an empty tuple when the frequency distribution is empty
top_k_words_da = tuple(word for word, _count in fdist_da.most_common(k))

In [None]:
model.wv.most_similar_to_given(["bdo"], list(top_k_words_da))

In [None]:
model.wv.most_similar_to_given(["actuary"], list(top_k_words_da))

In [None]:
model.wv.most_similar_to_given(["climate"], list(top_k_words_da))

In [None]:
model.wv.most_similar_to_given(["tps"], list(top_k_words_da))

In [None]:
model.wv.most_similar_to_given(["good"], list(top_k_words_da))

In [None]:
# NOTE(review): called without arguments, whereas every other call in this
# notebook passes a word and a candidate list - presumably a leftover scratch
# cell that fails when run; confirm and remove or complete.
model.wv.most_similar_to_given()

In [None]:
# For each of the 1000 most frequent SFCR words that the model knows but that
# is not itself a DA word, print the most similar DA word.
# Build the lookup structures once rather than on every iteration.
vocab_words = set(model.wv.vocab)
da_words = list(top_k_words_da)
da_word_set = set(da_words)
for word in list(top_k_words_sfcr)[0:1000]:
    if word in vocab_words and word not in da_word_set:
        print(word + " --> " + model.wv.most_similar_to_given([word], da_words))

In [None]:
sentences_sfcr[800]

In [None]:
# Average similarity between SFCR sentence 800 and the sentences of each
# Delegated Acts article 292-309, keyed by article number.
# The query is restricted to in-vocabulary words, exactly as is done for the
# article sentences below; out-of-vocabulary words have no vector to look up.
query_tokens = [word for word in sentences_sfcr[800] if word in model.wv.vocab]
i = {}
for art in range(292, 310):
    article = retrieve_article("EN", art)
    article = nltk.tokenize.sent_tokenize(article)
    article = remove_stopwords(article)
    phrased = trigram[article]
    value = 0
    for item in phrased:
        item = [word for word in item if word in model.wv.vocab]
        # Sentences with no known words (or an empty query) contribute 0.
        if item != [] and query_tokens:
            r = model.wv.n_similarity(query_tokens, item)
            value = value + r
    # Average over all sentences of the article; an article without any
    # sentence gets 0.0 instead of raising ZeroDivisionError.
    n_sentences = len(phrased)
    i[art] = value / n_sentences if n_sentences else 0.0

In [None]:
i

In [None]:
retrieve_article("EN", 298)

In [None]:
model.wv.most_similar_to_given('coc', ['governance', 'risk_margin'])

In [None]:
# Governance article 258-275

In [None]:
 gensim.matutils.kullback_leibler(vec1, vec2, num_features=None)

## Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec
import gensim

In [None]:
len(sentences_sfcr)

In [46]:
# Keep only token lists longer than 40 items (this also excludes empty ones).
sentences = [
    sent
    for sent in sentences_da + sentences_sfcr + sentences_wiki
    if len(sent) > 40
]

In [47]:
# Preprocess every sentence once and reuse the token lists for both corpora:
# test_corpus holds the plain tokens, train_corpus tags each with its index.
test_corpus = [gensim.utils.simple_preprocess(" ".join(line)) for line in sentences]
train_corpus = [gensim.models.doc2vec.TaggedDocument(tokens, [i]) for i, tokens in enumerate(test_corpus)]

In [49]:
len(train_corpus)

21729

In [50]:
model = gensim.models.doc2vec.Doc2Vec(vector_size = 100, 
                 epochs = 100,
                 workers = cpu_count())

In [51]:
model.build_vocab(train_corpus)

In [52]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 10min 53s


In [70]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus))[0:10]:
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn = len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])

In [71]:
import collections

In [72]:
collections.Counter(ranks)

Counter({2767: 1,
         737: 1,
         8170: 1,
         205: 1,
         1057: 1,
         244: 1,
         1898: 1,
         1818: 1,
         3165: 1,
         2513: 1})

In [56]:
import random

In [73]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
#doc_id = 3
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (6803): «system governance general information system governance fit proper requirements risk management system including risk solvency assessment orsa internal control system compliance function internal audit function actuarial function outsourcing information risk profile underwriting risk market risk credit risk liquidity risk operational risk material risks information valuation solvency purposes assets technical provisions liabilities alternative methods valuation information capital management funds solvency capital requirement minimum capital requirement non compliance minimum capital requirement non compliance solvency capital requirement information templates summary summary solvency financial condition report sfcr contains quantitative qualitative information relating compre group group covering business performance system governance risk profile solvency valuation capital management group»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,mc5,s0.001,t

In [None]:
# Infer one document vector per entry of test_corpus, in corpus order.
embeddings = [model.infer_vector(tokens) for tokens in test_corpus]

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
%matplotlib inline

In [None]:
tsne_2d = TSNE(perplexity = 2, 
               n_components = 2, 
               init = 'pca', 
               n_iter = 1000, 
               learning_rate = 20,
               random_state = 0)

embeddings_2d = tsne_2d.fit_transform(embeddings)

def tsne_plot_2d(label, embeddings, words = [], a = 1):
    """Scatter-plot 2-D points without per-point annotations.

    This redefinition replaces the annotated variant defined earlier in the
    notebook. ``words`` is accepted for signature compatibility but is not
    used.

    Parameters
    ----------
    label : legend label for the scatter series.
    embeddings : array indexed as ``embeddings[:, 0]`` (x) and
        ``embeddings[:, 1]`` (y).
    words : unused.
    a : alpha (opacity) passed to the scatter plot.
    """
    plt.figure(figsize=(16, 9))
    # A single colour sampled from the rainbow colormap, used for all points.
    single_colour = cm.rainbow(np.linspace(0, 1, 1))
    xs, ys = embeddings[:, 0], embeddings[:, 1]
    plt.scatter(xs, ys, c=single_colour, alpha=a, label=label)
    plt.legend(loc=4)
    plt.grid(True)
    plt.show()

# `embeddings` was inferred from test_corpus, so take the label count from it.
tsne_plot_2d('DA', embeddings_2d, words = range(len(test_corpus)), a = 0.1)