In [3]:
import os
import re
import string
import math
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import preprocessing

# Spacy 
import spacy
#from spacy.lang.en import English
#from spacy import displacy

# sklearn
from sklearn.metrics.pairwise import cosine_similarity

#import Utils
from utils import get_corpus_dataframe

from IPython.display import HTML
import logging
logging.getLogger('tensorflow').disabled = True

## Retrieve sentences

In [4]:
def create_sentences(save_as_filename):
    """Split the corpus into cleaned, lower-cased sentences and optionally cache them.

    The corpus is loaded with ``get_corpus_dataframe(eu_only=False)``. Its
    ``clean_content`` column is stripped of the characters listed in
    ``punctuation``, lower-cased, and segmented into sentences with the spaCy
    ``en_core_web_md`` pipeline.

    Parameters
    ----------
    save_as_filename : str or None
        Path the resulting list is pickled to. A falsy value skips saving.

    Returns
    -------
    list of str
        Whitespace-stripped sentences whose spaCy token count lies in
        ``[MIN_WORDS_IN_SENTENCE, MAX_WORDS_IN_SENTENCE)``.
    """
    nlp = spacy.load('en_core_web_md')
    # Raised above the spaCy default so long corpus entries can be parsed.
    nlp.max_length = 1520000
    # Bounds are checked against len(sent), i.e. spaCy tokens; the upper bound is exclusive.
    MAX_WORDS_IN_SENTENCE = 80
    MIN_WORDS_IN_SENTENCE = 3
    punctuation = '!"#$%&()*+:;<=>?@[\\]^_`{|}~●'

    # Load the corpus into a pandas DataFrame.
    df = get_corpus_dataframe(eu_only=False)

    ## Clean text
    # Build the deletion table once instead of re-creating a set for every character.
    strip_punctuation = str.maketrans('', '', punctuation)
    df['clean_content'] = df['clean_content'].apply(lambda text: text.translate(strip_punctuation))

    # Convert text to lowercase.
    df['clean_content'] = df['clean_content'].str.lower()

    ## Load sentences
    # `Span.text` is used instead of `Span.string`, which spaCy v3 no longer provides;
    # after .strip() both give the same value.
    sentences = [
        sent.text.strip()
        for text in df.clean_content.tolist()
        for sent in nlp(text).sents
        if MIN_WORDS_IN_SENTENCE <= len(sent) < MAX_WORDS_IN_SENTENCE
    ]

    # Save
    if save_as_filename:
        # Create the target directory first so the write cannot fail after the
        # (slow) parsing step just because the folder is missing.
        target_dir = os.path.dirname(save_as_filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(save_as_filename, "wb") as fp:   # Pickling
            pickle.dump(sentences, fp)

    return sentences


In [5]:
# Reuse the pickled sentence list when it exists; otherwise build it from the
# corpus (create_sentences also writes the pickle for the next run).
elmo_sentences_filename ='./preprocessed_text/elmo_sentences.sav'
if not os.path.isfile(elmo_sentences_filename):
    # No cache yet: create sentences
    sentences = create_sentences(save_as_filename=elmo_sentences_filename)
else:
    with open(elmo_sentences_filename, "rb") as cache_file:   # Unpickling
        sentences = pickle.load(cache_file)
    

## Create Sentence Embeddings

In [6]:
# Sanity check: how many sentences are available for embedding.
print(len(sentences))

9201


In [None]:
 # Load the ELMo module from TensorFlow Hub; `elmo` is used below to embed
# both the corpus sentences and the search term.
url = "https://tfhub.dev/google/elmo/3"
elmo = hub.load(url)

def elmo_vectors(x):
    """Return the ELMo ``default`` output for a batch of strings.

    ``x`` is converted with ``tf.convert_to_tensor`` and fed to the module's
    ``default`` signature; the entry stored under the ``"default"`` key of the
    result is returned.
    """
    default_signature = elmo.signatures['default']
    outputs = default_signature(tf.convert_to_tensor(x))
    return outputs["default"]
        

# function to return the elmo embeddings
def get_sentences_vectors(texts=None, batch_size=500,
                          cache_path='./saved_state/elmo_embeddings.pkl'):
    """Return one ELMo embedding per sentence, using a pickle file as a cache.

    Parameters
    ----------
    texts : list of str, optional
        Sentences to embed. Defaults to the notebook-level ``sentences`` list.
        Ignored when the cache file already exists.
    batch_size : int, optional
        Number of sentences passed to ``elmo_vectors`` per call.
    cache_path : str, optional
        Pickle file the embeddings are read from when it exists, and written
        to otherwise.

    Returns
    -------
    list of numpy.ndarray
        One vector per sentence, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Check if the embeddings are already available in the pickle file.
    if os.path.isfile(cache_path):
        # NOTE: unpickling can execute arbitrary code -- only point this at
        # cache files produced by this notebook.
        with open(cache_path, "rb") as fp:   # Unpickling
            cached = pickle.load(fp)
        # Cache files written by earlier versions may hold tensor objects;
        # normalise so both code paths return NumPy rows.
        return [np.asarray(vector) for vector in cached]

    if texts is None:
        texts = sentences

    # Create the cache directory *before* the expensive embedding run, so the
    # result is not lost to a failed write at the very end.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Embed batch by batch and flatten into one list of per-sentence vectors.
    # Rows are stored as NumPy arrays: they pickle without relying on
    # TensorFlow tensor pickling and stack directly into a matrix downstream.
    sentences_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_vectors = np.asarray(elmo_vectors(texts[start:start + batch_size]))
        sentences_embeddings.extend(batch_vectors)

    with open(cache_path, "wb") as fp:   # Pickling
        pickle.dump(sentences_embeddings, fp)

    return sentences_embeddings

# Embeddings for every entry of `sentences`: read from the pickle cache when
# it exists, otherwise computed with ELMo and cached.
sentences_vectors = get_sentences_vectors()

## Similarity calculation

In [18]:
%%time

#search_term='code of ethics' # param
search_term='skills and education'
embeddings_search_vectors = elmo.signatures['default'](tf.convert_to_tensor([search_term]))['default']

cosine_similarities = pd.Series(cosine_similarity(embeddings_search_vectors,sentences_vectors).flatten())



Wall time: 12min 50s


## Display results

In [19]:
results_returned = "20" #@param [1, 2, 3]

# Highlight whole query words only. A plain `word in search_term` test is a
# substring match and would also bold fragments such as "a" or "on".
search_words = set(search_term.lower().split())

output =""
# Series.items() is used instead of iteritems(), which pandas 2.0 removed.
# The Series index is the sentence position in `sentences`.
for sentence_idx, _score in cosine_similarities.nlargest(int(results_returned)).items():
  output +='<p style="font-family:verdana; font-size:110%;"> '
  for word in sentences[sentence_idx].split():
    if word.lower() in search_words:
      output += " <b>"+str(word)+"</b>"
    else:
      output += " "+str(word)
  output += "</p><hr>"

output = '<h3>Results:</h3>'+output
display(HTML(output))

In [None]:
# Scratch check of the 500-sentence batching used for the embeddings.
# Split into batches of 500
sentences_chunks = [sentences[start:start+500] for start in range(0,len(sentences),500)]

# Upper-case every sentence while flattening the chunks back into one list
sentences_upper = [sentence.upper() for chunk in sentences_chunks for sentence in chunk]

# Print five randomly chosen sentences next to their upper-cased copies;
# matching pairs show that chunking + flattening preserved the order.
for _ in range(5):
    i = np.random.randint(0,len(sentences))
    print(sentences[i])
    print(sentences_upper[i])


In [None]:
# Should equal len(sentences): the chunks are flattened back without dropping anything.
len(sentences_upper)