In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from six.moves.urllib.request import urlretrieve
import zipfile
import numpy as np
import os
import time
import random
import tensorflow as tf
from matplotlib import pylab
from sklearn.manifold import TSNE
from scipy.sparse import lil_matrix
import nltk # standard preprocessing
import operator # sorting items in dictionary by value
#nltk.download() #tokenizers/punkt/PY3/english.pickle
from math import ceil

## Understanding the data

### Downloading the data

This code downloads the [BBC dataset](http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip), a collection of news articles published by the BBC.

In [3]:
# Location of the zipped BBC news-article corpus that this notebook downloads.
url = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip"


def download_data(url, data_dir):
    """Download the BBC archive into `data_dir` if needed, then extract it.

    The archive is stored as ``<data_dir>/bbc-fulltext.zip``. An existing file
    is reused only if it is a readable zip archive; a missing or corrupt file
    (e.g. left behind by an interrupted download) is fetched again. The
    download is written to a temporary ``.part`` file and moved into place
    only once it has completed, so a failed transfer never masquerades as a
    finished one.

    Extraction is skipped when ``<data_dir>/bbc`` already exists (this assumes
    the archive unpacks into a top-level ``bbc`` folder).

    Args:
        url: Address of the zip archive to download.
        data_dir: Directory that receives the archive and its extracted
            contents; created if it does not exist.

    Raises:
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
        Any exception raised by ``urlretrieve`` while downloading.
    """
    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

    # Mere existence is not enough: a truncated file from an aborted download
    # would otherwise be reused forever and break extraction below.
    if os.path.exists(file_path) and zipfile.is_zipfile(file_path):
        print("File already exists")
    else:
        print('Downloading file...')
        partial_path = file_path + '.part'
        try:
            urlretrieve(url, partial_path)
            # Atomic move: file_path only ever holds a completed download.
            os.replace(partial_path, file_path)
        finally:
            # Clean up the leftovers of a failed transfer.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    extract_path = os.path.join(data_dir, 'bbc')
    if not os.path.exists(extract_path):
        with zipfile.ZipFile(file_path, 'r') as zipf:
            zipf.extractall(data_dir)
    else:
        print("bbc-fulltext.zip has already been extracted")
    
# Fetch (if necessary) and unpack the corpus under the local "data" folder.
download_data(url, data_dir='data')

File already exists
bbc-fulltext.zip has already been extracted


## Using pre-trained ELMo Model

### Downloading the ELMo Model from TFHub

In [41]:
import tensorflow_hub as hub
import tensorflow.keras.backend as K

# Reset Keras' global state before loading the hub module.
K.clear_session()

# Pre-trained ELMo (v3) from TF Hub, called through its "tokens" signature;
# signature_outputs_as_dict=True makes the layer return a dict of tensors.
elmo_layer = hub.KerasLayer(
    "https://tfhub.dev/google/elmo/3",
    signature="tokens",
    signature_outputs_as_dict=True,
)

In [49]:

# One pre-tokenized sentence: a batch of size 1 holding 6 tokens.
tokens_input = tf.constant([["the", "cat", "is", "on", "the", "mat"]])

# Token count for each sentence in the batch.
tokens_length = tf.constant([6])

# Feed both inputs to the "tokens" signature and show the returned tensors.
res = elmo_layer(dict(tokens=tokens_input, sequence_len=tokens_length))
print(res)


{'lstm_outputs2': <tf.Tensor: shape=(1, 6, 1024), dtype=float32, numpy=
array([[[ 0.6206704 ,  0.36537755,  0.1887973 , ..., -0.84178007,
          0.356871  ,  0.16665666],
        [ 0.86018425, -0.7156644 ,  0.3361352 , ..., -0.02970919,
          0.23038185,  1.8898063 ],
        [-0.01657382, -0.6142872 ,  0.12097765, ..., -0.84056723,
          1.2542009 ,  0.7461188 ],
        [-0.7640561 , -0.46110973, -1.4737943 , ..., -1.6910062 ,
          0.24993908, -0.01837131],
        [-0.31592393, -0.5211221 , -0.6046509 , ..., -0.68753093,
         -0.31786534,  0.02256058],
        [ 0.46374235, -0.20050219, -0.61079466, ..., -0.03458577,
          0.17038181,  0.70538455]]], dtype=float32)>, 'elmo': <tf.Tensor: shape=(1, 6, 1024), dtype=float32, numpy=
array([[[ 0.30815446,  0.2663037 ,  0.23561308, ..., -0.37085718,
          0.16490504, -0.07245933],
        [ 0.5142877 , -0.13532332,  0.11090418, ...,  0.04046869,
         -0.04789776,  0.73659605],
        [-0.02588062, -0.072836