In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time

# prevent tensorflow from using GPU. Otherwise, run out of memory
# https://stackoverflow.com/questions/44552585/prevent-tensorflow-from-accessing-the-gpu
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import tensorflow_hub as hub
import tensorflow as tf
import nltk
import regex as re
import emoji as em
from sklearn.model_selection import train_test_split


# TF-Hub handle of the ELMo module used for the embeddings below
elmo_url = "https://tfhub.dev/google/elmo/2"
elmo = hub.Module(elmo_url)

W0412 09:30:17.571392 140365206058816 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# Set path
# NOTE(review): this is a machine-specific absolute directory; adjust it (or
# make it configurable) when running on another machine.
path = '/home/tim/Documents/Sentiment/Data/processed'
# Build the file path from the directory instead of repeating the full
# absolute path: joining two absolute paths silently drops the first one.
file1 = os.path.join(path, 'dev.tsv')


col_names = ['text','sent']

# Create dataframe (tab-separated file, no header row; columns named explicitly)
df = pd.read_csv(file1,delimiter="\t",names=col_names)
df.head()

Unnamed: 0,text,sent
0,so <user> site crashes everytime i try to book...,-3
1,theme of week : ask the lord for strength & pe...,-2
2,"<user> why announcing so late , it will be har...",-3
3,the greatest happiness is seeing someone you l...,3
4,omg so grateful to have an education but ive b...,1


In [3]:
# Create tokens with simple split on whitespace
def simple_token(s):
    """Split ``s`` on runs of whitespace and return the pieces as a list."""
    pieces = s.split()
    return pieces

# Get token length
def token_length(s):
    """Return the number of items in ``s`` (here: tokens in one tweet)."""
    n_items = len(s)
    return n_items

# Tokenise every tweet and store the token lists in a new column
token_lists = df['text'].apply(simple_token)
df['tokens'] = token_lists
df.head()

Unnamed: 0,text,sent,tokens
0,so <user> site crashes everytime i try to book...,-3,"[so, <user>, site, crashes, everytime, i, try,..."
1,theme of week : ask the lord for strength & pe...,-2,"[theme, of, week, :, ask, the, lord, for, stre..."
2,"<user> why announcing so late , it will be har...",-3,"[<user>, why, announcing, so, late, ,, it, wil..."
3,the greatest happiness is seeing someone you l...,3,"[the, greatest, happiness, is, seeing, someone..."
4,omg so grateful to have an education but ive b...,1,"[omg, so, grateful, to, have, an, education, b..."


In [4]:
# Token count per tweet, computed from the raw text rather than from
# df['tokens']: that column is later overwritten with padded lists, so
# re-running this cell after padding would otherwise report the padded
# length for every row.
df['len'] = df['text'].apply(simple_token).apply(token_length)
df.head()

Unnamed: 0,text,sent,tokens,len
0,so <user> site crashes everytime i try to book...,-3,"[so, <user>, site, crashes, everytime, i, try,...",32
1,theme of week : ask the lord for strength & pe...,-2,"[theme, of, week, :, ask, the, lord, for, stre...",28
2,"<user> why announcing so late , it will be har...",-3,"[<user>, why, announcing, so, late, ,, it, wil...",24
3,the greatest happiness is seeing someone you l...,3,"[the, greatest, happiness, is, seeing, someone...",17
4,omg so grateful to have an education but ive b...,1,"[omg, so, grateful, to, have, an, education, b...",31


In [5]:
# Find the tweet with the longest length
longest_tweet_len = df['len'].max()  # largest token count over all tweets
print("The largest token has length =", longest_tweet_len)

# Fixed length every token list is padded to
m = 80
print("Set max token length =",m, "(used for padding)")

The largest token has length = 65
Set max token length = 80 (used for padding)


In [6]:
# Create "embedding lists" of equal size -- pad with empty characters, e.g. ""
# https://stackoverflow.com/questions/24066904/most-pythonic-way-to-extend-a-list-to-exactly-a-certain-length
def pad_list(some_list, target_len):
    """Return a new list of exactly ``target_len`` items built from ``some_list``.

    Longer inputs are cut to their first ``target_len`` items; shorter ones
    are right-padded with empty strings. The input list is not modified.
    """
    head = some_list[:target_len]
    missing = target_len - len(some_list)
    # A non-positive count multiplies to an empty list, so nothing is appended.
    filler = [""] * missing
    return head + filler

# Pad (or truncate) every tweet's token list to exactly m entries
df['tokens'] = df['tokens'].apply(lambda toks: pad_list(toks, m))

In [7]:
# Preview the first rows of df (columns: text, sent, tokens, len)
df.head()

Unnamed: 0,text,sent,tokens,len
0,so <user> site crashes everytime i try to book...,-3,"[so, <user>, site, crashes, everytime, i, try,...",32
1,theme of week : ask the lord for strength & pe...,-2,"[theme, of, week, :, ask, the, lord, for, stre...",28
2,"<user> why announcing so late , it will be har...",-3,"[<user>, why, announcing, so, late, ,, it, wil...",24
3,the greatest happiness is seeing someone you l...,3,"[the, greatest, happiness, is, seeing, someone...",17
4,omg so grateful to have an education but ive b...,1,"[omg, so, grateful, to, have, an, education, b...",31


In [8]:
# Shape of the first tweet's token list
first_tweet_tokens = df['tokens'].iloc[0]
np.shape(first_tweet_tokens)

(80,)

In [25]:
# Get the elmo embeddings
def elmo_tweet_embedder(tokens,len_list):
    """Evaluate the ELMo module's "word_emb" output for one batch of tweets.

    Args:
        tokens: batch of token lists, passed as the module's "tokens" input
            (a list per tweet rather than a single [word]).
        len_list: one length per tweet, passed as "sequence_len".

    Returns:
        The value obtained by running the "word_emb" tensor in a session.
    """
    module_inputs = {"tokens": tokens, "sequence_len": len_list}
    module_outputs = elmo(inputs=module_inputs, signature="tokens", as_dict=True)
    word_emb_tensor = module_outputs["word_emb"]

    # A new session is opened, and the variables initialised, on every call.
    # NOTE(review): each call presumably also adds new ops to the default
    # graph -- confirm if memory grows over many batches.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(word_emb_tensor)


In [12]:
# Preview the first rows of df again (columns: text, sent, tokens, len)
df.head()

Unnamed: 0,text,sent,tokens,len
0,so <user> site crashes everytime i try to book...,-3,"[so, <user>, site, crashes, everytime, i, try,...",32
1,theme of week : ask the lord for strength & pe...,-2,"[theme, of, week, :, ask, the, lord, for, stre...",28
2,"<user> why announcing so late , it will be har...",-3,"[<user>, why, announcing, so, late, ,, it, wil...",24
3,the greatest happiness is seeing someone you l...,3,"[the, greatest, happiness, is, seeing, someone...",17
4,omg so grateful to have an education but ive b...,1,"[omg, so, grateful, to, have, an, education, b...",31


In [51]:
# Resource to split data into smaller batches: http://bit.ly/2P4J8HJ
batch_size = 100  # tweets embedded per ELMo call
batch_starts = range(0, df.shape[0], batch_size)
text_batches = [df['tokens'].iloc[i:i+batch_size] for i in batch_starts]

# ELMo's "tokens" signature takes the true (unpadded) length of each sequence
# as sequence_len, so pass the real token counts rather than the padded
# length m. Clip to m because pad_list truncates longer tweets to m tokens.
true_lens = df['len'].clip(upper=m)
len_lists = [true_lens.iloc[i:i+batch_size].tolist() for i in batch_starts]

# Create list of sentiments (y values)

y = df['sent'].to_numpy()

In [42]:
# Embed each batch in turn; text_batches and len_lists are aligned by position
embeddings = [
    elmo_tweet_embedder(batch.tolist(), batch_lens)
    for batch, batch_lens in zip(text_batches, len_lists)
]

In [43]:
# Print the shape of each per-batch embedding array, one per line
for batch_embedding in embeddings:
    batch_shape = np.shape(batch_embedding)
    print(batch_shape)

(100, 80, 512)
(100, 80, 512)
(100, 80, 512)
(100, 80, 512)
(49, 80, 512)


In [44]:
# Stack the per-batch arrays along the first axis into one array.
# NOTE(review): this rebinds `embeddings` from a list to an array, so the
# cell should not be re-run without rebuilding the list first.
embeddings = np.concatenate(embeddings)
embeddings.shape

(449, 80, 512)

In [50]:
# Check the type of the concatenated embeddings
type(embeddings)

numpy.ndarray