In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

# To keep TensorFlow off the GPU (it otherwise runs out of memory), uncomment the os.environ line below.
# https://stackoverflow.com/questions/44552585/prevent-tensorflow-from-accessing-the-gpu
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import tensorflow_hub as hub
import tensorflow as tf

import nltk
import regex as re
import emoji as em

import itertools as it

# Load the ELMo v2 module from TensorFlow Hub. The resulting `elmo` callable is
# what elmo_tweet_embedder invokes to build the embedding tensor.
elmo = hub.Module("https://tfhub.dev/google/elmo/2")

W0415 17:04:47.558882 139878782445376 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# FUNCTIONS

# process tweet
# Patterns are compiled once at definition time instead of on every call.
# Raw strings keep "\s" and "\b" from being parsed as string escapes.
_HASHTAG_RE = re.compile(r'#')
_MENTION_RE = re.compile(r'@[^\s]+')
_LINK_RE = re.compile(r'https?://[^\s]+')
# Word boundaries limit the match to a standalone "RT" marker, so the letters
# "RT" inside ordinary words (e.g. "START", "ALERT") are left untouched.
_RETWEET_RE = re.compile(r'\bRT\b')
_EXTRA_SPACES_RE = re.compile(r'\s{2,}')


def preprocess(tweet):
    """Clean a raw tweet string.

    Steps, applied in this order:
      1. drop the '#' character (the hashtag word itself is kept),
      2. remove @mentions,
      3. remove http/https links,
      4. remove the standalone retweet marker "RT",
      5. collapse runs of two or more whitespace characters into one space,
      6. replace emoji with their text names via ``em.demojize``.

    Parameters
    ----------
    tweet : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces left behind by the
        removals are not stripped.
    """
    cleaned = _HASHTAG_RE.sub('', tweet)
    cleaned = _MENTION_RE.sub('', cleaned)
    cleaned = _LINK_RE.sub('', cleaned)
    cleaned = _RETWEET_RE.sub('', cleaned)
    # Collapse whitespace only after the removals, which can leave gaps behind.
    cleaned = _EXTRA_SPACES_RE.sub(' ', cleaned)
    # convert emoticons into words
    return em.demojize(cleaned)

# Create tokens with simple split on whitespace
def simple_token(s):
    """Split ``s`` on runs of whitespace and return the resulting list of tokens."""
    tokens = s.split()
    return tokens

# Get token length
def token_length(s):
    """Return ``len(s)``: the token count for a list, the character count for a string."""
    count = len(s)
    return count

# Create token lists of equal size -- truncate long lists and pad short ones with empty strings ("")
# https://stackoverflow.com/questions/24066904/most-pythonic-way-to-extend-a-list-to-exactly-a-certain-length
def pad_list(some_list, target_len):
    """Return a new list of exactly ``target_len`` items built from ``some_list``.

    Longer inputs are truncated to their first ``target_len`` items; shorter
    inputs are extended on the right with empty strings. The input list is
    not modified.
    """
    shortfall = target_len - len(some_list)
    head = some_list[:target_len]
    # range() of a non-positive count is empty, so no filler is added then.
    filler = ["" for _ in range(shortfall)]
    return head + filler

# Get the elmo embeddings
def elmo_tweet_embedder(tokens,len_list):
    """Evaluate the ELMo module's "word_emb" output for a batch of token lists.

    Parameters
    ----------
    tokens : list of list of str
        One token list per example, passed to the module's "tokens" signature.
        Presumably all lists must have the same length -- confirm against the
        module documentation.
    len_list : list of int
        Sequence length for each example, passed as "sequence_len".

    Returns
    -------
    The value produced by running the "word_emb" tensor in a session.

    Note: every call opens a fresh ``tf.Session`` and runs the global
    variable initializer before evaluating the tensor.
    """
    module_inputs = {"tokens": tokens, "sequence_len": len_list}
    module_outputs = elmo(inputs=module_inputs, signature="tokens", as_dict=True)
    word_emb_tensor = module_outputs["word_emb"]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(word_emb_tensor)

# combine all together
def make_embeddings(df, max_seq_len, batch_size=7000):
    """Tokenize ``df['text']``, pad to a fixed length and embed it batch by batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column (strings to embed) and a 'sent' column
        (returned unchanged as ``y``). A 'tokens' column holding the padded
        token lists is added to (or overwritten in) ``df`` as a side effect.
    max_seq_len : int
        Every token list is truncated or padded with "" to this length.
    batch_size : int, optional
        Number of rows sent to ``elmo_tweet_embedder`` per call. Defaults to
        7000, the value previously hard-coded here.

    Returns
    -------
    embeddings : numpy.ndarray
        The per-batch results concatenated along axis 0.
    y : numpy.ndarray
        ``df['sent']`` as a NumPy array.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or ``df`` has no rows.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(df) == 0:
        # np.concatenate would otherwise fail later with a less helpful message.
        raise ValueError("df is empty; there is nothing to embed")

    # tokenize with a simple whitespace split
    tokens = df['text'].apply(simple_token)
    print("Token creation complete")

    # pad/truncate each token list, then store the result on df once
    tokens = tokens.apply(lambda toks: pad_list(toks, max_seq_len))
    df['tokens'] = tokens
    print("Token padding complete")

    # create list of sentiments (y values) before the expensive embedding step
    y = df['sent'].to_numpy()

    # embed the rows in consecutive batches of `batch_size`
    embeddings = []
    for batch_no, start in enumerate(range(0, len(tokens), batch_size)):
        print("batch", batch_no)
        batch_tokens = tokens.iloc[start:start + batch_size].tolist()
        # NOTE(review): every row reports max_seq_len as its sequence length,
        # so "" padding is counted as real tokens -- confirm this is intended.
        batch_lengths = [max_seq_len] * len(batch_tokens)
        embeddings.append(elmo_tweet_embedder(batch_tokens, batch_lengths))

    embeddings = np.concatenate(embeddings, axis=0)

    return embeddings, y

In [3]:
# Load the raw vocabulary: one entry per line of the text file.
# (An alternative input used previously was 'vocab_complete.txt' in the same directory.)
path = '/home/tim/Documents/Sentiment/Data/dictionary_raw'
file = 'text.txt'

p = os.path.join(path,file)

# Read all lines, removing only the trailing newline from each one.
# The with-block guarantees the file is closed even if reading fails.
with open(p, "r") as f:
    entries = [line.rstrip("\n") for line in f]

# create df: 'sent' and 'text' both hold the entry itself. make_embeddings
# tokenizes 'text' and returns 'sent' as y, which the later cell uses as the
# dictionary key for each embedding.
col_names = ['sent']
df = pd.DataFrame(entries,columns=col_names)
df['text']=df['sent']
df.head()

Unnamed: 0,sent,text
0,<user>,<user>
1,.,.
2,:,:
3,rt,rt
4,",",","


In [4]:
# Number of vocabulary entries loaded (one row per entry).
len(df.index)

33

In [5]:
# Embed every entry as a sequence of length 1, then map each entry (y) to the
# embedding of its single token position (index 0 of that row).
X, y = make_embeddings(df, 1)
d = {entry: X[row][0] for row, entry in enumerate(y)}

Token creation complete
Token padding complete
batch 0


In [6]:
# Create Pickles
pickle_path = '/home/tim/Documents/Sentiment/Data/pickles'

# elmo embeddings: persist the entry -> embedding dictionary `d`.
# The with-block closes the file even if pickle.dump raises.
with open(os.path.join(pickle_path,'elmo_embeddings.pickle'),"wb") as pickle_out:
    pickle.dump(d,pickle_out)