In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle

# prevent tensorflow from using GPU. Otherwise, run out of memory
# https://stackoverflow.com/questions/44552585/prevent-tensorflow-from-accessing-the-gpu
# os.environ["CUDA_VISIBLE_DEVICES"]="-1"

import tensorflow_hub as hub
import tensorflow as tf

import nltk
import regex as re
import emoji as em

elmo = hub.Module("https://tfhub.dev/google/elmo/2")

W0415 11:01:52.426144 140286343653184 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# FUNCTIONS

# process tweet
def preprocess(tweet):
    #define hashtag pattern
    hashtag_define = re.compile ('#')
    #define mention pattern 
    mentions_define = re.compile('@[^\s]+')
    #define link pattern
    link_define = re.compile('https?://[^\s]+')
    # Haystack define
    haystack_define = re.compile('(RT)')
    # define long spaces
    extra_spaces_define = re.compile('\s{2,}')
    #remove hashtags
    tweet_refine = hashtag_define.sub('',tweet)
    # remove mentions
    tweet_refine = mentions_define.sub('',tweet_refine)
    # remove links
    tweet_refine = link_define.sub('',tweet_refine)
    # remove haystack
    tweet_refine = haystack_define.sub('',tweet_refine)
    #replace long spaces with one space
    tweet_refine = extra_spaces_define.sub(' ',tweet_refine)
    # convert emoticons into words
    tweet_refine = em.demojize(tweet_refine)
    return tweet_refine

# Create tokens with simple split on whitespace
def simple_token(s):
    return s.split()

# Get token length
def token_length(s):
    return len(s)

# Create "embedding lists" of equal size -- pad with empty characters, e.g. ""
# https://stackoverflow.com/questions/24066904/most-pythonic-way-to-extend-a-list-to-exactly-a-certain-length
def pad_list(some_list, target_len):
    return some_list[:target_len] + [""]*(target_len - len(some_list))

# Get the elmo embeddings
def elmo_tweet_embedder(tokens,len_list):
    
    tokens_input = tokens #load a tweet
    tokens_length = len_list # get length of tweet

    #create embedding
    embedding_tensor = elmo(inputs={"tokens":tokens_input,"sequence_len":tokens_length},
                            signature="tokens", as_dict=True)["word_emb"] # <-- passing in a list instead of [word]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        embedding = sess.run(embedding_tensor)
        return embedding

# combine all together
def make_embeddings(df, max_seq_len):
    m = max_seq_len
    
    # create a token column in df
    df['tokens'] = df['text'].apply(simple_token)
    
    # pad the tokens
    df['tokens'] = df.apply(lambda x: pad_list(x['tokens'],m),axis=1)
    
    # split data into smaller batches of size 100. http://bit.ly/2P4J8HJ
    text_batches = [df['tokens'][i:i+100] for i in range(0,df.shape[0],100)]
    len_lists = [[m] * len(x) for x in text_batches]

    # create list of sentiments (y values)
    y = df['sent'].to_numpy()
    
    # make embeddings
    embeddings = []
    for i in range(0,len(text_batches)):
        elmo_train = elmo_tweet_embedder(text_batches[i].tolist(),len_lists[i])
        embeddings.append(elmo_train)
        
    embeddings = np.concatenate(embeddings, axis=0)
    
    return embeddings, y

In [15]:
path = '/home/tim/Documents/Sentiment/Data/sent140_raw'
file = 'training.1600000.processed.noemoticon.csv'

p = os.path.join(path,file)
col_names = ['sent','date','q_type','handle','text']
df = pd.read_csv(p, sep=',', encoding='latin-1',names=col_names)
df.head()

Unnamed: 0,sent,date,q_type,handle,text
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [16]:
df['text'] = df['text'].apply(preprocess)

In [17]:
df.head()

Unnamed: 0,sent,date,q_type,handle,text
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"- Awww, that's a bummer. You shoulda got Davi..."
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,I dived many times for the ball. Managed to s...
0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"no, it's not behaving at all. i'm mad. why am..."


In [18]:
X, y = make_embeddings(df, 80)

KeyboardInterrupt: 

In [9]:
path = '/home/tim/Documents/Sentiment/Data/processed/'
file_list = os.listdir(path)

# create the data set names
names = []
for f in file_list:
    f = f.split('.')
    l = ["x_"+f[0],"y_"+f[0]]
    names.append(l)

data_names = names
    
# index through files and create x and y sets
for i in range(0,len(file_list)):
    
    s = names
   
    p = os.path.join(path,file_list[i])
    col_names = ['text','sent']
    df = pd.read_csv(p,delimiter="\t",names=col_names)

    print(s[i][0], "start")
    s[i][0], s[i][1] = make_embeddings(df, 80) 

x_train = s[0][0]
y_train = s[0][1]
x_dev = s[1][0]
y_dev = s[1][1]
x_test = s[2][0]
y_test = s[2][1]

x_train start
x_dev start
x_test start


In [15]:
# Create Pickles
pickle_path = '/home/tim/Documents/Sentiment/Data/pickles'

# x_train
pickle_out = open(os.path.join(pickle_path,'x_train.pickle'),"wb")
pickle.dump(x_train, pickle_out)
pickle_out.close()

# y_train
pickle_out = open(os.path.join(pickle_path,'y_train.pickle'),"wb")
pickle.dump(y_train, pickle_out)
pickle_out.close()

# x_dev
pickle_out = open(os.path.join(pickle_path,'x_dev.pickle'),"wb")
pickle.dump(x_dev, pickle_out)
pickle_out.close()

# y_dev
pickle_out = open(os.path.join(pickle_path,'y_dev.pickle'),"wb")
pickle.dump(y_dev, pickle_out)
pickle_out.close()

# x_test
pickle_out = open(os.path.join(pickle_path,'x_test.pickle'),"wb")
pickle.dump(x_test, pickle_out)
pickle_out.close()

# y_test
pickle_out = open(os.path.join(pickle_path,'y_test.pickle'),"wb")
pickle.dump(y_test, pickle_out)
pickle_out.close()