In [None]:
%tensorflow_version 1.x
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
pd.set_option('display.max_colwidth', 200)
import tensorflow_hub as hub
import tensorflow as tf 
print(tf.__version__) # use 1.15 version for elmo

# load spacy's language model to lemmatize later on
# (the parser and NER pipeline components are disabled at load time;
#  Cleaner.lemmatize only reads token.lemma_ and token.orth_)
# NOTE(review): the 'en' shortcut name and the '-PRON-' lemma checked in
# Cleaner.lemmatize look like spaCy 2.x conventions - confirm the pinned version
nlp = spacy.load('en', disable=['parser', 'ner'])

TensorFlow 1.x selected.
1.15.2


In [None]:
# Mount Google Drive at /content/drive; the CSV files read further down
# are loaded from a folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# load elmo (version 2 of the module, fetched from TF-Hub via the TF1 hub.Module API)
# NOTE(review): trainable=True is requested here, but no training step is visible
# in this notebook - confirm whether the module really needs to be trainable
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)


In [None]:
# read train and test data: columns: id, (label), tweet
# DATA_DIR is the single place to edit when the CSV files live somewhere else
DATA_DIR = "/content/drive/My Drive/machine_learning/data_tweet_sentiment"
train = pd.read_csv(DATA_DIR + "/train_tweets.csv")
test = pd.read_csv(DATA_DIR + "/test_tweets.csv")


In [None]:
# check shape + content of training set
# a notebook cell only displays its LAST expression, so the shape has to be
# printed explicitly; a bare `train.shape` above `train` is silently discarded
print(train.shape)
train


Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile #sony #music #headphones https://instagram.com/p/5spiNsJ_c9/
7916,7917,0,"We would like to wish you an amazing day! Make every minute count #tls #today #iphone #accessories #news #life February 23, 2017 at 0…"
7917,7918,0,Helping my lovely 90 year old neighbor with her iPad this morning has just made me realise that 'I' don't actually need an I pad!
7918,7919,0,"Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/"


In [None]:
# check class distribution: with normalize=True, value_counts returns a Series holding
# the relative frequency (fraction of rows) of each label instead of the raw counts
train['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [None]:
# make Cleaner class
class Cleaner:
    """Single-step text normalizers for the text of one dataframe cell.

    Every method reads the string stored on the instance and returns a new
    string; the stored string itself is never modified, so the steps are
    chained by wrapping each result in a fresh ``Cleaner``.
    """

    def __init__(self, df_row):
        # the text to clean (one tweet)
        self.df_row = df_row

    def remove_url(self):
        """Return the text with every 'http' + non-whitespace run deleted."""
        return re.sub(r'http\S+', '', self.df_row)

    def remove_punct(self):
        """Return the lower-cased text, keeping only word characters and whitespace."""
        lowered = self.df_row.lower()
        return re.sub(r'[^\w\s]', '', lowered)

    def remove_nums(self):
        """Return the text with the digits 0-9 deleted."""
        return re.sub('[0-9]', "", self.df_row)

    def remove_whitespace(self):
        """Return the text without leading and trailing whitespace."""
        return self.df_row.strip()

    def lemmatize(self):
        """Return the lemmas of the text joined by single spaces.

        Tokens whose lemma is the '-PRON-' marker keep their original
        spelling, so pronouns survive lemmatization.
        """
        pieces = []
        for token in nlp(self.df_row):
            if token.lemma_ == '-PRON-':
                pieces.append(token.orth_)
            else:
                pieces.append(token.lemma_)
        return ' '.join(pieces)
        
        
        

In [None]:
# apply Cleaner class methods to train and test set

# Cleaning steps, listed in the order they are applied to every tweet.
CLEANING_STEPS = (
    Cleaner.remove_url,
    Cleaner.remove_punct,
    Cleaner.remove_nums,
    Cleaner.remove_whitespace,
    Cleaner.lemmatize,
)


def clean_tweet(text):
    """Run one raw tweet through all cleaning steps and return the cleaned string.

    Each step receives the output of the previous one, wrapped in a fresh
    Cleaner, exactly like the former column-by-column version did.
    """
    for step in CLEANING_STEPS:
        text = step(Cleaner(text))
    return text


# same pipeline for both sets, so train and test cannot drift apart
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].apply(clean_tweet)

In [None]:
# check if methods were applied right: display the frame, which now carries the
# 'clean_tweet' column next to the raw 'tweet' column
train

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android app beautiful cute health iger iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thank to my uncle yay sony xperia s sonyexperias
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememorie unplug relax iphone smartphone wifi connect
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i be wire i know i be george i be make that way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk to me about a question i have unless i pay them for their stupid support
...,...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile #sony #music #headphones https://instagram.com/p/5spiNsJ_c9/,live out loud lol liveoutloud selfie smile sony music headphone
7916,7917,0,"We would like to wish you an amazing day! Make every minute count #tls #today #iphone #accessories #news #life February 23, 2017 at 0…",we would like to wish you an amazing day make every minute count tls today iphone accessory news life february at
7917,7918,0,Helping my lovely 90 year old neighbor with her iPad this morning has just made me realise that 'I' don't actually need an I pad!,help my lovely year old neighbor with her ipad this morning have just make me realise that i do not actually need an i pad
7918,7919,0,"Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/",finally get my smart pocket wifi stay connected anytimeanywhere ipad and samsung s gadget


In [None]:
# to not waste too much memory, get word embeddings in batches and not in one go
# split train and test set into batches of BATCH_SIZE samples each, store them in lists

BATCH_SIZE = 100  # number of tweets embedded per ELMo run


def split_into_batches(frame, batch_size=BATCH_SIZE):
    """Return consecutive row slices of ``frame`` holding at most ``batch_size`` rows each.

    The last slice is shorter when the row count is not a multiple of
    ``batch_size``; an empty frame yields an empty list.
    """
    return [frame[start:start + batch_size]
            for start in range(0, frame.shape[0], batch_size)]


list_train = split_into_batches(train)
list_test = split_into_batches(test)

In [None]:
# create vectors

# Placeholder, pooled-output tensor and session: created on the first call of
# get_elmo_vectors and reused by every later call.
_ELMO_RUNTIME = {}


def get_elmo_vectors(x):
    """Return one averaged ELMo vector per text in ``x``.

    Args:
        x: pandas Series (anything with ``.tolist()``) of untokenized
           sentences; the module's "default" signature tokenizes them itself.

    Returns:
        The evaluated "default" output of the ELMo module: one row per input
        text. Per the TF-Hub module documentation this is a fixed mean-pooling
        of the contextualized word representations with shape
        [batch_size, 1024].

    Why this differs from applying the module and opening a session per call:
      * Applying the module on every call added another copy of its ops to
        the default graph, and each call re-initialised all variables in a
        brand-new session. The graph is now built once around a string
        placeholder and a single session is reused.
      * tf.reduce_mean over axis 1 of the "elmo" output averages over the
        padded time axis as well, so a tweet's vector depended on the longest
        tweet in its batch. The module's own pooled "default" output is used
        instead. NOTE(review): this is expected to ignore padded positions -
        confirm once on a batch with mixed lengths.

    The session stays open for the lifetime of the kernel; close it with
    ``_ELMO_RUNTIME["session"].close()`` when the memory is needed.
    """
    if not _ELMO_RUNTIME:
        sentences = tf.placeholder(tf.string, shape=[None])
        pooled = elmo(sentences, signature="default", as_dict=True)["default"]

        # bind the session to the graph the module was applied in
        session = tf.Session(graph=pooled.graph)
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])

        _ELMO_RUNTIME.update(sentences=sentences, pooled=pooled, session=session)

    return _ELMO_RUNTIME["session"].run(
        _ELMO_RUNTIME["pooled"],
        feed_dict={_ELMO_RUNTIME["sentences"]: x.tolist()},
    )



In [None]:
# get elmo embeddings

def embed_batches(batches):
    """Return a list with one get_elmo_vectors result per batch, in batch order.

    Only the 'clean_tweet' column of each batch is passed on.
    """
    vectors = []
    for batch in batches:
        vectors.append(get_elmo_vectors(batch['clean_tweet']))
    return vectors


elmo_train = embed_batches(list_train)
elmo_test = embed_batches(list_test)


In [None]:
# concatenate back into single array
elmo_train_array = np.concatenate(elmo_train, axis=0)
elmo_test_array = np.concatenate(elmo_test, axis=0)

# sanity check before anything is persisted: exactly one embedding row per tweet
assert elmo_train_array.shape[0] == train.shape[0], "train embeddings and train rows differ in count"
assert elmo_test_array.shape[0] == test.shape[0], "test embeddings and test rows differ in count"

In [None]:
# save the vectors as pickle-files
# the with-blocks close each file even when pickle.dump raises, so no
# half-written file handle is left open
with open("elmo_train.pickle", "wb") as pickle_train:
    pickle.dump(elmo_train_array, pickle_train)

with open("elmo_test.pickle", "wb") as pickle_test:
    pickle.dump(elmo_test_array, pickle_test)