In [15]:
################################################
# intialize glove
################################################
# >pip install glove_python
# >wget http://www.google.com/search?q=glove+word+embeddings+download+those+dope+pre+trained+vectors
# >unzip dope_glove_shit.zip
# >cp cmps242_hw5_config.py.example cmps242_hw5_config.py
# >echo "set GLOVE_LOCATION in cmps242_hw5_config.py to one of those files"
################################################
from cmps242_hw5_config import *
import numpy as np

print("Manually importing glove vectors")

# glove data
glove_data = dict()
max_glove_val = 0
with open(GLOVE_LOCATION, encoding='utf8') as glovein:
    for line in glovein:
        line = line.split()
        word = line[0]
        vec = [float(line[i]) for i in range(1,len(line))]
        for g in vec:
            if abs(g) > max_glove_val: max_glove_val = abs(g)
        glove_data[word] = vec
    print("Imported {} words, with max glove val {}".format(len(glove_data), max_glove_val))

glove_dict_length = len(vec)
def get_glove_vector(tokens, glove_information=glove_data, glove_vec_length=glove_dict_length):
    temp_vec = [[0.0] for _ in range(glove_vec_length)]

    for word in tokens:
        if word not in glove_information: continue
        glove_vec = glove_information[word]
        for i in range(glove_vec_length):
            temp_vec[i].append(glove_vec[i])
    
    vec = []
    for i in range(glove_vec_length):
        vec.append(np.mean(temp_vec[i]))
        vec.append(np.std(temp_vec[i]))

    return vec

Manually importing glove vectors
Imported 400000 words, with max glove val 5.4593


In [16]:
################################################
# file parsing functions
################################################
from nltk.tokenize import TweetTokenizer
import string, re
import collections
import numpy as np
import glove
from glove.glove_cython import fit_vectors, transform_paragraph

# definitions
HC="HillaryClinton"
DT="realDonaldTrump"
NA="none"
HANDLES = [HC,DT,NA]
HANDLE_MAP = {NA:-1, HC:0, DT:1}

# read csv file, return handles and tweets
def parse_tweet_csv(file, file_encoding="utf8"):
    # init
    handles, tweets = [], []
    
    # read file
    linenr = -1
    with open(file, encoding=file_encoding) as input:
        try:
            for line in input:
                linenr += 1
                if linenr == 0: continue
                
                # get contents
                line = line.split(",")
                if line[0] in HANDLES: #label and irst line of tweet
                    handles.append(line[0])
                    tweet = ','.join(line[1:])
                    tweets.append(tweet)
                else: #second+ line of tweet
                    tweet = tweets.pop()
                    tweet += ','.join(line)
                    tweets.append(tweet)
        except Exception as e:
            print("Exception at line {}: {}".format(linenr, e))
            raise e
    
    # sanity checks
    assert len(handles) == len(tweets)
    print("Found {} tweets in {} lines".format(len(tweets), linenr + 1))
    
    # return data
    return handles, tweets


##########################################
### coverting tweet strings to numbers ###

# coverting labels to integers
def int_labels(labels):
    return list(map(lambda x: HANDLE_MAP[x], labels))

#tokenizing
_tokenizer = TweetTokenizer()
_punctuation = set(string.punctuation)
def tokenize(tweet, lowercase=True, strip_urls=True, strip_punctuation=True):
    tokens = _tokenizer.tokenize(tweet)
    if lowercase: tokens = list(map(lambda x: x.lower(), tokens))
    if strip_urls: tokens = list(filter(lambda x: not x.startswith("http"), tokens))
    if strip_punctuation: #https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
        tokens = list(filter(lambda x: x.startswith(u'@') or x.startswith(u'#') or x not in _punctuation and not re.match(u"[^\w\d'\s$]+", x), tokens))
    return tokens


# get all tweets
def import_text(tweets):
    return [get_glove_vector(tokenize(tweet)) for tweet in tweets]

In [17]:
################################################
# get raw test data
################################################
import random

# init
TEST_RATIO = 0.1
assert TEST_RATIO > 0 and TEST_RATIO < 1

# get data
text_handles, raw_tweets = parse_tweet_csv("train.csv")
handles = int_labels(text_handles)
tweets = import_text(raw_tweets)   
data_vector_size = len(tweets[0])

### validation
for i in range(1):
    tweet = raw_tweets[random.randint(0, len(raw_tweets))]
    print(tokenize(tweet))
    print(get_glove_vector(tokenize(tweet)))
    print()
# for handle in int_labels(handles[0:7]):
#     print(handle)

Found 4743 tweets in 6251 lines
['the', 'first', 'in', 'a', 'new', 'series', 'a', 'message', 'from', 'your', 'potential', 'next', 'president', 'on', 'pregnancy']
[0.2992816875, 0.31553719851067452, 0.39200687499999998, 0.20576131146667095, -0.078937818749999986, 0.43497225748163271, 0.036343374999999997, 0.51416561809485017, 0.29410268750000002, 0.50567872089211929, 0.28678837500000004, 0.71390184460872097, -0.58061412499999998, 0.43955127129933247, -0.16351031249999998, 0.30235273835305321, 0.096402960624999992, 0.61435490196876263, -0.047386687500000003, 0.34502558174027448, -0.13158525000000004, 0.32708791771622731, -0.040319874999999998, 0.5057580890309461, -0.35178218750000001, 0.46987363940082588, 0.13452987500000002, 0.37077039688024499, 0.40985453125000004, 0.41723479237832428, 0.0028652499999999963, 0.4251665935958957, -0.25630468750000002, 0.46280584956095772, -0.17510508125000002, 0.38136119423626341, -0.49419000000000002, 0.74748610662673864, -0.16181806249999997, 0.4204325

In [19]:
################################################
# split test data into train and test
################################################
import pandas as pd

LABEL = 'handle'
DATA = 'tweet_data'
LENGTH = 'length'

# we get floats from glove, so we need to convert them to integers.
# the glove floats are both positive and negative, and I'm not sure
# what the min/max are, so we normalize over that value, and then
# scale them to a particular granularity
# TODO how do we give floats to tensorflow?
# TODO what's the min/max we'll get from this glove library?

# this is so the glove output is in integer form (maybe not necessary)
FLOAT_GRANULARITY = (1 << 16)
VOCAB_SIZE = 2 * FLOAT_GRANULARITY + 1 # +/- and exclusive

# this is so we can define a range for the values
# value_max = 0
# for tweet in tweets:
#     for x in tweet:
#         if abs(x) > value_max: value_max = abs(x)
# print("Got max value of {}".format(value_max))
value_max = 10 # so we're not dependent on what the data is

# split into test and train
train_labels, train_data, test_labels, test_data = list(), list(), list(), list()
for handle, tweet in zip(handles, tweets):
#     if np.isnan(tweet[0]): continue #a row of all nan's happens with data that glove can't understand (like, all hashtags)
#     tweet = [list(map(lambda x: int(x / value_max * FLOAT_GRANULARITY + FLOAT_GRANULARITY), word)) for word in tweet]
    tweet = list(map(lambda x: int(x / value_max * FLOAT_GRANULARITY + FLOAT_GRANULARITY), tweet))
    if random.random() < TEST_RATIO:
        test_labels.append(handle)
        test_data.append(tweet)
    else:
        train_labels.append(handle)
        train_data.append(tweet)

# document and validate
print("Separated into {} train and {} test ({}%)\n".format(len(train_data), len(test_data), 
                                                         int(100.0 * len(test_data) / len(raw_tweets))))
assert len(train_labels) == len(train_data) and len(train_data) > 0
assert len(test_labels) == len(test_data) and len(test_data) > 0
assert len(test_labels) > len(tweets) * (TEST_RATIO - .05)
assert len(test_labels) < len(tweets) * (TEST_RATIO + .05) 

# save to dataframe
train = pd.DataFrame({
    LABEL: train_labels,
    DATA: train_data,
    LENGTH: [data_vector_size for _ in range(len(train_data))]
})
test = pd.DataFrame({
    LABEL: test_labels,
    DATA: test_data,
    LENGTH: [data_vector_size for _ in range(len(test_data))]
})
print(train.head())

Separated into 4263 train and 480 test (10%)

   handle  length                                         tweet_data
0       0     100  [67949, 68325, 66056, 68118, 65716, 68268, 649...
1       0     100  [66689, 67325, 65331, 68355, 66513, 67626, 648...
2       0     100  [67627, 67406, 64699, 66789, 66828, 67378, 641...
3       0     100  [67154, 69252, 65796, 67757, 66097, 68693, 646...
4       1     100  [64800, 69204, 67689, 67783, 64180, 68449, 658...


In [23]:
################################################
# initializing our tensor
#
# based off of blogpost david parks showed us:
# https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html
################################################
import tensorflow as tf
 
 
class DataIterator():
    def __init__(self, df):
        self.df = df
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        self.df = self.df.sample(frac=1).reset_index(drop=True)
        self.cursor = 0

    def next_batch(self, n):
        if self.cursor + n - 1 > self.size:
            self.epochs += 1
            self.shuffle()
        res = self.df.ix[self.cursor:self.cursor + n - 1]
        start_idx = self.cursor
        self.cursor += n
        # return res[DATA], res[LABEL], res[LENGTH]
        # the above line fails.  an error is thrown when tf attempts to call np.asarray on this.
        # what is different about how our data is organized compared to the blog post this came from?
        # TODO 
        data = res[DATA]
        labels = res[LABEL]
        length = res[LENGTH]
        return np.asarray([data[i] for i in range(start_idx, start_idx + len(data))]), \
               np.asarray([labels[i] for i in range(start_idx, start_idx + len(labels))]), \
               np.asarray([length[i] for i in range(start_idx, start_idx + len(length))])


def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()


def build_graph(vocab_size = VOCAB_SIZE, state_size = 64, batch_size = 256, num_classes = 2):

    reset_graph()

    # Placeholders
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32, [batch_size])
    y = tf.placeholder(tf.int32, [batch_size])
    keep_prob = tf.placeholder_with_default(1.0, [])

    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    # RNN
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state', [1, state_size],
                                 initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state, [batch_size, 1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen,
                                                 initial_state=init_state)

    # Add dropout, as the model otherwise quickly overfits
    rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)
    idx = tf.range(batch_size)*tf.shape(rnn_outputs)[1] + (seqlen - 1)
    # last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]), idx)
    # last_rnn_output = tf.gather_nd(rnn_outputs, tf.pack([tf.range(batch_size), seqlen-1], axis=1))
    last_rnn_output = tf.gather_nd(rnn_outputs, tf.stack([tf.range(batch_size), seqlen-1], axis=1))

    # Softmax layer
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes]) # weights?
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0)) # bias?
    logits = tf.matmul(last_rnn_output, W) + b
    preds = tf.nn.softmax(logits)
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
        'dropout': keep_prob,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy
    }


def train_graph(g, batch_size = 256, num_epochs = 10, iterator = DataIterator):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tr = iterator(train)
        te = iterator(test)

        step, accuracy = 0, 0
        tr_losses, te_losses = [], []
        current_epoch = 0
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2], g['dropout']: 0.6}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_

            if tr.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0

                #eval test set
                te_epoch = te.epochs
                while te.epochs == te_epoch:
                    step += 1
                    batch = te.next_batch(batch_size)
                    feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2]}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_

                te_losses.append(accuracy / step)
                step, accuracy = 0,0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])

    return tr_losses, te_losses

In [24]:
################################################
# explore our data iterator
################################################

# validate data iterator
d = DataIterator(test).next_batch(3)
print('Input sequences:\n', 
      "{}: \n{}\n".format(type(d[0]), d[0]), 
      "{}: \n{}\n".format(type(d[0][0]), d[0][0]), 
      "{}: \n{}\n".format(type(d[0][0][0]), d[0][0][0]), 
      end='\n\n')
print('Target values\n', 
      "{}: \n{}\n".format(type(d[1]), d[1]), 
      "{}: \n{}\n".format(type(d[1][0]), d[1][0]), 
      end='\n\n')
print('Sequence lengths\n', 
      "{}: \n{}\n".format(type(d[2]), d[2]), 
      "{}: \n{}\n".format(type(d[2][0]), d[2][0]), 
      end='\n\n')

Input sequences:
 <class 'numpy.ndarray'>: 
[[66390 69131 64966 69233 65793 69143 63732 67488 68058 68137 65824 69315
  64480 67910 65208 68601 65901 68737 63126 69413 65431 67197 65555 70369
  63442 68068 63911 67308 67688 69270 65319 69450 65732 68883 64773 67937
  64342 70660 62962 68460 66118 68047 67701 68850 65890 68623 64129 67787
  64520 69453 54661 69881 64148 69142 63278 68227 66913 68666 64407 68610
  85149 71597 67158 69010 62752 69216 61195 69882 66518 68091 66591 68988
  66198 68159 64202 68892 66105 67391 66564 68652 62665 68051 66413 68137
  68748 68884 66601 68067 65115 69194 65265 69465 63262 67922 65482 68022
  65720 67351 64676 68335]
 [67339 68446 65015 68451 66220 68868 65765 68795 68674 67771 64848 67980
  62950 67469 65744 68951 64185 67757 64658 69829 65567 68524 67901 69259
  62854 67333 65437 67360 68206 69978 69344 67992 66833 68117 65248 67550
  64034 68992 63029 69029 66107 67844 65763 68201 66179 68132 65013 67889
  67293 68394 55246 69388 63875 68814 655

In [25]:
################################################
# run it!
################################################

# this fails, just like us
g = build_graph()
tr_losses, te_losses = train_graph(g)

Accuracy after epoch 1  - tr: 0.513097426471 - te: 0.490234375
Accuracy after epoch 2  - tr: 0.63427734375 - te: 0.4921875
Accuracy after epoch 3  - tr: 0.718017578125 - te: 0.46875
Accuracy after epoch 4  - tr: 0.84228515625 - te: 0.5234375
Accuracy after epoch 5  - tr: 0.84912109375 - te: 0.5078125
Accuracy after epoch 6  - tr: 0.88427734375 - te: 0.578125
Accuracy after epoch 7  - tr: 0.84912109375 - te: 0.546875
Accuracy after epoch 8  - tr: 0.74853515625 - te: 0.4921875
Accuracy after epoch 9  - tr: 0.85498046875 - te: 0.5234375
Accuracy after epoch 10  - tr: 0.92041015625 - te: 0.56640625


In [None]:
#TODO: we're overfitting, why?

#TODO: try something better than averaging the word vec values
# maybe we could do three-d arrays?  encode each word and pad the data

#TODO: we strip a lot away (ie punctuation, smilies) and lose other
# data to glove (#hashtags, @handles). how can we keep this?

#TODO: how do we run this on the test dataset?