In [1]:
import numpy as np
import os
import tensorflow as tf
from IPython.display import clear_output, Image, display, HTML
from tensorflow.contrib.layers import variance_scaling_initializer
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib.data import Dataset, Iterator
from sklearn.metrics import f1_score, classification_report
import datetime

###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = graph_def
    #strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
###### Do not modify  here ######

###### Do not modify here ###### 

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)



## Data Preparing

In [2]:
from nltk.tokenize import TweetTokenizer
import pandas as pd

tknzr = TweetTokenizer()

# Load Training data
en_train = pd.read_csv('data/supervised_phase/en_full/en_full.tsv', delimiter='\t', names=["id", "lang", "polarity", "tweet"])
en_train.head()

Unnamed: 0,id,lang,polarity,tweet
0,264183816548130816,en,positive,Gas by my house hit $3.39!!!! I'm going to Cha...
1,263405084770172928,en,negative,"Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,en,negative,"its not that I'm a GSP fan, i just hate Nick D..."
3,264249301910310912,en,negative,Iranian general says Israel's Iron Dome can't ...
4,264105751826538497,en,positive,with J Davlar 11th. Main rivals are team Polan...


In [3]:
# Load Testing data
en_test = pd.read_csv('data/supervised_phase/en_full/en_test.tsv', delimiter='\t', names=["id", "lang", "polarity", "tweet"])
en_test.head()

Unnamed: 0,id,lang,polarity,tweet
0,11378,en,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,11379,en,neutral,Order Go Set a Watchman in store or through ou...
2,11380,en,negative,If these runway renovations at the airport pre...
3,11381,en,neutral,If you could ask an onstage interview question...
4,11382,en,positive,A portion of book sales from our Harper Lee/Go...


In [4]:
en_full = pd.concat([en_train['tweet'],en_test['tweet']])
en_full.head()

0    Gas by my house hit $3.39!!!! I'm going to Cha...
1    Theo Walcott is still shit, watch Rafa and Joh...
2    its not that I'm a GSP fan, i just hate Nick D...
3    Iranian general says Israel's Iron Dome can't ...
4    with J Davlar 11th. Main rivals are team Polan...
Name: tweet, dtype: object

In [5]:
def word2index(tweet):
    """
    Convert a tweet to a sequence of word index 
    """
    return [vocabulary_dict.get(token)[0] for token in tknzr.tokenize(tweet.lower()) if vocabulary_dict.get(token) != None]

def polarity2label(polarity):
    if polarity =='negative':  return 0
    elif polarity =='neutral': return 1
    elif polarity =='positive':return 2 

In [6]:
# en_full['index_rep'] = en_full['tweet'].map(word2index)
en_train['label'] = en_train['polarity'].map(polarity2label)
en_train.head()

Unnamed: 0,id,lang,polarity,tweet,label
0,264183816548130816,en,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,2
1,263405084770172928,en,negative,"Theo Walcott is still shit, watch Rafa and Joh...",0
2,262163168678248449,en,negative,"its not that I'm a GSP fan, i just hate Nick D...",0
3,264249301910310912,en,negative,Iranian general says Israel's Iron Dome can't ...,0
4,264105751826538497,en,positive,with J Davlar 11th. Main rivals are team Polan...,2


In [7]:
en_test['label'] = en_test['polarity'].map(polarity2label)
en_test.head()

Unnamed: 0,id,lang,polarity,tweet,label
0,11378,en,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",1
1,11379,en,neutral,Order Go Set a Watchman in store or through ou...,1
2,11380,en,negative,If these runway renovations at the airport pre...,0
3,11381,en,neutral,If you could ask an onstage interview question...,1
4,11382,en,positive,A portion of book sales from our Harper Lee/Go...,2


In [8]:
from tensorflow.contrib import learn
# Build vocabulary
max_document_length = max([len(tknzr.tokenize(tweet)) for tweet in  en_full])
# max_document_length = 44
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length).fit(en_full)

tweets_train = np.array(list(vocab_processor.transform(en_train['tweet'])))
tweets_test  = np.array(list(vocab_processor.transform(en_test['tweet'])))

In [9]:
print(tweets_train.shape)
print(tweets_test.shape)
tweets_train[1]

(18044, 53)
(20632, 53)


array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 13, 27,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0])

## Prepare Training + Testing Data

In [10]:
# from sklearn.model_selection import ShuffleSplit
# rs = ShuffleSplit(n_splits=1, test_size=.1, random_state=0)

senti_train = en_train['label'].as_matrix()
senti_test  = en_test['label'].as_matrix()

# for train_index, test_index in rs.split(senti):
#     X_train = tweets[train_index]
#     y_train = senti[train_index]

#     X_test = tweets[test_index]
#     y_test = senti[test_index]
# X_train.shape

## Model Construction

In [11]:
import pickle

# Load pre train Word2vec
wb_matrix = np.load("data/embed_tweets_en_200M_200D/embedding_matrix.npy")
print(wb_matrix.shape)
vocabulary_dict = pickle.load(open("data/embed_tweets_en_200M_200D/vocabulary.pickle", "rb"))
print(len(vocabulary_dict))

(1859185, 200)
1859184


In [12]:
vocabulary_dict.get('am')

(134, 2480704)

In [13]:
 # initial matrix with random uniform
initW = np.random.uniform(-0.25,0.25,(len(vocab_processor.vocabulary_), wb_matrix.shape[1]))
# load any vectors from the word2vec
for word, vector in vocabulary_dict.items():
    idx = vocab_processor.vocabulary_.get(word)
    if idx != 0:
        initW[idx] = wb_matrix[vector[0]]
initW.shape

(75465, 200)

In [29]:
n_input = max_document_length
n_output = 3
learning_rate = 1

embedding_size = initW.shape[1]
filter_sizes = [4,3]
num_filters = 200
pooling_size = 4
pooling_strides = 2
epochs_num = 2000
batch_size = 1024

In [30]:
reset_graph()

X = tf.placeholder(tf.int32, shape = (None, n_input), name = "Input_X")
y = tf.placeholder(tf.int32, shape = (None), name = "Y")
# mode = tf.placeholder(tf.bool, name = "Mode")

# Load Embedding Model
with tf.device('/cpu:0'), tf.name_scope("embedding"):
    word2vec = tf.Variable(tf.constant(0.0, shape = initW.shape),
                    trainable=False, name="word2vec") # trainable=False, means not update these embeddings

embedded_chars = tf.nn.embedding_lookup(word2vec, X)
embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) # ex: convert [[1,2]] to [[1],[2]], that is shape (2,) to (2,1)

# 1st convolution layer
conv1 = tf.layers.conv2d(embedded_chars_expanded, 
                         filters = num_filters, 
                         kernel_size = (filter_sizes[0], initW.shape[1]),
                         strides = (1,1), 
                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         activation = tf.nn.relu,
                         name="Convolution_1st"
                        )

pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[4, 1], strides=2)

# 2nd convolution layer
conv2 = tf.layers.conv2d(pool1, 
                         filters = num_filters, 
                         kernel_size = (filter_sizes[1], 1),
                         strides = (1,1), 
                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         activation = tf.nn.relu,
                         name="Convolution_2nd"
                        )

pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[22, 1], strides=1)

# Dense Layer, Combine all the pooled features
pool2_flat = tf.reshape(pool2, [-1, 200])

dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu, name = "Fully_connect")
# dense = tf.layers.dropout(inputs = dense, rate = 0.3, training = mode)

# Logits Layer
logits = tf.layers.dense(inputs=dense, 
                         units=n_output, 
                         activation=tf.nn.softmax, 
                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         name = "Softmax")

# Define Loss Function
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits, name="Cross_Entropy"))

# Define Training Process
train_step = tf.train.AdadeltaOptimizer(learning_rate).minimize(cross_entropy)

# Define Accuracy
predicted_class = tf.argmax(logits,1, output_type=tf.int32)
correct_predict = tf.equal(y, predicted_class) # [True, False ..., True]
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32)) # [True, False ..., True] --> [1,0,...,1]


(?, 22, 1, 200)
(?, 1, 1, 200)
(?, 200)
(?, 1024)


In [31]:
with tf.Session() as sess:
    show_graph(tf.get_default_graph().as_graph_def())
    # Initialize all variables
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    
    sess.run(word2vec.assign(initW))# Assign the pretrain word2vec


In [32]:
# prepare the training batch
train_data = tf.contrib.data.Dataset.from_tensor_slices((tweets_train, senti_train)).batch(batch_size).repeat()
train_iterator = train_data.make_one_shot_iterator() # Create an iterator to go through the training data
train_next_batch = train_iterator.get_next()

In [33]:
round_of_epochs = int(tweets_train.shape[0]/batch_size)

saver = tf.train.Saver() # to store the model

with tf.Session() as sess:
    # Initialize all variables
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    
    sess.run(word2vec.assign(initW))# Assign the pretrain word2vec
    
    for epochs in range(epochs_num): # starting the training process and set the epochs_num
        for _ in range(round_of_epochs):
            train, label = sess.run(train_next_batch) # Get the mini-batch data sample
            sess.run(train_step, feed_dict={X:train, y:label}) # Feed the features, labe, training_mode  to network to train
        
        if epochs % 5 ==0:
            loss, pred = sess.run([cross_entropy,predicted_class], feed_dict={X:train, y:label})
            print("{} - {} epoch, loss:{:.3f}, train f1 score:{:.3f}, test f1 score:{:.3f}\n".format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                epochs+1,
                loss, 
#                 f1_score(label, pred, labels=[0,2], average='weighted'),
                f1_score(label, pred, average='weighted'),
#                 f1_score(y_test, sess.run(predicted_class, feed_dict={X:X_test, y:y_test}), labels=[0,2] , average='weighted'),
                f1_score(senti_test, sess.run(predicted_class, feed_dict={X:tweets_test, y:senti_test}), average='weighted')
                )
            )

    

  'precision', 'predicted', average, warn_for)


2017-11-15 22:50:10 - 1 epoch, loss:1.0257307291030884, train f1 score:0.326, test f1 score:0.174

2017-11-15 22:50:13 - 6 epoch, loss:0.9689231514930725, train f1 score:0.412, test f1 score:0.191

2017-11-15 22:50:16 - 11 epoch, loss:0.9984511137008667, train f1 score:0.434, test f1 score:0.460

2017-11-15 22:50:19 - 16 epoch, loss:0.9568789005279541, train f1 score:0.531, test f1 score:0.527

2017-11-15 22:50:22 - 21 epoch, loss:0.9013761281967163, train f1 score:0.605, test f1 score:0.505

2017-11-15 22:50:25 - 26 epoch, loss:0.966029167175293, train f1 score:0.502, test f1 score:0.513

2017-11-15 22:50:28 - 31 epoch, loss:0.8189324736595154, train f1 score:0.677, test f1 score:0.514

2017-11-15 22:50:31 - 36 epoch, loss:0.8928053379058838, train f1 score:0.612, test f1 score:0.535

2017-11-15 22:50:34 - 41 epoch, loss:0.9093162417411804, train f1 score:0.551, test f1 score:0.355

2017-11-15 22:50:37 - 46 epoch, loss:0.8761085271835327, train f1 score:0.615, test f1 score:0.528

201

2017-11-15 22:54:18 - 411 epoch, loss:0.5743576884269714, train f1 score:0.977, test f1 score:0.545

2017-11-15 22:54:21 - 416 epoch, loss:0.5788005590438843, train f1 score:0.973, test f1 score:0.544

2017-11-15 22:54:24 - 421 epoch, loss:0.5843080878257751, train f1 score:0.966, test f1 score:0.541

2017-11-15 22:54:27 - 426 epoch, loss:0.5777226686477661, train f1 score:0.974, test f1 score:0.530

2017-11-15 22:54:30 - 431 epoch, loss:0.5768163204193115, train f1 score:0.975, test f1 score:0.531

2017-11-15 22:54:33 - 436 epoch, loss:0.5774677991867065, train f1 score:0.974, test f1 score:0.545

2017-11-15 22:54:36 - 441 epoch, loss:0.5727989673614502, train f1 score:0.979, test f1 score:0.543

2017-11-15 22:54:39 - 446 epoch, loss:0.5716313123703003, train f1 score:0.979, test f1 score:0.539

2017-11-15 22:54:42 - 451 epoch, loss:0.5794675350189209, train f1 score:0.972, test f1 score:0.536

2017-11-15 22:54:45 - 456 epoch, loss:0.5706514120101929, train f1 score:0.980, test f1 sco

2017-11-15 22:58:26 - 821 epoch, loss:0.5783333778381348, train f1 score:0.973, test f1 score:0.532

2017-11-15 22:58:29 - 826 epoch, loss:0.5688344836235046, train f1 score:0.982, test f1 score:0.543

2017-11-15 22:58:32 - 831 epoch, loss:0.5754411220550537, train f1 score:0.975, test f1 score:0.546

2017-11-15 22:58:35 - 836 epoch, loss:0.5773730874061584, train f1 score:0.974, test f1 score:0.546

2017-11-15 22:58:38 - 841 epoch, loss:0.5706867575645447, train f1 score:0.980, test f1 score:0.537

2017-11-15 22:58:41 - 846 epoch, loss:0.575846791267395, train f1 score:0.975, test f1 score:0.523

2017-11-15 22:58:44 - 851 epoch, loss:0.5783627033233643, train f1 score:0.973, test f1 score:0.529

2017-11-15 22:58:47 - 856 epoch, loss:0.5658817887306213, train f1 score:0.985, test f1 score:0.536

2017-11-15 22:58:50 - 861 epoch, loss:0.5750337839126587, train f1 score:0.976, test f1 score:0.535

2017-11-15 22:58:53 - 866 epoch, loss:0.5784603953361511, train f1 score:0.973, test f1 scor

2017-11-15 23:02:32 - 1226 epoch, loss:0.5784942507743835, train f1 score:0.973, test f1 score:0.543

2017-11-15 23:02:35 - 1231 epoch, loss:0.5792518258094788, train f1 score:0.972, test f1 score:0.510

2017-11-15 23:02:38 - 1236 epoch, loss:0.5757313966751099, train f1 score:0.975, test f1 score:0.547

2017-11-15 23:02:41 - 1241 epoch, loss:0.5738145112991333, train f1 score:0.978, test f1 score:0.537

2017-11-15 23:02:44 - 1246 epoch, loss:0.5746135711669922, train f1 score:0.977, test f1 score:0.533

2017-11-15 23:02:47 - 1251 epoch, loss:0.5718821883201599, train f1 score:0.980, test f1 score:0.546

2017-11-15 23:02:50 - 1256 epoch, loss:0.5696995258331299, train f1 score:0.981, test f1 score:0.538

2017-11-15 23:02:53 - 1261 epoch, loss:0.5774914622306824, train f1 score:0.974, test f1 score:0.545

2017-11-15 23:02:56 - 1266 epoch, loss:0.5705638527870178, train f1 score:0.980, test f1 score:0.549

2017-11-15 23:02:59 - 1271 epoch, loss:0.5764760971069336, train f1 score:0.975, t

2017-11-15 23:06:37 - 1631 epoch, loss:0.5763995051383972, train f1 score:0.975, test f1 score:0.541

2017-11-15 23:06:40 - 1636 epoch, loss:0.5688577890396118, train f1 score:0.982, test f1 score:0.543

2017-11-15 23:06:43 - 1641 epoch, loss:0.5745015144348145, train f1 score:0.976, test f1 score:0.542

2017-11-15 23:06:46 - 1646 epoch, loss:0.5763840675354004, train f1 score:0.975, test f1 score:0.549

2017-11-15 23:06:49 - 1651 epoch, loss:0.5707613229751587, train f1 score:0.980, test f1 score:0.524

2017-11-15 23:06:52 - 1656 epoch, loss:0.5760618448257446, train f1 score:0.975, test f1 score:0.540

2017-11-15 23:06:55 - 1661 epoch, loss:0.5782226920127869, train f1 score:0.973, test f1 score:0.536

2017-11-15 23:06:58 - 1666 epoch, loss:0.564801037311554, train f1 score:0.986, test f1 score:0.535

2017-11-15 23:07:02 - 1671 epoch, loss:0.5723692774772644, train f1 score:0.979, test f1 score:0.526

2017-11-15 23:07:05 - 1676 epoch, loss:0.5764958262443542, train f1 score:0.975, te