In [1]:
import numpy as np
import os
import tensorflow as tf
from IPython.display import clear_output, Image, display, HTML
from tensorflow.contrib.layers import variance_scaling_initializer
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib.data import Dataset, Iterator
from sklearn.metrics import f1_score, classification_report
import datetime
import matplotlib.pyplot as plt
###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = graph_def
    #strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
###### Do not modify  here ######

###### Do not modify here ###### 

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)



## Data Preparing

In [26]:
from nltk.tokenize import TweetTokenizer
import pandas as pd

tknzr = TweetTokenizer()

def filter_tweet(tweet, token = tknzr):
    tweet = re.sub('https?:\/\/\S+','URLTOK',tweet.lower().strip()) # url
    tweet = re.sub('@(?:[a-zA-Z]+|[0-9 \/]+)', 'USRTOK', tweet) # mention
    tweet = re.sub('(\:|\=)(?:\)|\-|\(|D| )+', '', tweet) # emoticon
    return token.tokenize(tweet) # return tokenized

# transform the tweet sentence to numerical representation
def word_transform(tweet_set, max_length = 60):
    set_array = []
    for tweet in tweet_set:
        tweet_array = [0] * max_length
        for i, word in enumerate(tweet):
            if vocabulary_dict.get(word):
                tweet_array[i] = vocabulary_dict[word]
            else:
                tweet_array[i] = 0
        set_array.append(tweet_array)
    return np.array(set_array)

In [None]:
# Load Training data
en_train = pd.read_csv('data/supervised_phase/en_full/en_full.tsv', delimiter='\t', names=["id", "lang", "polarity", "tweet"])
en_train.head()

In [3]:
# Load Testing data
en_test = pd.read_csv('data/supervised_phase/en_full/en_test.tsv', delimiter='\t', names=["id", "lang", "polarity", "tweet"])
en_test.head()

Unnamed: 0,id,lang,polarity,tweet
0,11378,en,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,11379,en,neutral,Order Go Set a Watchman in store or through ou...
2,11380,en,negative,If these runway renovations at the airport pre...
3,11381,en,neutral,If you could ask an onstage interview question...
4,11382,en,positive,A portion of book sales from our Harper Lee/Go...


In [4]:
en_full = pd.concat([en_train['tweet'],en_test['tweet']])
en_full.head()

0    Gas by my house hit $3.39!!!! I'm going to Cha...
1    Theo Walcott is still shit, watch Rafa and Joh...
2    its not that I'm a GSP fan, i just hate Nick D...
3    Iranian general says Israel's Iron Dome can't ...
4    with J Davlar 11th. Main rivals are team Polan...
Name: tweet, dtype: object

## Convert to index sentences

In [38]:
def sentence_token(tweet):
    """
    Convert a tweet to a sequence of word index 
    """
    return [token for token in tknzr.tokenize(tweet.lower())]

def polarity2label(polarity):
    if polarity =='negative':  return 0
    elif polarity =='neutral': return 1
    elif polarity =='positive':return 2 

In [40]:
train_token = en_train['tweet'].map(sentence_token)
test_token = en_test['tweet'].map(sentence_token)

In [46]:
from collections import defaultdict
vocabulary_dict = {}
word_index = 0

for tweet in list(test_token) + list(train_token):
    for word in tweet:
        if word not in vocabulary_dict:
            vocabulary_dict[word] = word_index
            word_index +=1
len(vocabulary_dict)

66386

In [6]:
# en_full['index_rep'] = en_full['tweet'].map(word2index)
en_train['label'] = en_train['polarity'].map(polarity2label)
en_train.head()

Unnamed: 0,id,lang,polarity,tweet,label
0,264183816548130816,en,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,2
1,263405084770172928,en,negative,"Theo Walcott is still shit, watch Rafa and Joh...",0
2,262163168678248449,en,negative,"its not that I'm a GSP fan, i just hate Nick D...",0
3,264249301910310912,en,negative,Iranian general says Israel's Iron Dome can't ...,0
4,264105751826538497,en,positive,with J Davlar 11th. Main rivals are team Polan...,2


In [7]:
en_test['label'] = en_test['polarity'].map(polarity2label)
en_test.head()

Unnamed: 0,id,lang,polarity,tweet,label
0,11378,en,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",1
1,11379,en,neutral,Order Go Set a Watchman in store or through ou...,1
2,11380,en,negative,If these runway renovations at the airport pre...,0
3,11381,en,neutral,If you could ask an onstage interview question...,1
4,11382,en,positive,A portion of book sales from our Harper Lee/Go...,2


In [8]:
from tensorflow.contrib import learn
# Build vocabulary
max_document_length = max([len(tknzr.tokenize(tweet)) for tweet in  en_full])
# max_document_length = 44
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length).fit(en_full)

tweets_train = np.array(list(vocab_processor.transform(en_train['tweet'])))
tweets_test  = np.array(list(vocab_processor.transform(en_test['tweet'])))

In [9]:
print(tweets_train.shape)
print(tweets_test.shape)
tweets_train[1]

(18044, 53)
(20632, 53)


array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 13, 27,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0])

In [48]:
max_length = 60
tweets_train = word_transform(train_token, max_length = max_length)
print(tweets_train.shape)
tweets_test = word_transform(test_token, max_length = max_length)
print(tweets_test.shape)

(18044, 60)
(20632, 60)


In [49]:
print(tweets_train.shape)
print(tweets_test.shape)
tweets_train[1]

(18044, 60)
(20632, 60)


array([38816, 27262,    14,   742,  3757,     1,    19, 27737,    39,
        1468,  1153,   117,   477,    21,   488,    18,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0])

## Prepare Training + Testing Data

In [10]:
# from sklearn.model_selection import ShuffleSplit
# rs = ShuffleSplit(n_splits=1, test_size=.1, random_state=0)

senti_train = en_train['label'].as_matrix()
senti_test  = en_test['label'].as_matrix()

# for train_index, test_index in rs.split(senti):
#     X_train = tweets[train_index]
#     y_train = senti[train_index]

#     X_test = tweets[test_index]
#     y_test = senti[test_index]
# X_train.shape

## Load Pretrain Word2Vec Model (#DIM 52)

In [50]:
import pickle

# Load pre train Word2vec
wb_matrix = np.load("data/embedding/en_word2vec_52.npy")
print(wb_matrix.shape)
vocabulary_dict_ = pickle.load(open("data/embedding/vocabulary_dict_52.pickle", "rb"))
print(len(vocabulary_dict_))

(9770612, 52)
9770611


In [52]:
print(vocabulary_dict.get('hi'))
print(vocabulary_dict_.get('hi'))

1924
246


In [53]:
 # initial matrix with random uniform
initW = np.random.uniform(-0.25,0.25,(len(vocabulary_dict), wb_matrix.shape[1]))
# load any vectors from the word2vec
# for word, vector in vocabulary_dict.items():
#     idx = vocab_processor.vocabulary_.get(word)
#     if idx != 0:
#         initW[idx] = wb_matrix[vector]
# initW.shape
for word, index in vocabulary_dict.items():
#     print(word)
#     print(index)
    idx = vocabulary_dict_.get(word)
#     print(idx)
    
    if idx != 0 and idx is not None:
        initW[index] = wb_matrix[idx]
        
initW.shape

(66386, 52)

In [14]:
del wb_matrix, vocabulary_dict

# Supervised Learning

In [59]:
n_input = max_length
n_output = 3
learning_rate = 1

embedding_size = initW.shape[1]
filter_sizes = [4,3]
num_filters = 200
pooling_size = 4
pooling_strides = 2
epochs_num = 25
batch_size = 64

In [81]:
reset_graph()

X = tf.placeholder(tf.int32, shape = (None, n_input), name = "Input_X")
y = tf.placeholder(tf.int32, shape = (None), name = "Y")
# mode = tf.placeholder(tf.bool, name = "Mode")

# Load Embedding Model
with tf.device('/cpu:0'), tf.name_scope("embedding"):
    word2vec = tf.Variable(tf.constant(0.0, shape = initW.shape),
                    trainable=False, name="word2vec") # trainable=False, means not update these embeddings

embedded_chars = tf.nn.embedding_lookup(word2vec, X)
embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) # ex: convert [[1,2]] to [[1],[2]], that is shape (2,) to (2,1)

# 1st convolution layer
conv1 = tf.layers.conv2d(embedded_chars_expanded, 
                         filters = num_filters, 
                         kernel_size = (filter_sizes[0], initW.shape[1]),
                         strides = (1,1), 
#                          padding="same",
                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         
                         activation = tf.nn.relu,
                         name="Convolution_1st"
                        )

pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[4, 1], strides=2)

# 2nd convolution layer
conv2 = tf.layers.conv2d(pool1, 
                         filters = num_filters, 
                         kernel_size = (filter_sizes[1], 1),
                         strides = (1,1), 
                         padding="same",
                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         activation = tf.nn.relu,
                         name="Convolution_2nd"
                        )
print(conv2.shape)
pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[27, 1], strides=1)

# Dense Layer, Combine all the pooled features
pool2_flat = tf.reshape(pool2, [-1, num_filters])

dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu, name = "Fully_connect")
# dense = tf.layers.dropout(inputs = dense, rate = 0.3, training = mode)

# Logits Layer
logits = tf.layers.dense(inputs=dense, 
                         units=n_output, 
                         activation=tf.nn.softmax, 
#                          kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         name = "Softmax")

# Define Loss Function
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits, name="Cross_Entropy"))

# Define Training Process
train_step = tf.train.AdadeltaOptimizer(learning_rate, epsilon=1e-6).minimize(cross_entropy)

# Define Accuracy
predicted_class = tf.argmax(logits,1, output_type=tf.int32)
correct_predict = tf.equal(y, predicted_class) # [True, False ..., True]
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32)) # [True, False ..., True] --> [1,0,...,1]


(?, 27, 1, 200)


In [82]:
# with tf.Session() as sess:
#     show_graph(tf.get_default_graph().as_graph_def())
#     # Initialize all variables
#     sess.run(tf.local_variables_initializer())
#     sess.run(tf.global_variables_initializer())
    
#     sess.run(word2vec.assign(initW))# Assign the pretrain word2vec


In [83]:
# prepare the training batch
train_data = tf.contrib.data.Dataset.from_tensor_slices((tweets_train, senti_train)).batch(batch_size).repeat()
train_iterator = train_data.make_one_shot_iterator() # Create an iterator to go through the training data
train_next_batch = train_iterator.get_next()

In [None]:
round_of_epochs = int(tweets_train.shape[0]/batch_size)

train_loss = []
test_loss  = []

train_f1 = []
test_f1 = []
x_axis = np.arange(0., epochs_num, 1)
saver = tf.train.Saver() # to store the model

with tf.Session() as sess:
    # Initialize all variables
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    
    sess.run(word2vec.assign(initW))# Assign the pretrain word2vec
    
    for epochs in range(epochs_num): # starting the training process and set the epochs_num
        for _ in range(round_of_epochs):
#             print('_')
            train, label = sess.run(train_next_batch) # Get the mini-batch data sample
            sess.run(train_step, feed_dict={X:train, y:label}) # Feed the features, labe, training_mode  to network to train
#         print(epochs)
#         if epochs % 5 ==0:
        loss, pred, acc = sess.run([cross_entropy,predicted_class,accuracy], feed_dict={X:train, y:label})
        t_loss, t_pred = sess.run([cross_entropy,predicted_class], feed_dict={X:tweets_test, y:senti_test})
        train_loss.append(loss)
        test_loss.append(t_loss)
        train_f1.append(f1_score(label, pred, average='weighted'))
        test_f1.append(f1_score(senti_test, t_pred, average='weighted'))
#         if epochs % 10 ==0:
        print("{} - {:4d} epoch, loss:{:.3f}, train accuracy:{:.3f}, train f1 score:{:.3f}, test f1 score:{:.3f}".format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            epochs+1,
            loss, 
            acc,
            f1_score(label, pred, average='weighted'),
            f1_score(senti_test, t_pred, average='weighted')
            )
        )
    plt.rcParams['font.size'] = 14
    plt.plot(x_axis, train_loss, 'r', x_axis, test_loss, 'b')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.xlabel('Epochs')
    plt.ylabel('Loss(cross entropy)')
    plt.show()
    plt.clf()
    plt.plot(x_axis, train_f1, x_axis, test_f1)
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.xlabel('Epochs')
    plt.ylabel('Loss(cross entropy)')
    plt.show()

    

  'precision', 'predicted', average, warn_for)


2017-11-24 17:19:48 -    1 epoch, loss:0.874, train accuracy:0.672, train f1 score:0.540, test f1 score:0.193
2017-11-24 17:19:49 -    2 epoch, loss:0.964, train accuracy:0.562, train f1 score:0.483, test f1 score:0.383
2017-11-24 17:19:50 -    3 epoch, loss:0.971, train accuracy:0.547, train f1 score:0.496, test f1 score:0.405
2017-11-24 17:19:51 -    4 epoch, loss:0.838, train accuracy:0.719, train f1 score:0.647, test f1 score:0.451
2017-11-24 17:19:51 -    5 epoch, loss:0.806, train accuracy:0.734, train f1 score:0.636, test f1 score:0.469
2017-11-24 17:19:52 -    6 epoch, loss:0.979, train accuracy:0.562, train f1 score:0.458, test f1 score:0.471
2017-11-24 17:19:53 -    7 epoch, loss:0.768, train accuracy:0.812, train f1 score:0.777, test f1 score:0.538
2017-11-24 17:19:54 -    8 epoch, loss:1.028, train accuracy:0.531, train f1 score:0.449, test f1 score:0.511
2017-11-24 17:19:55 -    9 epoch, loss:0.892, train accuracy:0.672, train f1 score:0.640, test f1 score:0.518
2017-11-24