In [1]:
import numpy as np
import os
import tensorflow as tf
from IPython.display import clear_output, Image, display, HTML
from tensorflow.contrib.layers import variance_scaling_initializer
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib.data import Dataset, Iterator
from sklearn.metrics import f1_score, classification_report
import datetime

###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = graph_def
    #strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
###### Do not modify  here ######

###### Do not modify here ###### 

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)



## Data Preparing

In [2]:
from nltk.tokenize import TweetTokenizer
import pandas as pd

tknzr = TweetTokenizer()

# Load data
en_full = pd.read_csv('data/supervised_phase/en_full/en_full.tsv.txt', delimiter='\t', names=["id", "lang", "polarity", "tweet"])
en_full.head()

Unnamed: 0,id,lang,polarity,tweet
0,264183816548130816,en,positive,Gas by my house hit $3.39!!!! I'm going to Cha...
1,263405084770172928,en,negative,"Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,en,negative,"its not that I'm a GSP fan, i just hate Nick D..."
3,264249301910310912,en,negative,Iranian general says Israel's Iron Dome can't ...
4,264105751826538497,en,positive,with J Davlar 11th. Main rivals are team Polan...


In [3]:
def word2index(tweet):
    """
    Convert a tweet to a sequence of word index 
    """
    return [vocabulary_dict.get(token)[0] for token in tknzr.tokenize(tweet.lower()) if vocabulary_dict.get(token) != None]

def polarity2label(polarity):
    if polarity =='negative':  return 0
    elif polarity =='neutral': return 1
    elif polarity =='positive':return 2 

In [4]:
# en_full['index_rep'] = en_full['tweet'].map(word2index)
en_full['label'] = en_full['polarity'].map(polarity2label)
en_full.head()

Unnamed: 0,id,lang,polarity,tweet,label
0,264183816548130816,en,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,2
1,263405084770172928,en,negative,"Theo Walcott is still shit, watch Rafa and Joh...",0
2,262163168678248449,en,negative,"its not that I'm a GSP fan, i just hate Nick D...",0
3,264249301910310912,en,negative,Iranian general says Israel's Iron Dome can't ...,0
4,264105751826538497,en,positive,with J Davlar 11th. Main rivals are team Polan...,2


In [5]:
from tensorflow.contrib import learn
# Build vocabulary
max_document_length = max([len(tknzr.tokenize(tweet)) for tweet in  en_full['tweet']])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
tweets = np.array(list(vocab_processor.fit_transform(en_full['tweet'])))

## Prepare Training + Testing Data

In [6]:
from sklearn.model_selection import ShuffleSplit
rs = ShuffleSplit(n_splits=1, test_size=.1, random_state=0)

senti = en_full['label'].as_matrix()

for train_index, test_index in rs.split(senti):
    X_train = tweets[train_index]
    y_train = senti[train_index]

    X_test = tweets[test_index]
    y_test = senti[test_index]
X_train.shape

(16239, 42)

## Model Construction

In [7]:
import pickle

# Load pre train Word2vec
wb_matrix = np.load("data/embed_tweets_en_200M_200D/embedding_matrix.npy")
print(wb_matrix.shape)
vocabulary_dict = pickle.load(open("data/embed_tweets_en_200M_200D/vocabulary.pickle", "rb"))
print(len(vocabulary_dict))

(1859185, 200)
1859184


In [8]:
vocabulary_dict.get('am')

(134, 2480704)

In [9]:
 # initial matrix with random uniform
initW = np.random.uniform(-0.25,0.25,(len(vocab_processor.vocabulary_), wb_matrix.shape[1]))
# load any vectors from the word2vec
for word, vector in vocabulary_dict.items():
    idx = vocab_processor.vocabulary_.get(word)
    if idx != 0:
        initW[idx] = wb_matrix[vector[0]]
   

In [10]:
n_input = max_document_length
n_output = 3
learning_rate = 0.0017

embedding_size = initW.shape[1]
filter_sizes = [4,3]
num_filters = 200
pooling_size = 4
pooling_strides = 2
epochs_num = 2000
batch_size = 32

In [11]:
reset_graph()

X = tf.placeholder(tf.int32, shape = (None, n_input), name = "Input_X")
y = tf.placeholder(tf.int32, shape = (None), name = "Y")
# mode = tf.placeholder(tf.bool, name = "Mode")

# Load Embedding Model
with tf.name_scope("embedding"):
    word2vec = tf.Variable(tf.constant(0.0, shape = initW.shape),
                    trainable=False, name="word2vec") # trainable=False, means not update these embeddings

embedded_chars = tf.nn.embedding_lookup(word2vec, X)
embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) # ex: convert [[1,2]] to [[1],[2]], that is shape (2,) to (2,1)

# 1st convolution layer
conv1 = tf.layers.conv2d(embedded_chars_expanded, 
                         filters = num_filters, 
                         kernel_size = (filter_sizes[0], initW.shape[1]),
                         strides = (1,1), 
                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         activation = tf.nn.relu,
                         name="Convolution_1st"
                        )

pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[4, 1], strides=2)

# 2nd convolution layer
conv2 = tf.layers.conv2d(pool1, 
                         filters = num_filters, 
                         kernel_size = (filter_sizes[1], 1),
                         strides = (1,1), 
                         kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                         activation = tf.nn.relu,
                         name="Convolution_2nd"
                        )

pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[4, 1], strides=2)

# Dense Layer, Combine all the pooled features
pool2_flat = tf.reshape(pool2, [-1, 7*200])

dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu, name = "Fully_connect")
# dense = tf.layers.dropout(inputs = dense, rate = 0.3, training = mode)

# Logits Layer
logits = tf.layers.dense(inputs=dense, units=n_output, activation=tf.nn.softmax, name = "Softmax")

# Define Loss Function
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits, name="Cross_Entropy"))

# Define Training Process
train_step = tf.train.AdadeltaOptimizer(0.0017).minimize(cross_entropy)

# Define Accuracy
predicted_class = tf.argmax(logits,1, output_type=tf.int32)
correct_predict = tf.equal(y, predicted_class) # [True, False ..., True]
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32)) # [True, False ..., True] --> [1,0,...,1]


In [12]:
# with tf.Session() as sess:
#     show_graph(tf.get_default_graph().as_graph_def())
#     # Initialize all variables
#     sess.run(tf.local_variables_initializer())
#     sess.run(tf.global_variables_initializer())
    
#     sess.run(word2vec.assign(initW))# Assign the pretrain word2vec


In [13]:
# prepare the training batch
train_data = tf.contrib.data.Dataset.from_tensor_slices((X_train,y_train)).batch(batch_size).repeat()
train_iterator = train_data.make_one_shot_iterator() # Create an iterator to go through the training data
train_next_batch = train_iterator.get_next()

In [None]:
round_of_epochs = int(X_train.shape[0]/batch_size)

saver = tf.train.Saver() # to store the model

with tf.Session() as sess:
    # Initialize all variables
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    
    sess.run(word2vec.assign(initW))# Assign the pretrain word2vec
    
    for epochs in range(epochs_num): # starting the training process and set the epochs_num
        for _ in range(round_of_epochs):
            train, label = sess.run(train_next_batch) # Get the mini-batch data sample
            sess.run(train_step, feed_dict={X:train, y:label}) # Feed the features, labe, training_mode  to network to train
        
        if epochs % 25 ==0:
            loss, pred = sess.run([cross_entropy,predicted_class], feed_dict={X:train, y:label})
            print("{} - {} epoch, loss:{}, train f1 score:{:.3f}, test f1 score:{:.3f}\n".format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                epochs+1,
                loss, 
                f1_score(label, pred, average='macro'),
                f1_score(y_test, sess.run(predicted_class, feed_dict={X:X_test, y:y_test}), average='macro')
                )
            )

    

  'precision', 'predicted', average, warn_for)


2017-11-14 23:28:56 - 1 epoch, train f1 score:0.231, test f1 score:0.221

2017-11-14 23:29:31 - 26 epoch, train f1 score:0.391, test f1 score:0.349

2017-11-14 23:30:07 - 51 epoch, train f1 score:0.385, test f1 score:0.359

2017-11-14 23:30:44 - 76 epoch, train f1 score:0.370, test f1 score:0.369

2017-11-14 23:31:20 - 101 epoch, train f1 score:0.443, test f1 score:0.375

2017-11-14 23:31:55 - 126 epoch, train f1 score:0.415, test f1 score:0.384

2017-11-14 23:32:31 - 151 epoch, train f1 score:0.395, test f1 score:0.385

2017-11-14 23:33:07 - 176 epoch, train f1 score:0.411, test f1 score:0.390

