# Define Routine to Clean Data

In [1]:
import traceback
import re, string
def clean_str(string):
  """
  Turn raw text into a list of lowercase word tokens before vectorization.

  Steps: remove lines matching the URL pattern, replace every character that
  is not an ASCII letter with a space, lowercase, split on whitespace, and
  drop single-character tokens.

  Args:
    string: Raw text. (The parameter name shadows the stdlib ``string``
      module inside this function; it is kept so existing callers that pass
      it by keyword keep working.)

  Returns:
    list[str]: The tokens. An empty list is returned when the input cannot
    be processed (for example a non-string value such as None or NaN), so the
    return type is always a list.
  """
  try:
    # NOTE(review): the literal "<>" after "//" means this only matches lines
    # that start with "http://<>" / "https://<>". It looks like a mangled
    # URL-stripping pattern -- confirm the intended regex before changing it.
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)
    words = string.strip().lower().split()
    return [w for w in words if len(w) > 1]
  except Exception:
    # Best effort: report the failure and keep the pipeline running.
    # print_exc() writes the traceback itself and returns None, so it must
    # not be wrapped in print().
    traceback.print_exc()
    return []

# Load, Clean and Split Data into Training and Test Sets

In [3]:
# Read the labelled reviews (tab-separated, header on the first row,
# quoting=3 disables quote handling) and tokenize every review.
import pandas as pd
df = pd.read_csv('labeledTrainData.tsv', sep="\t", header=0, quoting=3)
df['cleanReview'] = df['review'].apply(clean_str)


# Hold out 10% of the rows for testing; the fixed random_state keeps the
# split reproducible.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Renumber both frames from 0; the previous index is kept as an 'index' column.
train_df = train_df.reset_index()
test_df = test_df.reset_index()

# Load Gensim Word2vec model

In [4]:
import gensim
import logging

# Show timestamped INFO-level log messages (gensim reports its loading steps).
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

# Load the Word2Vec model previously saved at this path.
model = gensim.models.Word2Vec.load('/tmp/w2v-movie-review')

# Second dimension of the embedding matrix = number of features per word.
embedding_size = model.wv.syn0.shape[1]
# Last expression of the cell: display the embedding matrix shape.
model.wv.syn0.shape

Using TensorFlow backend.
2017-08-29 10:48:20,304 : INFO : loading Word2Vec object from /tmp/w2v-movie-review
2017-08-29 10:48:21,093 : INFO : loading wv recursively from /tmp/w2v-movie-review.wv.* with mmap=None
2017-08-29 10:48:21,093 : INFO : setting ignored attribute syn0norm to None
2017-08-29 10:48:21,093 : INFO : setting ignored attribute cum_table to None
2017-08-29 10:48:21,093 : INFO : loaded /tmp/w2v-movie-review


(28296, 300)

# Document to Matrix Routine

In [5]:
import numpy as np

def doc2Matrix(df, start, end, n_words, word_vectors=None, vector_size=None):
    """
    Embed rows ``start`` (inclusive) to ``end`` (exclusive) of ``df`` as a
    dense batch of word vectors.

    Each document is truncated or padded to exactly ``n_words`` positions.
    Words missing from the vocabulary and padding positions are filled with
    uniform random noise in [-0.25, 0.25).

    Args:
        df: DataFrame with a 'cleanReview' column holding a sequence of
            tokens per row.
        start: Position of the first row to embed.
        end: Position one past the last row to embed.
        n_words: Number of word slots per document.
        word_vectors: Mapping ``word -> vector`` that raises KeyError for
            unknown words. Defaults to the notebook-level ``model.wv``.
        vector_size: Length of each word vector. Defaults to the
            notebook-level ``embedding_size``.

    Returns:
        numpy.ndarray of shape ``(end - start, n_words, vector_size)``.
    """
    if word_vectors is None:
        word_vectors = model.wv
    if vector_size is None:
        vector_size = embedding_size

    reviews = df['cleanReview']
    batch = np.empty(shape=(end - start, n_words, vector_size))
    # `row` indexes the output batch and `doc_pos` indexes the source frame.
    # The original code had these two offsets swapped, so it only worked
    # when start == 0.
    for row, doc_pos in enumerate(range(start, end)):
        # Positional lookup, so the frame does not need a 0-based index.
        words_in_doc = reviews.iloc[doc_pos]
        for j in range(n_words):
            if j < len(words_in_doc):
                try:
                    batch[row, j] = word_vectors[words_in_doc[j]]
                except KeyError:
                    # Unknown word
                    batch[row, j] = np.random.uniform(-0.25, 0.25, vector_size)
            else:
                # Padding
                batch[row, j] = np.random.uniform(-0.25, 0.25, vector_size)
    return batch

In [6]:
# Smoke test: embed the first 100 training reviews, 200 word slots each,
# and display the resulting array shape.
xx = doc2Matrix(train_df, start=0, end=100, n_words=200)
xx.shape

(100, 200, 300)

# Define Model Parameters

In [7]:
# Where TensorBoard events and the saved model are written
logs_path = '/tmp/session9/cnn-nlp-parallel/v1'
save_path = '{}/'.format(logs_path)

# Learning-rate bounds (alternative values noted by the author: 0.03, 0.0001, 0.0002)
max_learning_rate = 0.003
min_learning_rate = 0.0001

# Data / training dimensions
n_classes = 2          # number of output classes
batch_size = 100       # documents per training batch
n_words = 200          # word slots per document
training_epochs = 2    # passes over the training set

# Values fed to the `pkeep` dropout placeholder
train_pkeep = 0.75
test_pkeep = 1.0

# Build TensorFlow Graph

In [8]:
# Output depth (number of filters) of each of the three convolutional layers.
K, L, M = 24, 24, 24  # first, second and third convolutional layer
# Size reserved for a fully connected layer; not referenced by the cells
# visible in this notebook section.
N = 200

# Define Inputs

In [9]:
import tensorflow as tf
# Clear the default graph before the graph-building cells below add to it.
tf.reset_default_graph()

# Define Inputs
with tf.name_scope('input'):
    # Batch of embedded documents; None lets the batch size vary.
    x = tf.placeholder(tf.float32, shape=[None, n_words, embedding_size], name="x-input")
    # One integer class label per document.
    y_ = tf.placeholder(tf.int32, shape=[None], name="y-input")

    # Value passed to tf.nn.dropout in the output layer (fed from
    # train_pkeep / test_pkeep).
    pkeep = tf.placeholder(tf.float32)
    # Add a trailing channel axis of size 1 so the input is 4-D for conv2d.
    X1 = tf.reshape(x, [-1, n_words, embedding_size, 1])
    # One-hot labels. The depth follows n_classes so it always matches the
    # width of the output layer (it was previously hard-coded to 2).
    y_one_hot = tf.one_hot(indices=y_, depth=n_classes)

# Convolution Layer 1

In [10]:
# First parallel branch: convolve over windows of 3 consecutive words, then
# max-pool over all window positions.
with tf.name_scope("Conv1"):    
    # Filter: 3 words high, embedding_size wide, 1 input channel, K output channels
    W1 = tf.Variable(tf.truncated_normal([3, embedding_size, 1, K], stddev=0.1))
    # Bias initialised to 0.1 for every output channel
    b1 = tf.Variable(tf.ones([K])/10)
    stride = 1
    # X1 is the input reshaped to [-1, n_words, embedding_size, 1]. With VALID
    # padding the filter spans the full embedding width, leaving
    # n_words-3+1 window positions along the word axis.
    Y1C = tf.nn.conv2d(X1, W1, strides=[1, stride, stride, 1], padding='VALID')
    Y1 = tf.nn.relu(Y1C + b1)
    # The pooling window covers every position, so one maximum per channel
    # remains.
    pool1 = tf.nn.max_pool(Y1,
                          ksize=[1,n_words-3+1,1,1],
                          strides=[1,1,1,1],
                          padding='VALID',
                          name='pool1')

In [11]:
pool1  # Display the tensor to check its static shape

<tf.Tensor 'Conv1/pool1:0' shape=(?, 1, 1, 24) dtype=float32>

# Convolution Layer 2

In [12]:
# Second parallel branch: convolve over windows of 4 consecutive words, then
# max-pool over all window positions.
with tf.name_scope("Conv2"):
    # Filter: 4 words high, embedding_size wide, 1 input channel, L output channels
    W2 = tf.Variable(tf.truncated_normal([4, embedding_size, 1, L], stddev=0.1))
    # Bias initialised to 0.1 for every output channel
    b2 = tf.Variable(tf.ones([L])/10)
    stride = 1
    # This branch convolves the input X1 directly (in parallel with Conv1);
    # it is not stacked on Conv1's output Y1.
    Y2C = tf.nn.conv2d(X1, W2, strides=[1, stride, stride, 1], padding='VALID')
    Y2 = tf.nn.relu(Y2C + b2)
    
    # The pooling window covers all n_words-4+1 positions, so one maximum per
    # channel remains.
    pool2 = tf.nn.max_pool(Y2,
                          ksize=[1,n_words-4+1,1,1],
                          strides=[1,1,1,1],
                          padding='VALID',
                          name='pool2')

In [13]:
pool2  # Display the tensor to check its static shape

<tf.Tensor 'Conv2/pool2:0' shape=(?, 1, 1, 24) dtype=float32>

# Convolution Layer 3

In [14]:
# Third parallel branch: convolve over windows of 5 consecutive words, then
# max-pool over all window positions.
with tf.name_scope("Conv3"):
    # Filter: 5 words high, embedding_size wide, 1 input channel, M output channels
    W3 = tf.Variable(tf.truncated_normal([5, embedding_size, 1, M], stddev=0.1))
    # Bias initialised to 0.1 for every output channel
    b3 = tf.Variable(tf.ones([M])/10)
    stride = 1
    # Like the other branches, this one reads the input X1 directly.
    Y3C = tf.nn.conv2d(X1, W3, strides=[1, stride, stride, 1], padding='VALID')
    Y3 = tf.nn.relu(Y3C + b3)
    # The pooling window covers all n_words-5+1 positions, so one maximum per
    # channel remains.
    pool3 = tf.nn.max_pool(Y3,
                          ksize=[1,n_words-5+1,1,1],
                          strides=[1,1,1,1],
                          padding='VALID',
                          name='pool3')

In [15]:
pool3  # Display the tensor to check its static shape

<tf.Tensor 'Conv3/pool3:0' shape=(?, 1, 1, 24) dtype=float32>

# Output Layer

In [16]:
# Merge the three pooled branches, apply dropout and project to class scores.
with tf.name_scope("Output"):
    # Stack the three pooled tensors along axis 1 ...
    pool = tf.concat([pool1,pool2,pool3],1)  
    # ... and flatten them to one feature vector of length K+L+M per document.
    YY = tf.reshape(pool, shape=[-1, K+L+M])
    # Dropout controlled by the pkeep placeholder
    Y4de = tf.nn.dropout(YY,pkeep)
    # Linear layer from the K+L+M features to n_classes logits
    W = tf.Variable(tf.truncated_normal([K+L+M, n_classes] ,stddev=0.1))
    b = tf.Variable(tf.zeros([n_classes]))
    Ylogits = tf.matmul(Y4de, W) + b
    # y is our prediction: softmax over the logits
    y = tf.nn.softmax(Ylogits)

# Define Loss, Optimization, Accuracy and Logging Information

In [17]:
# Specify the cost function
with tf.name_scope('Loss'):    
    # Labels must be the one-hot tensor (y_one_hot), not the raw integer
    # labels y_.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=y_one_hot)
    # Mean over the batch, scaled by 100 (the reason for the factor is not
    # stated in this notebook).
    cross_entropy = tf.reduce_mean(cross_entropy)*100

In [18]:
# Specify the optimizer
with tf.name_scope('train'):
    # The learning rate is a placeholder so a new value can be fed on every
    # training step.
    learn_rate = tf.placeholder(tf.float32)
    # train_op is an "operation" which we can execute in a session; it applies
    # one Adam update that minimizes cross_entropy.
    # (Plain gradient descent was the alternative considered:
    #  tf.train.GradientDescentOptimizer(learn_rate).minimize(cross_entropy))
    train_op = tf.train.AdamOptimizer(learn_rate).minimize(cross_entropy)

In [19]:
# Accuracy: fraction of documents whose predicted class equals the label.
with tf.name_scope('Accuracy'):
    # Predicted class = index of the largest softmax output
    prediction = tf.argmax(y,1,name="Predict")
    # Compare with the label class recovered from the one-hot tensor
    # (y_ itself is 1-D, so argmax over axis 1 must use y_one_hot).
    correct_prediction = tf.equal(prediction, tf.argmax(y_one_hot,1))
    # Mean of the 0/1 matches
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),name="accuracy")

In [20]:
# Create scalar summaries for the learning rate, cost and accuracy.
# The training_* and test_* summaries read the same tensors under different
# tags; which tag is written depends on which summary op is run.
training_learn_rate = tf.summary.scalar("learning_rate", learn_rate)
training_loss = tf.summary.scalar("training_loss", cross_entropy)
training_accuracy = tf.summary.scalar("training_accuracy", accuracy)
test_loss = tf.summary.scalar("test_loss", cross_entropy)
test_accuracy = tf.summary.scalar("test_accuracy", accuracy)
# Create a Saver, used later to write the model checkpoint
saver = tf.train.Saver()

# Executing Graph

In [21]:
import math

# Embed the whole held-out set once; it is reused for every evaluation.
testX = doc2Matrix(test_df, 0, test_df.shape[0], n_words)
testY = test_df['sentiment']

# Loop invariants: number of full batches per epoch and the total number of
# training steps used by the learning-rate decay.
batch_count = int(train_df.shape[0]/batch_size)
total_decay_steps = training_epochs * batch_count

with tf.Session() as sess:
    # variables need to be initialized before we can use them
    sess.run(tf.global_variables_initializer())

    # create log writer object
    writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
    try:
        # perform training cycles
        for epoch in range(training_epochs):
            # Defined up front so the evaluation below has a valid step even
            # if there are no full batches.
            current_step = epoch * batch_count

            for i in range(batch_count):
                # Take the i-th batch by position and renumber it from 0, as
                # doc2Matrix is called with start=0.
                batch_start = i * batch_size
                batch_df = train_df.iloc[batch_start:batch_start + batch_size].reset_index(drop=True)
                batch_x = doc2Matrix(batch_df, 0, batch_size, n_words)
                batch_y = batch_df['sentiment']

                # Exponential decay from max_learning_rate towards
                # min_learning_rate over the whole run.
                current_step = epoch * batch_count + i
                lr = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-current_step / total_decay_steps)

                # perform the operations we defined earlier on batch
                _, acc, loss, lr_value = sess.run(
                    [train_op, training_accuracy, training_loss, training_learn_rate],
                    feed_dict={x: batch_x, y_: batch_y, learn_rate: lr, pkeep: train_pkeep})

                # log training accuracy, loss and learning rate
                writer.add_summary(acc, current_step)
                writer.add_summary(loss, current_step)
                writer.add_summary(lr_value, current_step)

            # Test loss and accuracy, logged at the last training step of the epoch
            test_acc, acc, loss, a_loss = sess.run(
                [accuracy, test_accuracy, test_loss, cross_entropy],
                feed_dict={x: testX, y_: testY, pkeep: test_pkeep})
            writer.add_summary(acc, current_step)
            writer.add_summary(loss, current_step)
            if epoch % 5 == 0:
                print ("Epoch: ", epoch)
                print ('Test Accuracy',test_acc)

        print ("Accuracy: ", accuracy.eval(feed_dict={x: testX, y_: testY, pkeep: test_pkeep}))

        # Save the model
        saver.save(sess, save_path + "model.ckpt")
        print ("done")
    finally:
        # Flush and close the event file even if training fails; otherwise
        # buffered summaries can be lost.
        writer.close()

Epoch:  0
Test Accuracy 0.838
Accuracy:  0.8548
done
