# Sentiment Analysis on Movie Reviews
--------
## Classify the sentiment of sentences from the Rotten Tomatoes dataset

"There's a thin line between likably old-fashioned and fuddy-duddy, and The Count of Monte Cristo ... never quite settles on either side."

The Rotten Tomatoes movie review dataset is a corpus of movie reviews used for sentiment analysis, originally collected by Pang and Lee [1]. In their work on sentiment treebanks, Socher et al. [2] used Amazon's Mechanical Turk to create fine-grained labels for all parsed phrases in the corpus. This competition presents a chance to benchmark your sentiment-analysis ideas on the Rotten Tomatoes dataset. You are asked to label phrases on a scale of five values: negative, somewhat negative, neutral, somewhat positive, positive. Obstacles like sentence negation, sarcasm, terseness, language ambiguity, and many others make this task very challenging.

In [149]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re

from numpy import log
# Loss value of a prediction that assigns probability 0.5 to the correct class;
# the training loops report every sample whose loss is above this threshold.
half = -log(0.5)

In [150]:
def list2dict(ls):
    """Count how often each item occurs in ``ls``.

    Returns a plain ``dict`` mapping every distinct item of ``ls`` to the
    number of times it appears.
    """
    counts = {}
    for item in ls:
        counts[item] = counts.get(item, 0) + 1
    return counts

def data_formating(raw_df):
    """Normalise and tokenise the ``Phrase`` column of a review table.

    Every phrase is lower-cased and a space is inserted between a word
    character and an adjacent punctuation mark, so that punctuation becomes
    its own whitespace-separated token.

    Args:
        raw_df: DataFrame with a string column ``Phrase``. It is not modified.

    Returns:
        A new DataFrame with the normalised ``Phrase`` plus three columns:
        ``PhraseSplit`` (list of tokens), ``WordTotal`` (number of tokens)
        and ``Length`` (number of characters of the normalised phrase).
        The first row whose phrase is exactly ``" "`` is dropped, if any.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = raw_df.copy()

    def space_out_punctuation(s):
        s = s.lower()
        # word character followed by punctuation -> put a space before the mark
        s = re.sub(r'(?<=[\w\d])([^\w\d\s])', r' \1', s)
        # punctuation followed by a word character -> put a space after the mark
        return re.sub(r'([^\w\d\s])(?=[\w\d])', r'\1 ', s)

    df['Phrase'] = df.Phrase.apply(space_out_punctuation)
    df = df.assign(PhraseSplit=df.Phrase.apply(lambda s: str.split(str(s))))
    # Length is taken from this frame (not from any module-level table).
    df = df.assign(WordTotal=df.PhraseSplit.apply(len), Length=df.Phrase.apply(len))

    # Drop the first blank phrase; tolerate tables that do not contain one.
    blank = df.index[df.Phrase == " "]
    if len(blank) > 0:
        df = df.drop(blank[0])
    return df

def rvwhitespace(s):
    """Strip at most one leading and one trailing space from ``s``.

    Unlike ``str.strip`` this removes a single space per side only, so
    ``'  a  '`` becomes ``' a '``. The empty string and a lone space both
    yield ``''``.
    """
    if s in ('', ' '):
        return ''
    start = 1 if s.startswith(' ') else 0
    stop = len(s) - 1 if s.endswith(' ') else len(s)
    return s[start:stop]

In [151]:
# Load the training phrases, normalise/tokenise them and order the rows from
# the fewest tokens to the most, renumbering the rows 0..n-1 in that order.
data = pd.read_csv("train.tsv", sep="\t")
data = (
    data_formating(data)
    .sort_values('WordTotal')
    # drop=True discards the old row labels instead of keeping them as an
    # 'index' column that has to be dropped afterwards (the positional
    # `axis` argument of DataFrame.drop is rejected by pandas >= 2.0).
    .reset_index(drop=True)
    .drop('SentenceId', axis=1)
)

In [152]:
# Vocabulary: every distinct token that occurs as a one-token phrase.
word = data[data.WordTotal == 1]

# Map token -> consecutive id 0..len(dic)-1. The ids index the columns of the
# embedding matrix, whose width is len(dic), so they must be contiguous; using
# the row label as id can exceed that width when two rows share a token.
# Keys are the tokens themselves (PhraseSplit[0]) because lookups are made
# with tokens. Plain iteration replaces `.ix`, which pandas 1.0 removed.
dic = {}
for tokens in word.PhraseSplit:
    dic.setdefault(tokens[0], len(dic))

def phraselst2idx(phraselst, replaceRarePhrase='the'):
    """Translate a list of tokens into their vocabulary indices.

    Tokens that are not keys of ``dic`` are encoded with the index of
    ``replaceRarePhrase`` instead.
    """
    indices = []
    for token in phraselst:
        key = token if token in dic else replaceRarePhrase
        indices.append(dic[key])
    return indices

# Encode every phrase as the list of vocabulary indices of its tokens.
data = data.assign(seqindex=data.PhraseSplit.apply(phraselst2idx))

In [153]:
# Vocabulary size: number of entries in `dic`.
NV = len(dic)
# Dimension of the token / phrase vectors.
d = 40
# Token embeddings, shape [d, NV]: one column per vocabulary entry.
P = tf.Variable(tf.random_normal([d, NV]))
# Output weights, shape [5, d]: turn a phrase vector into 5 class scores.
Ws = tf.Variable(tf.random_normal([5, d]))
# Composition weights, shape [d, 2*d]: applied to the stacked pair [p; p_next].
W  = tf.Variable(tf.random_normal([d, 2*d]))
# Composition bias, shape [d, 1].
b  = tf.Variable(tf.random_normal([d, 1]))

In [154]:
def getseqidx(t):
    """Return an int64 tensor with the token-index sequence of data row ``t``.

    ``t`` is a scalar integer tensor holding a row label of ``data``. The
    lookup runs in Python through ``tf.py_func`` each time the graph is
    evaluated, reading ``data.seqindex``.
    """
    def lookup(rows):
        # `rows` has one element because `t` is wrapped in a list below.
        # `.loc` replaces `.ix`, which was removed in pandas 1.0.
        return [data.seqindex.loc[row] for row in rows]

    return tf.py_func(lookup, [[t]], [tf.int64])[0]

因為我的電腦裝不下 parse 完的 phrase tree structure，我簡化了模型：

 1. 以 1 向量為初始向量： ```p = ones(d,1)```
 2. 從句首到句末方向堆疊：``` p = tanh( W* [p; p_next] + b )```

In [155]:
def p_accumulate(p, x):
    """One composition step: fold the embedding of token ``x`` into ``p``.

    Takes column ``x`` of ``P`` as a [d, 1] column vector ``p_next``, stacks
    it under ``p`` and returns ``tanh(W . [p; p_next] + b)``.
    """
    embedding = tf.gather(tf.transpose(P), x)
    # Wrapping in a list and transposing turns the gathered row into a column.
    p_next = tf.transpose([embedding])
    stacked = tf.concat(0, [p, p_next])
    return tf.nn.tanh(tf.add(tf.matmul(W, stacked), b))

def prob(idx):
    """Build the phrase vector of data row ``idx``.

    Looks up the row's token-index sequence and folds it from the first token
    to the last with ``p_accumulate``, starting from an all-ones [d, 1]
    vector. Returns the vector obtained after the last token.
    """
    seqindex = getseqidx(idx)
    # tf.scan yields the state after every token; only the final one is used.
    states = tf.scan(p_accumulate, seqindex, initializer=tf.ones([d, 1]))
    return states[-1]
    
def selectfromSoftmax(p):
    """Turn a phrase vector ``p`` into a row of softmax class probabilities."""
    scores = tf.matmul(Ws, p)
    # Transpose so the class scores lie along the last axis for the softmax.
    return tf.nn.softmax(tf.transpose(scores))

def single_cost(idx, sentiment):
    """Negative log-probability the model gives label ``sentiment`` on row ``idx``."""
    probabilities = selectfromSoftmax(prob(idx))
    correct_class_probability = probabilities[0, sentiment]
    return -tf.log(correct_class_probability)

def pred(idx):
    """Index of the most probable class for data row ``idx``."""
    probabilities = selectfromSoftmax(prob(idx))
    return tf.arg_max(probabilities, 1)

# Placeholders fed at run time:
#   input_idx - row label of `data` to predict (used by pred_idx)
#   idx_tf    - row label of `data` to train on
#   sen_tf    - sentiment label of that training row
input_idx =  tf.placeholder(tf.int32)
idx_tf = tf.placeholder(tf.int32)
sen_tf = tf.placeholder(tf.int32)

# Training objective: loss of a single (row, label) pair.
loss = single_cost(idx_tf, sen_tf)

# Disabled experiment: loss summed over a fixed mini-batch of 10 rows.
#idx_tf_batch = tf.placeholder(tf.int32, shape=10)
#sen_tf_batch = tf.placeholder(tf.int32, shape=10)

#loss0 = single_cost(idx_tf_batch[0], sen_tf_batch[0])
#loss1 = single_cost(idx_tf_batch[1], sen_tf_batch[1])
#loss2 = single_cost(idx_tf_batch[2], sen_tf_batch[2])
#loss3 = single_cost(idx_tf_batch[3], sen_tf_batch[3])
#loss4 = single_cost(idx_tf_batch[4], sen_tf_batch[4])
#loss5 = single_cost(idx_tf_batch[5], sen_tf_batch[5])
#loss6 = single_cost(idx_tf_batch[6], sen_tf_batch[6])
#loss7 = single_cost(idx_tf_batch[7], sen_tf_batch[7])
#loss8 = single_cost(idx_tf_batch[8], sen_tf_batch[8])
#loss9 = single_cost(idx_tf_batch[9], sen_tf_batch[9])

#loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6 + loss7 + loss8 + loss9

# Prediction op: most probable class for the row fed through input_idx.
pred_idx = pred(input_idx)

In [156]:
# Row labels and their sentiment labels as parallel sequences, so that
# position i of both refers to the same training row.
idx = list(data.index)
# `.values` replaces `Series.get_values()`, which was removed in pandas 1.0.
sen = data.Sentiment.values

max(idx)  # shown by the notebook: the largest row label

156058

In [157]:
# Gradient-descent step (learning rate 0.03) on the single-sample loss, and
# the op that initialises every tf.Variable.
train = tf.train.GradientDescentOptimizer(0.03).minimize(loss)
init = tf.initialize_all_variables()

In [158]:
# Open the session used by all later cells and initialise the variables.
sess = tf.Session()
sess.run(init)

In [None]:
# Warm-up training: several passes over the first 14813 rows only.
# NOTE(review): rows are sorted by token count, so 14813 is presumably the
# number of shortest (one-token) phrases - confirm against the data.
# Alternative optimizer tried: tf.train.AdadeltaOptimizer(0.03).
train = tf.train.GradientDescentOptimizer(0.03).minimize(loss)

epoches = 4
for epoch in range(epoches):
    prev_bad = 0
    for i in range(14813):
        # Visit the rows in order; use np.random.randint(14813) here to
        # sample them at random instead.
        randint = i
        feed = {idx_tf: idx[randint], sen_tf: sen[randint]}

        sess.run(train, feed_dict=feed)
        # Loss on the same sample, evaluated after the update step.
        loss_out = sess.run(loss, feed_dict=feed)
        if loss_out > half:  # the correct class got probability below 0.5
            # position, loss, and distance to the previous poorly fitted row
            print(i,loss_out,i-prev_bad)
            prev_bad = i

(25, 1.1165409, 25)
(69, 0.90598547, 44)
(87, 1.0433263, 18)
(147, 1.597728, 60)
(153, 1.1206322, 6)
(179, 0.71276522, 26)
(210, 2.4022431, 31)
(215, 1.1590656, 5)
(237, 1.0516987, 22)
(279, 1.5132756, 42)
(302, 1.1657186, 23)
(353, 5.6450472, 51)
(356, 1.9827741, 3)
(359, 2.8138723, 3)
(401, 2.0410736, 42)
(404, 0.94998306, 3)
(412, 0.77907169, 8)
(413, 1.0032108, 1)
(418, 1.0194936, 5)
(449, 0.88473731, 31)
(485, 2.4774117, 36)
(488, 1.5538841, 3)
(492, 0.92646694, 4)
(494, 2.047981, 2)
(503, 1.7571079, 9)
(520, 0.83273065, 17)
(571, 0.95607513, 51)
(602, 0.85681653, 31)
(605, 1.7011203, 3)
(620, 2.9291093, 15)
(662, 0.96637535, 42)
(705, 4.6598454, 43)
(710, 1.6649283, 5)
(719, 1.0779248, 9)
(739, 1.4163159, 20)
(749, 1.5777762, 10)
(752, 1.5333626, 3)
(764, 2.0324538, 12)
(771, 1.3375869, 7)
(775, 1.5368434, 4)
(776, 3.1243727, 1)
(864, 4.2147851, 88)
(870, 1.8587348, 6)
(896, 5.4300728, 26)
(918, 0.83892018, 22)
(957, 1.568354, 39)
(969, 1.0895612, 12)
(1081, 2.5244384, 112)
(1089

In [None]:
# Full training: several in-order passes over every row of `data`.
print('====Train the entired data====')
train = tf.train.GradientDescentOptimizer(0.03).minimize(loss)

epoches = 4
for epoch in range(epoches):
    bad = 0
    # Iterate over exactly the rows that exist; a hard-coded row count runs
    # past the end of `idx` because one blank phrase was dropped on loading.
    for i in range(len(idx)):
        randint = i
        feed = {idx_tf: idx[randint], sen_tf: sen[randint]}

        sess.run(train, feed_dict=feed)
        # Loss on the same sample, evaluated after the update step.
        loss_out = sess.run(loss, feed_dict=feed)
        if loss_out > half:  # the correct class got probability below 0.5
            # position, loss, and distance to the previous poorly fitted row
            print(i,loss_out,i-bad)
            bad = i

(1, 5.5327067, 1)
(2, 27.079372, 1)
(3, 17.74556, 1)
(5, 15.390068, 2)
(6, 18.147911, 1)
(8, 12.226184, 2)
(9, 4.0844469, 1)
(10, 13.733372, 1)
(11, 25.527685, 1)
(13, 11.920857, 2)
(14, 28.481321, 1)
(15, 39.526772, 1)
(16, 1.2739116, 1)
(18, 0.74682051, 2)
(19, 0.73512679, 1)
(20, 1.0731643, 1)
(25, 12.878964, 5)
(26, 33.585339, 1)
(27, 12.606395, 1)
(30, 0.95441437, 3)
(31, 10.491284, 1)
(32, 12.70898, 1)
(34, 21.472982, 2)
(35, 20.862719, 1)
(37, 17.514322, 2)
(38, 6.742384, 1)
(42, 7.2019887, 4)
(44, 5.8353324, 2)
(46, 17.097395, 2)
(47, 7.4517961, 1)
(48, 15.991641, 1)
(49, 17.792767, 1)
(50, 18.798925, 1)
(51, 26.68601, 1)
(53, 23.820343, 2)
(55, 20.869045, 2)
(57, 2.7744749, 2)
(59, 7.5353451, 2)
(60, 17.743935, 1)
(61, 0.97377902, 1)
(62, 27.949539, 1)
(64, 8.8830042, 2)
(65, 5.7538719, 1)
(68, 2.7350588, 3)
(69, 14.847012, 1)
(72, 32.501038, 3)
(73, 8.8933105, 1)
(75, 0.99761444, 2)
(77, 2.7518356, 2)
(78, 5.5928545, 1)
(80, 10.200979, 2)
(81, 8.6470127, 1)
(83, 12.270762, 2)

In [None]:
# Load the test phrases and apply the same normalisation/tokenisation as for
# the training data.
test = pd.read_csv("test.tsv", sep="\t")
test = data_formating(test)

In [None]:
# Surround every punctuation mark that has a word character or whitespace on
# both sides with spaces, trim the result, then rebuild the token lists.
test['Phrase'] = test.Phrase.apply(
    lambda s: re.sub(r'(?<=[\w\d\s])([^\w\d\s])(?=[\w\d\s])', r' \1 ', s).strip()
)
test['PhraseSplit'] = test.Phrase.apply(lambda s: s.split())

In [None]:
# Token lists of all test phrases as an array (`.values` replaces
# `Series.get_values()`, which was removed in pandas 1.0).
testPhrase = test.PhraseSplit.values

In [None]:
# Predicted class (0-4) for every test phrase.
answer = np.zeros(test.shape[0])

# Build the inference ops ONCE and feed each phrase through a placeholder.
# Creating tf.constant/tf.scan inside the loop adds new nodes to the graph on
# every iteration, so memory use and run time grow with the number of phrases.
test_seq = tf.placeholder(tf.int32, shape=[None])
test_softmax = selectfromSoftmax(
    tf.scan(p_accumulate, test_seq, initializer=tf.ones([d, 1]))[-1]
)

for i, phrase in enumerate(testPhrase):
    # Tokens missing from the vocabulary are encoded with the fallback token.
    testidx = phraselst2idx(phrase)
    ans = np.argmax(sess.run(test_softmax, feed_dict={test_seq: testidx}))
    print(i)
    answer[i] = ans

In [100]:
# Spot check: predict the class of 1000 randomly drawn training rows.
samples = data.sample(1000)
sampleIdx = samples.index

answer = np.zeros(len(sampleIdx))

for position, row_label in enumerate(sampleIdx):
    # pred_idx looks the row up in `data` by its label.
    answer[position] = sess.run(pred_idx, feed_dict={input_idx: row_label})

In [101]:
# Number of sampled rows whose predicted class equals the true sentiment.
diff = answer - samples.Sentiment
(diff == 0.0).sum()

344