In [136]:
%pylab inline
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from numpy import exp, dot, log
from sklearn.linear_model import LogisticRegression
matplotlib.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


In [2]:
def list2dict(ls):
    """Count how often each element of `ls` occurs.

    Returns a plain dict mapping every distinct element to its number of
    occurrences; an empty input yields an empty dict.
    """
    counts = {}
    for item in ls:
        counts[item] = counts.get(item, 0) + 1
    return counts

def data_formating(raw_df):
    """Return `raw_df` extended with two derived columns.

    PhraseSplit -- dict of token -> occurrence count, built by splitting each
                   value of the Phrase column on single spaces.
    WordTotal   -- number of keys in PhraseSplit, i.e. the number of
                   *distinct* tokens in the phrase (repeated tokens count once).
    """
    def token_counts(phrase):
        return list2dict(str.split(phrase, " "))

    with_split = raw_df.assign(PhraseSplit=raw_df.Phrase.apply(token_counts))
    distinct_tokens = with_split.PhraseSplit.apply(lambda counts: len(counts))
    return with_split.assign(WordTotal=distinct_tokens)

In [4]:
# Load the tab-separated training set and add the PhraseSplit / WordTotal columns.
train = pd.read_csv("train.tsv", sep="\t")
train = data_formating(train)

# Rows whose WordTotal is 1, re-indexed from 0. `drop=True` discards the old
# index directly instead of materialising an 'index' column and then removing
# it with a positional `axis` argument, which recent pandas releases reject.
word = train[train.WordTotal == 1].reset_index(drop=True)

In [131]:
def wordInitVecs(wordlst, d, sentiment = False):
    """Build initial word vectors, classifier weights and one-hot labels.

    Parameters
    ----------
    wordlst : sequence of words; word i corresponds to column i of the outputs.
    d : number of rows (embedding dimension) of the word-vector matrix.
    sentiment : sequence of integer class labels (used as row indices into a
        5-row one-hot matrix), one per word. When left as False (or None),
        every word is given label 2.

    Returns
    -------
    L : (d, Nvocab) array of small random values in (-0.00005, 0.00005).
    W : `coef_` of a LogisticRegression fitted on (L.T, labels).
    S : (5, Nvocab) one-hot matrix with S[labels[i], i] == 1.
    dic : dict mapping each word to its column index (for a repeated word the
        last occurrence wins).

    NOTE(review): scikit-learn's LogisticRegression presumably refuses to fit
    when only one class is present, so the all-2 default likely fails at the
    `fit` call -- confirm before relying on the default.
    """
    Nvocab = len(wordlst)
    # Identity checks rather than `sentiment == False`: `==` on an ndarray is
    # element-wise and cannot serve as an `if` condition.
    if sentiment is False or sentiment is None:
        labels = np.full(Nvocab, 2, dtype=int)
    else:
        labels = np.asarray(sentiment, dtype=int)
    dic = {term: idx for idx, term in enumerate(wordlst)}
    L = 0.0001*(np.random.rand(d, Nvocab) - 0.5)
    model = LogisticRegression()
    W = model.fit(L.T, labels).coef_
    # One-hot encode all labels at once: row = class label, column = word index.
    S = np.zeros((5, Nvocab))
    S[labels, np.arange(Nvocab)] = 1
    return L, W, S, dic

def softmax(L, W):
    """Column-wise softmax of the score matrix dot(W, L).

    Each column of the result is non-negative and sums to 1. In this notebook
    the scores are 5 x |V| (one column per word).
    """
    scores = dot(W, L)
    # Subtracting each column's maximum leaves the softmax value unchanged but
    # keeps exp() from overflowing to inf, which would turn the ratio into nan.
    scores = scores - scores.max(axis=0)
    expOfLW = exp(scores)
    return expOfLW / expOfLW.sum(axis=0)

def costfunction(L, W, sentiment):
    """Negative log-likelihood of the labels under softmax(L, W).

    Computes -sum_i log(P[sentiment[i], i]) where P = softmax(L, W) and
    `sentiment` holds one integer class label per column of L.
    """
    M = len(sentiment)
    condProb = softmax(L, W)
    labels = np.asarray(sentiment, dtype=int)
    # Pick the probability of the true class in every column in one indexing
    # step instead of a Python-level loop.
    return -log(condProb[labels, np.arange(M)]).sum()

def costfunction_gradient_w(L, W, sentiment, condProb=None, lam=False):
    """Gradient term for W: -dot(S - P, L.T) + 2*lam*W.

    Parameters
    ----------
    L, W : word-vector matrix and weight matrix (W has 5 rows here).
    sentiment : one integer class label (0..4) per column of L.
    condProb : optional precomputed softmax(L, W); computed when None.
    lam : multiplier of the 2*lam*W term; True is treated as 1 and False
        disables the term.

    S is the 5 x M one-hot matrix of the labels and P is `condProb`.
    """
    M = len(sentiment)
    # `is None`, not `== None`: comparing an ndarray with `==` is element-wise,
    # which makes the `if` ambiguous as soon as a real condProb is passed in.
    if condProb is None:
        condProb = softmax(L, W)
    if lam is True:
        lam = 1
    S = np.zeros((5, M))
    S[np.asarray(sentiment, dtype=int), np.arange(M)] = 1
    return -dot((S-condProb), L.T) + 2*lam*W

def costfunction_gradient_l(L, W, sentiment, condProb=None, lam=False):
    """Gradient term for L: -dot(W.T, S - P) + 2*lam*L.

    Parameters
    ----------
    L, W : word-vector matrix and weight matrix (W has 5 rows here).
    sentiment : one integer class label (0..4) per column of L.
    condProb : optional precomputed softmax(L, W); computed when None.
    lam : multiplier of the 2*lam*L term; True is treated as 1 and False
        disables the term.

    S is the 5 x M one-hot matrix of the labels and P is `condProb`.
    """
    M = len(sentiment)
    # `is None`, not `== None`: comparing an ndarray with `==` is element-wise,
    # which makes the `if` ambiguous as soon as a real condProb is passed in.
    if condProb is None:
        condProb = softmax(L, W)
    if lam is True:
        lam = 1
    S = np.zeros((5, M))
    S[np.asarray(sentiment, dtype=int), np.arange(M)] = 1
    return -dot(W.T, (S-condProb)) + 2*lam*L

def tuning(initL, initW, sentiment, alpha = 0.01, lam=False, n_iter=200):
    """Run alternating gradient-descent updates on L and W.

    Parameters
    ----------
    initL, initW : starting matrices; they are not modified in place.
    sentiment : integer class labels, one per column of L.
    alpha : step size.
    lam : forwarded to both gradient functions.
    n_iter : number of update rounds (defaults to the previous fixed 200).

    Within each round L is updated first and the W update then uses the
    already-updated L.

    Returns the final (L, W).
    """
    L = initL
    W = initW
    for _ in range(n_iter):
        L = L - alpha * costfunction_gradient_l(L, W, sentiment, lam=lam)
        W = W - alpha * costfunction_gradient_w(L, W, sentiment, lam=lam)
    return L, W

In [257]:
wordlst = list(word.Phrase)
sentiment = list(word.Sentiment)
d = 40
L, W, S, dic = wordInitVecs(wordlst, d, sentiment)

# Two stages of 200 gradient-descent steps each, first with step size 0.01 and
# then 0.012. The cost is printed before every step. Within a step L is updated
# first and the W update uses the new L.
for alpha in (0.01, 0.012):
    for step in range(200):
        print(costfunction(L, W, sentiment))
        L = L - alpha * costfunction_gradient_l(L, W, sentiment)
        W = W - alpha * costfunction_gradient_w(L, W, sentiment)

26612.055877
26612.0557977
26612.0554121
26612.0529541
26612.0363464
26611.9225316
26611.139695
26605.7504934
26568.6660446
26314.6818667
24633.2400461
16125.330331
7506.31603124
6396.46263277
5821.47303312
5326.62272145
4868.11304536
4483.2303053
4198.34710469
3999.77507927
3858.95301936
3750.39800001
3650.20787006
3527.35223645
3331.27813561
2988.73141742
2461.32392246
1876.57951058
1443.07542848
1191.02835465
1048.80247928
963.397134995
908.14101133
870.002793963
842.266717778
821.233022479
804.73139119
791.416759575
780.413985891
771.129126876
763.14240335
756.146131051
749.905240881
744.233227188
738.974350416
733.992891424
729.162959748
724.362977276
719.468048046
714.346842383
708.854662774
702.83215396
696.09898294
688.455571378
679.679289585
669.532591223
657.766208345
644.139020312
628.433883566
610.49178485
590.238205132
567.71725092
543.102021061
516.687386964
488.843132859
459.944016937
430.290907505
400.061352352
369.317514524
338.0841049
306.47712572
274.834014578
243.77

In [258]:
# Working copy of `train` with two extra columns:
#   wordembed   -- None by default; for rows with WordTotal == 1, the column of
#                  L that `dic` assigns to the phrase
#   bookkeeping -- 0 by default; 1 for rows with WordTotal == 1
up = train.assign(wordembed = None, bookkeeping = 0)
single_word = up.WordTotal == 1
up.loc[single_word, 'wordembed'] = up.loc[single_word, 'Phrase'].apply(lambda term: L[:, dic[term]])
up.loc[single_word, 'bookkeeping'] = 1

In [None]:
def childPhrase(pid, sid):
    """Find the two child phrases that phrase `pid` splits into.

    Searches the global frame `up` for two phrases of sentence `sid` that,
    joined by a single space, reproduce the phrase exactly. Candidates are
    tried from largest WordTotal to smallest.

    Parameters
    ----------
    pid : PhraseId of the phrase to split.
    sid : SentenceId the phrase belongs to.

    Returns
    -------
    (left PhraseId, right PhraseId), in the order the children appear in the
    phrase, or (None, None) when the phrase has WordTotal == 1.

    Raises
    ------
    ValueError if no pair of candidate phrases reproduces the phrase.
    """
    selected = up[(up.PhraseId==pid)]
    numword = selected.WordTotal.iloc[0]
    if numword==1:
        return (None, None)
    phrase = selected.Phrase.iloc[0]
    perhapChild = up[(up.WordTotal < numword) & (up.SentenceId==sid)]
    childs = perhapChild[perhapChild.Phrase.apply(lambda p: p in phrase)].sort_values('WordTotal', ascending=False)

    for k in range(len(childs)):
        bigChild = childs.iloc[k]
        big = bigChild.Phrase
        # Decide prefix/suffix on the whole child text plus the separating
        # space; comparing only the first character picks the wrong side
        # whenever both children start with the same letter.
        splits = []
        if phrase.startswith(big + " "):
            splits.append((phrase[len(big) + 1:], True))
        if phrase.endswith(" " + big):
            splits.append((phrase[:len(phrase) - len(big) - 1], False))
        for rest, bigFirst in splits:
            little = childs[childs.Phrase==rest]
            if len(little) == 0:
                # The remainder is not itself a known phrase; try another split.
                continue
            littleChild = little.iloc[0]
            if bigFirst:
                return (bigChild.PhraseId, littleChild.PhraseId)
            return (littleChild.PhraseId, bigChild.PhraseId)

    raise ValueError("no pair of child phrases found for PhraseId %r in SentenceId %r" % (pid, sid))

def node(pid, sid):
    """Return a TensorFlow constant holding the word vector of a one-word phrase.

    For a phrase with WordTotal == 1 the stored `wordembed` value is wrapped
    in tf.constant and returned. For any other phrase the children are looked
    up with childPhrase and nothing is returned.
    """
    selected = up[(up.PhraseId==pid)]
    if selected.WordTotal.iloc[0] == 1:
        return tf.constant(selected.wordembed.iloc[0])
    # TODO: the children are computed but not used yet, so this path
    # currently falls through and returns None.
    childs = childPhrase(pid, sid)

In [None]:
# NOTE(review): tf.Variable() is called without an initial value and the cell
# has no execution count -- this looks like an unfinished placeholder; confirm
# the intended initial value/shape before running.
p = tf.Variable()

In [255]:
# Try childPhrase on phrase 3 of sentence 1, then create a trainable
# d x 2d weight matrix initialised from a normal distribution.
pid, sid = 3, 1
numword = up.loc[up.PhraseId == pid, 'WordTotal'].iloc[0]

childs = childPhrase(pid, sid)
W_p = tf.Variable(tf.random_normal([d, 2*d]))


In [239]:
# Step-by-step version of the childPhrase logic for phrase 1 of sentence 1.
pid = 1
sid = 1
numword = 28  # NOTE(review): hard-coded; presumably the WordTotal of phrase 1 -- confirm

phrase = up[(up.PhraseId==pid)].Phrase.iloc[0]
perhapChild = up[(up.WordTotal < numword) & (up.SentenceId==sid)]

# Candidate children: phrases of the same sentence contained in `phrase`,
# largest WordTotal first.
childs = perhapChild[perhapChild.Phrase.apply(lambda p: p in phrase)].sort_values('WordTotal', ascending=False)
bigChild = childs.iloc[0]
numchild = bigChild.WordTotal
# Was `child[...]`, an undefined name; the candidates live in `childs`.
perhaplittlechild = childs[childs.WordTotal==numword-numchild]

# Test the whole prefix (child text plus separating space) rather than only
# the first character, which misfires when both children share an initial.
if phrase.startswith(bigChild.Phrase + " "):
    regex = r"^"+ re.escape(bigChild.Phrase)+ r" (.+)$"
    match = re.search(regex, phrase)
    bigFirst = True
else:
    regex = r"^(.+) "+ re.escape(bigChild.Phrase)+ r"$"
    match = re.search(regex, phrase)
    bigFirst = False

littleChild = childs[childs.Phrase==match.group(1)].iloc[0]

In [256]:
# Display the current value of `childs`.
childs

(4, 5)

In [195]:
# Display the phrase currently being split.
phrase

'of escapades demonstrating the adage that what is good for the goose'

In [229]:
# Capture (as group 1) everything in `phrase` that precedes a trailing
# " <bigChild.Phrase>". `re` comes from the notebook's top import cell.
regex = r"^(.+) "+ re.escape(bigChild.Phrase)+ r"$"
match = re.search(regex, phrase)

In [230]:
# First capture group of the regex: the part of `phrase` outside bigChild.Phrase.
match.group(1)

'of'

In [219]:
# Inspect how re.escape renders bigChild.Phrase before it is spliced into the regex.
re.escape(bigChild.Phrase)

'escapades\\ demonstrating\\ the\\ adage\\ that\\ what\\ is\\ good\\ for\\ the\\ goose'