In [50]:
%pylab inline
import numpy as np
import pandas as pd
import matplotlib

from numpy import exp, dot, log
from sklearn.linear_model import LogisticRegression
matplotlib.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


In [2]:
def list2dict(ls):
    dic = dict()
    for term in ls:
        if term not in dic:
            dic[term] = 1
        else:
            dic[term] += 1
    return dic

def data_formating(raw_df):
    df = raw_df.assign(PhraseSplit=raw_df.Phrase.apply(lambda s: list2dict(str.split(s, " "))))
    df = df.assign(WordTotal=df.PhraseSplit.apply(lambda l: len(l)))
    return df

In [4]:
train = pd.read_csv("train.tsv", sep="\t")
train = data_formating(train)

word = train[train.WordTotal == 1].reset_index().drop('index', 1)

In [105]:
def wordInitVecs(wordlst, d, sentiment = False):
    # Return a Initial Word Vectors with shape (dimension, numOfVocabulary) with column corresponding to each word.
    # And a dictionary recording indexes of word and sentiment in training data.
    Nvocab = len(wordlst)
    if sentiment==False:
        sentiment = 2*np.ones(Nvocab, dtype=int)
    lst = list(wordlst)
    dic = {lst[i]: i for i in xrange(Nvocab)}
    L = 0.0001*np.random.rand(d, Nvocab) - 0.5
    S = np.zeros((5, Nvocab))
    model = LogisticRegression()
    W = model.fit(L.T, sentiment).coef_
    for i in xrange(Nvocab):
        S[sentiment[i], i] = 1
    return L, W, S, dic

In [102]:
def softmax(L, W):
    expOfLW = exp(dot(W, L)) # 5 x |V|
    return expOfLW / expOfLW.sum(axis=0)

def costfunction(L, W, sentiment):
    M = len(sentiment)
    condProb = softmax(L, W)
    tmp = [ log(condProb[sentiment[i], i]) for i in xrange(M)]
    return -sum(tmp)

def costfunction_gradient_w(L, W, sentiment, condProb=None):
    M = len(sentiment)
    if condProb == None:
        condProb = softmax(L, W)
    S = np.zeros((5, M))
    for i in xrange(M):
        S[sentiment[i], i] = 1
    return -dot((S-condProb), L.T)

def costfunction_gradient_l(L, W, sentiment, condProb=None):
    M = len(sentiment)
    if condProb == None:
        condProb = softmax(L, W)
    S = np.zeros((5, M))
    for i in xrange(M):
        S[sentiment[i], i] = 1
    return -dot(W.T, (S-condProb))

def tuning(initL, initW, sentiment, alpha = 0.015):
    L = initL
    W = initW
    for iter in xrange(200):
        L = L - alpha * costfunction_gradient_l(L, W, sentiment)
        W = W - alpha * costfunction_gradient_w(L, W, sentiment)
    return L, W

In [99]:
wordlst = list(word.Phrase)
sentiment = list(word.Sentiment)

In [118]:
L, W, S, dic = wordInitVecs(wordlst, 40, sentiment)
alpha = 0.01

for iter in xrange(200):
    print(costfunction(L, W, sentiment))
    L = L - alpha * costfunction_gradient_l(L, W, sentiment)
    W = W - alpha * costfunction_gradient_w(L, W, sentiment)

alpha = 0.005
for iter in xrange(200):
    print(costfunction(L, W, sentiment))
    L = L - alpha * costfunction_gradient_l(L, W, sentiment)
    W = W - alpha * costfunction_gradient_w(L, W, sentiment)

14925.8956815
4083630.62925
inf
290726.422028
303314.981839
235547.454954
167711.895942
206557.944536
74934.4305019
175539.48588
118677.017655
46117.6586924
102840.974275
89297.93072
144808.114729
118844.302168
191099.140309
58558.7708655
183935.732737
16816.4573756
109431.217971
70492.2311757
54078.861222
49707.8223842
68300.6092142
21785.6521307
29505.4588956
26701.5399434
16682.1450702
24260.8138735
17709.2452142
13693.2945801
20413.0023551
24967.3488162
13684.2770261
4847.32412662
12775.8457288
24405.7302823
15439.165385
8698.02220283
3947.14885153
10935.365742
11857.3534322
7157.23836565
4233.48277236
2582.34309368
1411.00178767
581.578747145
337.278337728
332.97916873
440.124825947
313.285561739
291.328822765
284.852042965
286.3872972
272.553571559
267.59016464
265.310402938
256.477012639
251.462285657
245.77625306
241.688190163
232.59673892
226.121051966
217.933627024
211.187933463
199.813848633
190.453881419
179.741607354
169.834932903
157.351684314
146.405975899
134.787132293


In [120]:
softmax(L, W)

array([[  2.75289396e-171,   2.74990376e-171,   2.76731876e-171, ...,
          2.78981031e-171,   2.78056224e-171,   2.75879359e-171],
       [  3.04009935e-186,   3.02981790e-186,   3.05676745e-186, ...,
          3.08214586e-186,   3.07253767e-186,   3.04686461e-186],
       [  1.00000000e+000,   1.00000000e+000,   1.00000000e+000, ...,
          1.00000000e+000,   1.00000000e+000,   1.00000000e+000],
       [  3.78660088e-184,   3.78161414e-184,   3.80931798e-184, ...,
          3.84092499e-184,   3.82620365e-184,   3.79663298e-184],
       [  8.42220619e-160,   8.42022483e-160,   8.46763013e-160, ...,
          8.52928148e-160,   8.49872783e-160,   8.44285967e-160]])