## Overall Process of Latent Aspect Rating Analysis

In [1]:
## Define Review Class
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data

with open( 'stopwords.dat', 'r' ) as stopfile:
    en_stop = stopfile.readlines()
    en_stop = [ item.split('\n')[0] for item in en_stop if not item in ['\n'] ]

class Review:
    # Initialization
    def __init__(self, hotel='', title='', rating='', aspects='', content='' ):
        self.hotel = hotel
        self.title = title
        self.rating = rating
        self.aspects = aspects
        self.content = content
        
    def Sentences( self ):
        tokenizer = nltk.data.load( 'tokenizers/punkt/english.pickle' )
        self.sentences = tokenizer.tokenize( self.title.lower() + self.content.lower() )
        
    def pre_process( self ):
        tokenizer = RegexpTokenizer( r'\w+' )
        temp = []
        for sentence in self.sentences:
            tokens = tokenizer.tokenize( sentence )
            tokens_nostop = [ item for item in tokens if not unicode( item ) in en_stop ]
            temp.append( tokens_nostop )
        self.tokens = temp
        
    def removeTerms( self, Vocabulary ):
        terms_removed = []
        for sentence in self.tokens:
            new_tokens = [ item for item in sentence if  item in Vocabulary ]
            terms_removed.append( new_tokens )
        self.terms_removed = terms_removed
        
    def Stemmer( self ):
        p_stemmer = PorterStemmer()
        sentences_final = []
        for sentence in self.terms_removed:
            sentence = [ p_stemmer.stem( item ) for item in sentence ]
            sentences_final.append( sentence )
        self.sentences_final = sentences_final
        
    def Lemmatizer( self ):
        lemmatizer = WordNetLemmatizer()
        sentences_final = []
        for sentence in self.terms_removed:
            sentence = [ lemmatizer.lemmatize( item ) for item in sentence ]
            sentences_final.append( sentence )
        self.sentences_final = sentences_final       

In [103]:
## Load TripAdvisor Reviews which includes Overall Rating, Aspect Rating and review contents

import csv

TripAdvisor = []
with open( 'tripadvisor_reviews.csv' , 'r' ) as csvfile:
    Reviewreader = csv.reader( csvfile, delimiter=',' )
    for row in Reviewreader:
        hotel = row[1].decode('utf-8')
        title = row[2].decode('utf-8')
        rating = float( row[5] ) 
        aspects = row[6].decode('utf-8')
        content = row[7].decode('utf-8')
        TripAdvisor.append( Review( hotel, title, rating, aspects, content ) )
    print 'Tripadvisor loaded.'

Tripadvisor loaded.


In [None]:
# Get Preprocessing for all the reviews
i = 0
for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].Sentences()
    TripAdvisor[ idx ].pre_process()
    i += 1
    if i%10000 == 0:
        print i, 'reviews have been processed.'

In [105]:
# Count the frequency of each token

from collections import Counter

Tokens = []
for review in TripAdvisor:
    for tokens in review.tokens:
        Tokens += tokens
        
TermList = Counter( Tokens )

In [106]:
# Get terms occuring less than 5 in the corpus
RemoveTermList = []
for token in Tokens:
    if TermList[ token ] < 5:
        RemoveTermList.append( token )

RemoveTermList = list( set( RemoveTermList ) )
Tokens = list( set( Tokens ) )

In [107]:
# Build the vocabulary for the corpus
Vocabulary = []
i = 0
print len( Tokens ), len( RemoveTermList )

Vocabulary =  list( set( Tokens ).difference( set( RemoveTermList ) ) )
print len( Vocabulary )

158722 118896
39826


In [None]:
# Remove terms appearing less than 5 times
i=0
for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].removeTerms( Vocabulary )
    i += 1
    if i%10000 == 0:
        print i

In [None]:
# Stemming each word to its root with Porter Stemmer
i = 0
for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].Stemmer()
    i += 1
    if i%10000 == 0:
        print i
        
# Preprocessing done

## Data Saving and Loading

In [112]:
# Save Variables into file
import pickle

with open( 'TripAdvisorReviews.pickle', 'w' ) as TripAdvisorfile:
    pickle.dump( [ TripAdvisor ] , TripAdvisorfile )

In [121]:
with open( 'Vocabulary.pickle', 'w' ) as Vfile:
    pickle.dump( [Vocabulary], Vfile )

In [37]:
# Load saved processed reviews and vocabulary
import pickle

with open( 'TripAdvisorReviews.pickle', 'r' ) as TripAdvisorfile:
    [ TripAdvisor ] = pickle.load( TripAdvisorfile )
    
with open( 'Vocabulary.pickle', 'r' ) as Vfile:
    [ Vocabulary ] = pickle.load( Vfile )

## Replace Stemming with Lemmatization

In [40]:
from nltk.stem import WordNetLemmatizer

def Lemmatizer( self ):
    lemmatizer = WordNetLemmatizer()
    sentences_final = []
    for sentence in self.terms_removed:
        sentence = [ lemmatizer.lemmatize( item ) for item in sentence ]
        sentences_final.append( sentence )
    return sentences_final

In [41]:
i=0
for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].final_sentences_final = Lemmatizer( TripAdvisor[ idx ] )
    i += 1
    if i%10000 == 0:
        print i

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000


In [1]:
import pickle

with open( 'Vocabulary.pickle', 'r' ) as Vfile:
    [ Vocabulary ] = pickle.load( Vfile )

In [5]:
TripSentences_final = [ review.sentences_final for review in TripAdvisor ]

with open( 'review_sentences.pickle', 'w') as sentence_file:
    pickle.dump( [ TripSentences_final ], sentence_file )

## Create Small Data Collection -- Sample 20000 reviews

In [57]:
import pickle

with open( 'ReviewToSentences.pickle', 'r' ) as sentencefile:
    [ ReviewToSentences ] = pickle.load( sentencefile )

In [31]:
import random

SampleSize = 2000

SampledIdx = random.sample( xrange( len( ReviewToSentences ) ) , SampleSize )
SampledSentences = []

for idx in SampledIdx:
    ReviewSentences = [ Sentences[i] for i in ReviewToSentences[ idx ] ]
    SampledSentences += ReviewSentences

In [47]:
with open( 'SampledData-2000.pickle', 'r' ) as savefile:
    SampledIdx, Sentences = pickle.load( savefile )

In [52]:
print Sentences[90]

[u'stay', u'sound', u'sleeper']


In [43]:
SampledSentences = []
for idx in SampledIdx:
    ReviewSentences = TripAdvisor[ idx ].final_sentences_final
    SampledSentences += ReviewSentences

In [46]:
with open( 'SampledData-2000.pickle', 'w' ) as savefile:
    pickle.dump( [ SampledIdx, SampledSentences ] , savefile )

## Aspect Segmentation -- Bootstrapping

In [53]:
# Load seed Topic List
import csv

To = {}
with open( 'hotel_bootstrapping.dat', 'r' ) as seedfile:
    Reader = csv.reader( seedfile, delimiter='\t' )
    for row in Reader:
        To[ row[0].decode('utf-8') ] = row[1:]
        print row[0], row[1:]

<value> ['value', 'price', 'quality', 'worth']
<room> ['room', 'suite', 'view', 'bed']
<location> ['location', 'traffic', 'minute', 'restaurant']
<cleanliness> ['clean', 'dirty', 'maintain', 'smell']
<front desk> ['stuff', 'check', 'help', 'reservation']
<service> ['service', 'food', 'breakfast', 'buffet']
<business service> ['business', 'center', 'computer', 'internet']


In [17]:
import pickle

with open( 'review_sentences.pickle', 'r' ) as sentence_file:
    [ TripSentences_final ] = pickle.load( sentence_file )

In [18]:
Sentences = []
ReviewToSentences = []

index = 0
for idx in range( len( TripSentences_final ) ):
    reviewTosentence = []
    for sentence in TripSentences_final[ idx ]:
        Sentences.append( sentence )
        reviewTosentence.append( index )
        index += 1
    ReviewToSentences.append( reviewTosentence )

In [19]:
import pickle

with open( 'ReviewToSentences.pickle', 'w' ) as sentencefile:
    pickle.dump( [ ReviewToSentences ] , sentencefile )

In [9]:
import pickle

with open( 'sentences.pickle', 'w' ) as sentencefile:
    pickle.dump( [ Sentences ] , sentencefile )

In [29]:
#with open( 'sentences.pickle', 'r' ) as sentence_file:
#    [ Sentences ] = pickle.load( sentence_file )

Sentences = SampledSentences
    
with open( 'Vocabulary.pickle', 'r' ) as Vfile:
    [ Vocabulary ] = pickle.load( Vfile )

p = 5  # selection threshold
max_iter = 10 # iteration step limit
tf_cut = 10 # Term frequency filtering

iter_num = 0 # Initialize iter_num and keyword list changed.
Flag = True

Progress = open('RunningProgress.csv', 'a')

ProWriter = csv.writer( Progress , delimiter=',' )

while iter_num < max_iter and Flag:
    Flag = False # set keyword list to unchanged
    iter_num += 1
    print 'Iteration', iter_num, 'begins:'
    
    # Match the aspect keywords in each sentence and record its matching
    Sentence_aspects = []
    for sentence in Sentences:
        aspectcount = {}
        aspects = [] # aspects of this sentence
        countMax = 0 # maximum count(i)
        for key in T.keys():
            count = 0
            for keyword in T[ key ]:
                count += sentence.count( keyword )
            aspectcount[ key ] = count
            if count > countMax:
                countMax = count
                    
        # Assign aspect i
        for key in T.keys():
            if aspectcount[ key ] == countMax:
                aspects.append( key )
            
        Sentence_aspects.append( aspects )
                   
    # sentence annotation with aspect assignment done
    print 'Aspect Annotation Done.'
    
    
    for key in T.keys():
        chi_list = []
        count = 0
        for j, word in enumerate( Vocabulary ):
            C_1, C_2, C_3, C_4, C = 0, 0, 0, 0, 0
            for idx,sentence in enumerate( Sentences ):
                C += sentence.count( word )
                if key in Sentence_aspects[ idx ]:
                    C_1 += sentence.count( word )
                    if not word in sentence:
                        C_3 += 1.0
                else:
                    C_2 += sentence.count( word )
                    if not word in sentence:
                        C_4 += 1.0
            if j%10000 == 0:
                print j
            # Calculate the chi for each word
            denominator = (C_1 + C_3) * (C_2 + C_4) * (C_1 + C_2) * (C_3 + C_4)
            nominator = C * ( C_1*C_4 - C_2*C_3 ) * ( C_1*C_4 - C_2*C_3 )
            if denominator > 0 and C_1 + C_2 > tf_cut:
                chi = nominator / float( denominator )
            else:
                chi = 0.0
            chi_list.append( ( word, chi ) )
            count += 1
            
            if count % 1000 == 0:
                print iter_num,  count, key
                
        Chi_sorted = sorted( chi_list , key=lambda tup: tup[1] , reverse = True )
    
        # Joint top p words into aspect keywords list
        len_original = len( T[ key ] )
        for idx in range( p ):
            T[ key ].append( Chi_sorted[ idx ][0] )
        T[ key ] = list( set( T[ key ] ) )
    
        # Test whether the aspect keyword list has changed.
        if len( T[ key ] ) > len_original:
            Flag = True
            
Progress.close()
            
with open('SentenceAspects_Tkey.pickle', 'w') as resultfile:
    pickle.dump( [ TripSentence_aspects, T ], resultfile )

Iteration 1 begins:
Aspect Annotation Done.
0
1 1000 <service>
1 2000 <service>
1 3000 <service>
1 4000 <service>
1 5000 <service>
1 6000 <service>
1 7000 <service>
1 8000 <service>
1 9000 <service>
1 10000 <service>
10000
1 11000 <service>
1 12000 <service>
1 13000 <service>
1 14000 <service>
1 15000 <service>
1 16000 <service>
1 17000 <service>
1 18000 <service>
1 19000 <service>
1 20000 <service>
20000
1 21000 <service>
1 22000 <service>
1 23000 <service>
1 24000 <service>
1 25000 <service>
1 26000 <service>
1 27000 <service>
1 28000 <service>
1 29000 <service>
1 30000 <service>
30000
1 31000 <service>
1 32000 <service>
1 33000 <service>
1 34000 <service>
1 35000 <service>
1 36000 <service>
1 37000 <service>
1 38000 <service>
1 39000 <service>
0
1 1000 <value>
1 2000 <value>
1 3000 <value>
1 4000 <value>
1 5000 <value>
1 6000 <value>


KeyboardInterrupt: 

## Load Sampled Data and Mapping from review to sentences

In [2]:
# Load saved processed reviews and vocabulary
import pickle

with open( 'TripAdvisorReviews.pickle', 'r' ) as TripAdvisorfile:
    [ TripAdvisor ] = pickle.load( TripAdvisorfile )
    
with open( 'Vocabulary.pickle', 'r' ) as Vfile:
    [ Vocabulary ] = pickle.load( Vfile )
    
with open( 'SampledData-2000.pickle', 'r' ) as savefile:
    SampledIdx, Sentences = pickle.load( savefile )
    
with open( 'ReviewToSentences.pickle', 'r' ) as sentencefile:
    [ ReviewToSentences ] = pickle.load( sentencefile )

In [3]:
## Directly Apply Keyword list from Hongning Wang's Paper
import csv

T = {}
with open( 'hotel_bootstrapping_May10.dat' , 'r') as KeywordFile:
    Reader = csv.reader( KeywordFile, delimiter=' ' )
    for row in Reader:
        T[ row[0].decode('utf-8') ] = row[1:]
        print row[0], row[1:]

﻿<value> ['ranges', 'half', 'accomodate', 'inclusive', 'deal', 'value', 'paying', 'worth', '!', 'extra', '$', 'pricy', 'overprice', '%', 'atmosphere', 'bargain', 'cost', 'charged', 'discount', 'fee', 'hotwire', 'price', 'vacation', 'tax', 'cash', 'quality', 'penny', 'low', 'cheap', 'bill', 'range', 'expectation', 'choice', 'usd', 'expensive', 'pesos', 'barter', 'priceline', 'honeymoon', 'paid', 'accomodation', 'rate', 'accommodate', 'money', 'pay', 'dollar', 'experience', 'resort', 'cheaper', 'definitely', 'anniversary', 'fraction', 'negotiate', 'dollars', 'star', 'quoted', 'rates', 'haggle', 'accommodation', 'prices', 'rating', 'taxes']
<room> ['door', 'carpet', 'house', 'furniture', 'bathroom', 'ready', 'comfortable', 'decor', 'furnish', 'open', 'hear', 'renovation', 'light', 'rooms', 'screen', 'mansion', 'condition', 'bathtub', 'bed', 'sink', 'windows', 'kitchen', 'quiet', 'conditioner', 'sleep', 'conditioning', 'double', 'bedroom', 'shampoo', 'size', 'upgrade', 'huge', 'beds', 'flo

In [4]:
## Aspect Annotation with Above Aspect Keyword list
Sentence_aspects = []
for idx, sentence in enumerate( Sentences ):
    aspectcount = {}
    aspects = [] # aspects of this sentence
    countMax = 0 # maximum count(i)
    for key in T.keys():
        count = 0
        for keyword in T[ key ]:
            count += sentence.count( keyword )
        aspectcount[ key ] = count
        if count > countMax:
            countMax = count
                    
    # Assign aspect i
    for key in T.keys():
        if aspectcount[ key ] == countMax:
            aspects.append( key )
            
    Sentence_aspects.append( aspects )
    if idx > 0 and idx%10000 == 0:
        print idx, 'sentence annotation done.'
                   
# sentence annotation with aspect assignment done
print 'Aspect Annotation Done.'

10000 sentence annotation done.
Aspect Annotation Done.


## Corpus Building

In [5]:
# Load Sentences_Aspects and Aspect keyword list
import pickle

# Use Keyword bootstrapped in own corpus
#with open( 'Result-2000-7Processes.pickle', 'r' ) as resultfile:
#    [ Sentence_aspects, T ] = pickle.load( resultfile )
    
# Check Aspect Keyword list
for key in T.keys():
    print key, T[ key ]
    

# Also extract ground truth aspect rating
SampledTripAdvisor = []
sentence_idx = 0
for j, idx in enumerate( SampledIdx ):
    temp = TripAdvisor[ idx ]
    
    sentences = []
    sentence_aspects = []
    for x in range( len( ReviewToSentences[ idx ] ) ):
        sentences.append( Sentences[ sentence_idx ] )
        sentence_aspects.append( Sentence_aspects[ sentence_idx ] )
        sentence_idx += 1
        
    temp.sentences_final = sentences
    temp.sentence_aspects = sentence_aspects
    
    if temp.aspects == 'NULL':
        temp.aspect_rating = {}
    else:
        temp.aspect_rating = {}
        aspectsList = temp.aspects.split('||||')
        for item in aspectsList:
            key, s_i = item.split('::')
            temp.aspect_rating[ key ] = float( s_i )
            
    SampledTripAdvisor.append( temp )
    if j>0 and j%1000 == 0:
        print j, 'reviews done.'

print len( SampledTripAdvisor )

<service> ['guide', 'highspeed', 'manager', 'bellman', 'e-mail', 'luggage', 'elevator', 'free', 'help', 'drink', 'shelf', 'lugged', 'smile', 'wine', 'serve', 'rude', 'continental', 'calls', 'wi-fi', 'router', 'reservation', 'pc', 'cafe', 'eat', 'bar', 'courteous', 'newspaper', 'buffets', 'computer', 'breakfast', 'carte', 'check-in', 'desk', 'laundry', 'computers', 'massage', 'wired', 'concierge', 'waiter', 'extremely', 'frontdesk', 'connectivity', 'front', 'friendly', 'access', 'greet', 'printer', 'route', 'club', 'broadband', 'polite', 'high-speed', 'hi-speed', 'agressive', 'buffet', 'pcs', 'request', 'lunch', 'gym', 'welcome', 'connection', 'fax', 'property', 'printing', 'emails', 'helpful', 'dinner', 'inform', 'stuff', 'email', 'food', 'gem', 'management', 'checkout', 'internet', 'modem', 'connect', 'fitness', 'garage', 'security', 'restaurant', 'drinks', 'immodium', 'network', 'staff', 'wireless', 'ethernet', 'facility', 'ate', 'lounge', 'lan', 'supply', 'printers', 'office', 'rece

## Test Match between Review content and Sentence & Aspects

In [6]:
print SampledTripAdvisor[1].content
print SampledTripAdvisor[1].sentences_final
print SampledTripAdvisor[1].sentence_aspects
print SampledTripAdvisor[1].aspect_rating

Upon arrival we were greeted by Val at the front desk...he gave us an amazing room with a view of the Times Square New Year's Eve crystal ball. Thanks, Val...your great!! Our stay went great...location was perfect...room was large and very nice. We would definitely stay here again! Thank you Hilton for an amazing New Year's vacation
[[u'great', u'year', u'eve', u'experience', u'arrival', u'greeted', u'val', u'front', u'desk', u'amazing', u'room', u'view', u'time', u'square', u'year', u'eve', u'crystal', u'ball'], [u'val', u'great'], [u'stay', u'great', u'location', u'perfect', u'room', u'large', u'nice'], [u'definitely', u'stay'], [u'hilton', u'amazing', u'year', u'vacation']]
[[u'<room>'], [u'<location>'], [u'<location>', u'<room>'], [u'<location>', u'\ufeff<value>', u'<room>'], [u'\ufeff<value>']]
{u'SleepQuality': 5.0, u'Service': 5.0, u'Cleanliness': 5.0, u'Value': 5.0, u'Location': 5.0, u'Rooms': 5.0}


## Create $W_{d}$ for each review d

In [8]:
import numpy as np

TripAdvisor = SampledTripAdvisor

count = 0
for idx in range( len( TripAdvisor ) ):
    wd = []
    for key in T.keys():
        wdi = []
        for word in Vocabulary:
            word_frequency = 0
            A_i_total_counts = 0
            # word frequency of w_j in text of A_i
            for sen_idx in range( len( TripAdvisor[ idx ].sentences_final ) ):
                if key in TripAdvisor[ idx ].sentence_aspects[ sen_idx ]:
                    word_frequency += TripAdvisor[ idx ].sentences_final[ sen_idx ].count( word )
                    A_i_total_counts += len( TripAdvisor[ idx ].sentences_final[ sen_idx ] )
            if A_i_total_counts == 0:
                wdi.append( 0.0 )
            else:
                wdi.append( word_frequency / float( A_i_total_counts ) )
        wd.append( wdi )
    TripAdvisor[ idx ].wd = np.array( wd ) 
    count += 1
    if count % 500 == 0:
        print count, 'reviews, transformed.'

500 reviews, transformed.
1000 reviews, transformed.
1500 reviews, transformed.
2000 reviews, transformed.


## E-step Updating
$$L(d) = (\hat{\alpha}_d-\mu)^\intercal\Sigma^{-1}(\hat{\alpha}_d-\mu) +
\frac{(r_d - \alpha_d^\intercal S_d)^2}{\delta^2} + \gamma \sum_{i=1}^k
\alpha_{di}(S_{di} - r_d )^2$$

$$\frac{\partial L(d)}{\partial \alpha_{di}} =
\frac{2(\alpha_d^\intercal S_d - r_d)}{\delta^2} \frac{\partial
\alpha_d^\intercal S_d}{\partial
\hat{\alpha}_{di}} + \gamma \frac{\partial \sum_{j=1}^k \alpha_{dj}(S_{dj} -
r_d)^2 }{\partial \hat{\alpha}_{di}} + \frac{\partial
(\hat{\alpha}_d-\mu)^\intercal\Sigma^{-1}(\hat{\alpha}_d-\mu)}{\partial
\hat{\alpha}_{di}}$$

$$\frac{\partial \alpha_d^T S_d}{\partial \hat{\alpha}_{di}} = \alpha_{di}
\sum_{j=1}^k \left[ \tau(j=i)S_{dj}(1-\alpha_{di}) - \tau(j\ne
i)S_{dj}\alpha_{dj} \right]$$

$$\frac{\partial \sum_{j=1}^k \alpha_{dj}(S_{dj} -
r_d)^2}{\partial \hat{\alpha}_{di}} = \alpha_{di} \sum_{j=1}^k \left[
\tau(j=i)(S_{dj}-r_d)^2(1-\alpha_{di}) - \tau(j\ne i)(S_{dj} - r_d)^2\alpha_{dj}
\right]$$

$$\frac{\partial
(\hat{\alpha}_d-\mu)^\intercal\Sigma^{-1}(\hat{\alpha}_d-\mu)}{\partial
\hat{\alpha}_{di}} = 2 (\hat{\alpha}_d - \mu) \Sigma^{-1} \cdot I \cdot
\frac{\partial \hat{\alpha}_d}{\partial \hat{\alpha}_{di}} =
2\sum_{j=1}^k \Sigma_{ji}^{-1}(\hat{\alpha}_{dj} - \mu_j)$$

In [34]:
import numpy as np
import math
from numpy.linalg import inv
from scipy.optimize import minimize

gamma = 0.5

# Infer S_d for each review
def Estep_sd( review, beta ):
    aspectLength = beta.shape[0]
    aspect_rating = []
    for rowid in range( aspectLength ):
        s_di = review.wd[ rowid ].dot( beta[ rowid ].T )
        aspect_rating.append( s_di )
    return np.array( [ aspect_rating ] ).T

# Function L( alphad_hat )
def L_alphad_hat( alphad_hat, mu, Sigma_inv, rating, delta_square, Sd, gamma ):
    term1 = ( alphad_hat - mu ).T.dot( Sigma_inv ).dot( alphad_hat - mu )[0,0]
    
    expsum = np.sum( np.exp( alphad_hat ) )
    alphad = np.exp( alphad_hat ) / expsum
    term2 = ( rating - alphad.T.dot( Sd ) )** 2 / delta_square
    
    term3 = gamma * alphad.dot( ( Sd - rating ) ** 2 )
    return term1 + term2 + term3

# Derivative of function L( alphad_hat )
def dLdaphad_hat( alphad_hat, mu, Sigma_inv, rating, delta_square, Sd, gamma ):
    expsum = np.sum( np.exp( alphad_hat ) )
    alphad = np.exp( alphad_hat ) / expsum
    
    aspectLength = mu.shape[0]
    
    derivative1 = np.zeros( ( aspectLength , aspectLength ) )
    derivative2 = np.zeros( ( aspectLength , aspectLength ) )
    for i in range( aspectLength ):
        indicator = np.zeros( ( aspectLength ,1 ) )
        indicator[ i, 0 ] = 1.0
        derivative1[ i, i ] = Sd.T.dot( -alphad + indicator )[0,0]
        derivative2[ i, i ] = ( ( Sd - rating ) ** 2 ).T.dot( -alphad + indicator )[0,0]
    
    term1 = 2 * ( alphad.T.dot( Sd ) - rating ) / delta_square * ( derivative1.dot( alphad ) )
    
    term2 = gamma * ( derivative2.dot( alphad ) )
    
    term3 = 2 * ( alphad_hat - mu ).T.dot( Sigma_inv ).T
    return term1 + term2 + term3

# Infer alphad based on LBFGS algorithm
def Estep_alphad( alphad_hat0, review, mu, Sigma, delta_square, gamma ):
    
    Sigma_inv = inv( Sigma )
    
    print 'Begin \hat{alpha}_d inference.'
    res = minimize( L_alphad_hat, alphad_hat0, 
                   args=( mu, Sigma_inv, review.rating, delta_square, review.Sd, gamma ),
                   method='bfgs', jac=dLdaphad_hat, tol= 1e-2, options={'maxiter':500,'disp':True} )
    print '\hat{alpha}_d inference done.'
    
    return res.x

## M-step Updating
$$
\mu_{(t+1)} = \underset{\mu}{\arg\min} \sum_{ d\in D} (\hat{\alpha}_d -
\mu)\Sigma^{-1}(\hat{\alpha}_d - \mu) = \frac{1}{|D|} \sum_{d \in D}
\hat{\alpha}_d
$$
$$
\begin{split}
\Sigma_{(t+1)} = & \underset{\Sigma}{\arg\min} \sum_{ d\in D}
\left[ (\hat{\alpha}_d - \mu)\Sigma^{-1}(\hat{\alpha}_d - \mu) + \log |\Sigma|
\right ] \\
= & \frac{1}{|D|} \sum_{d \in D} (\hat{\alpha}_d -
\mu_{(t+1)})^\intercal (\hat{\alpha}_d - \mu_{(t+1)})
\end{split}
$$

\begin{equation*}
L(D) = \sum_{d \in D} \left[ \log\delta^2 + \frac{(r_d -
\alpha_d^TS_d)^2}{\delta^2} + \gamma \sum_{i=1}^k \alpha_{di}(S_{di} - r_d )^2 \right] + \lambda \beta^\intercal
\beta
\end{equation*}

\begin{equation*}
\begin{split}
\frac{\partial L(\beta)}{\partial \beta_i} = & \sum_{d \in D} \left[
\frac{ 2( \alpha_d^\intercal S_d - r_d ) }{\delta^2} \frac{\partial
\alpha_d^\intercal S_d}{\partial \beta_i} + 2 \gamma \alpha_{di} (S_{di} -
r_d) \frac{\partial S_{di}}{\partial \beta_i} \right] + 2\lambda\beta_i \\
= & 2 \sum_{d \in D} \alpha_{di} \left[
\frac{ ( \alpha_d^\intercal S_d - r_d ) }{\delta^2} + \gamma (S_{di}
- r_d) \right] \frac{\partial S_{di}}{\partial \beta_i} + 2\lambda\beta_i
\end{split}
\end{equation*}

\begin{equation*}
\delta_{(t+1)}^2 = \underset{\delta}{\arg\min} \sum_{d \in D} \left[ \log
\delta^2 + \frac{(r_d - \alpha_d^\intercal S_d)^2}{\delta^2} \right] =
\frac{1}{|D|} \sum_{d \in D} (r_d - \alpha_d^\intercal S_d)^2
\end{equation*}

In [35]:
# Update mu
def Mstep_mu(  TripAdvisor ):
    mu = np.zeros_like( TripAdvisor[0].alphad )
    for review in TripAdvisor:
        mu += review.alphad_hat
    mu /= len( TripAdvisor )
    return mu

# Update Sigma
def Mstep_Sigma( TripAdvisor, mu ):
    aspectLength = TripAdvisor[0].alphad_hat.shape[0]
    Sigma = np.zeros( ( aspectLength, aspectLength ) )
    for review in TripAdvisor:
        Sigma += ( review.alphad_hat - mu ).T.dot( review.alphad_hat - mu )
    Sigma /= len( TripAdvisor )
    return Sigma

def Mstep_delta_square( TripAdvisor ):
    aspectLength = TripAdvisor[0].alphad_hat.shape[0]
    delta_square = np.zeros( ( aspectLength, aspectLength ) )
    for review in TripAdvisor:
        delta_square += ( review.rating - review.alphad.T.dot( review.Sd ) ) ** 2
    delta_square /= len( TripAdvisor )
    return delta_square

def L_beta( beta, TripAdvisor, delta_square, gamma, reg_lambda ):
    L = 0
    for review in TripAdvisor:
        L += ( review.rating - review.alphad.T.dot( review.Sd ) ) ** 2 / delta_square + gamma * ( review.alphad.T.dot( ( review.Sd - review.rating ) ** 2 ) )
    L += reg_lambda * np.trace( beta.T.dot( beta ) )
    return

def dLdbeta( beta, TripAdvisor, delta_square, gamma, reg_lambda ):
    Rows, Cols = TripAdvisor[0].wd.shape
    dldbeta = np.zeros( ( Rows, Cols ) )
    for review in TripAdvisor:
        M_alphad = np.diag( review.alphad.T[0] )
        dldbeta += np.diag( ( M_alphad.dot( ( review.alphad.T.dot( review.Sd ) - review.rating ) / delta_square
                                + gamma * ( review.Sd - review.rating ) ) ).T[0] ).dot( np.diag( review.Sd.T[0] ).dot( review.wd ) )
    dldbeta += reg_lambda * beta
    return dldbeta

def Mstep_beta( beta0, TripAdvisor, delta_square, gamma, reg_lambda ):
    
    print 'Begin beta inference.'
    res = minimize( L_beta, beta0, args=( TripAdvisor, delta_square, gamma, reg_lambda ),
                   method='bfgs', jac=dLdbeta, tol= 1e-2, options={'maxiter':5000,'disp':True} )
    print 'beta inference done.'
    
    return res.x

## Maximum Likelihood estimator
1. For each review d infer $s_d$ and $\alpha_d$ with current $\Theta_t$
2. Based on $r_d$ and $\alpha_d$ maximization, update $\Theta_{(t+1)}$

In [38]:
import numpy as np
from numpy.linalg import inv
from numpy.random import uniform

# Calculate likelihood value of the whole corpus
def L_D( TripAdvisor, Sigma, mu, delta_square, gamma, beta, reg_lambda ):
    L = 0
    for review in TripAdvisor:     
        term1 = np.log( np.linalg.det( Sigma ) ) 
        term2 = ( review.alphad_hat - mu ).T.dot( inv( Sigma ) ).dot( review.alphad_hat - mu )
        term3 = np.log( delta_square ) 
        term4 = ( review.rating - review.alphad.T.dot( review.Sd ) ) / delta_square 
        term5 = gamma * review.alphad.T.dot( ( review.Sd - review.rating )** 2 ) 
        L += ( term1 + term2 + term3 + term4 + term5 )
    L += reg_lambda * np.sum( beta ** 2 )
    return L

# Initialize iteration parameters
iter_num = 0
max_iter = 10
convergence = 1e-4
diff = 10

# Initialize corpus parameters
aspectLength = len( T.keys() )
vocaLength = len( Vocabulary )

Sigma = np.identity( aspectLength )
mu = 2 * uniform( size= aspectLength ).reshape( aspectLength, 1 ) - 1.0

delta_square = 1.0
gamma = 0.5

beta = []
for i in range( aspectLength ):
    beta.append( np.random.sample( vocaLength ) )
beta = np.array( beta )

reg_lambda = 2.0
L0 = 1.0


while ( diff > convergence and iter_num < max_iter ) or iter_num < min( 8, max_iter ):
    
    # E-step
    for idx in range( len( TripAdvisor ) ):
        TripAdvisor[ idx ].Sd = Estep_sd( TripAdvisor[ idx ] , beta )
        alphad_hat0 = np.zeros( ( aspectLength, 1 ) )
        TripAdvisor[ idx ].alphad_hat = Estep_alphad( alphad_hat0, TripAdvisor[ idx ],
                                                     mu, Sigma, delta_square, gamma )
        expSum = np.sum( np.exp( TripAdvisor[ idx ].alphad_hat ) )
        TripAdvisor[ idx ].alphad = np.exp( TripAdvisor[ idx ].alphad_hat ) / expSum
    
    # M-step
    mu = Mstep_mu(  TripAdvisor )
    
    # Avoid update Sigma too often
    if iter_num % 4 == 3:
        Sigma = Mstep_Sigma( TripAdvisor, mu )
        
    delta_square = Mstep_delta_square( TripAdvisor )
    beta = Mstep_beta( beta, TripAdvisor, delta_square, gamma, reg_lambda )
    
    L1 = L_D( TripAdvisor, Sigma, mu, delta_square, gamma, beta, reg_lambda )
    
    diff = ( L1 - L0 ) / float( L0 )
    
    L0 = L1
    
    iter_num += 1
    print 'Iteration:', iter_num, 'done.'

Begin \hat{alpha}_d inference.


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
import pickle

with open('Final_Results', 'w' ) as resultFile:
    pickle.dump( [ TripAdvisor, Sigma, mu, delta_square, gamma, beta, reg_lambda ] , resultFile )

## Reference
Wang, H., Lu, Y., and Zhai, C. (2010). Latent aspect rating analysis on review text data: a rating regression approach. In Proceedings of the 16th ACM SIGKDD international conference on Knowledge discovery and data mining, pages 783–792. ACM.