In [102]:
## Define Review Class
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import nltk.data

with open( 'stopwords.dat', 'r' ) as stopfile:
    en_stop = stopfile.readlines()
    en_stop = [ item.split('\n')[0] for item in en_stop if not item in ['\n'] ]

class Review:
    # Initialization
    def __init__(self, hotel='', title='', rating='', aspects='', content='' ):
        self.hotel = hotel
        self.title = title
        self.rating = rating
        self.aspects = aspects
        self.content = content
        
    def Sentences( self ):
        tokenizer = nltk.data.load( 'tokenizers/punkt/english.pickle' )
        self.sentences = tokenizer.tokenize( self.title.lower() + self.content.lower() )
        
    def pre_process( self ):
        tokenizer = RegexpTokenizer( r'\w+' )
        temp = []
        for sentence in self.sentences:
            tokens = tokenizer.tokenize( sentence )
            tokens_nostop = [ item for item in tokens if not unicode( item ) in en_stop ]
            temp.append( tokens_nostop )
        self.tokens = temp
        
    def removeTerms( self, Vocabulary ):
        terms_removed = []
        for sentence in self.tokens:
            new_tokens = [ item for item in sentence if  item in Vocabulary ]
            terms_removed.append( new_tokens )
        self.terms_removed = terms_removed
        
    def Stemmer( self ):
        p_stemmer = PorterStemmer()
        sentences_final = []
        for sentence in self.terms_removed:
            sentence = [ p_stemmer.stem( item ) for item in sentence ]
            sentences_final.append( sentence )
        self.sentences_final = sentences_final

In [103]:
## Load TripAdvisor Reviews which includes Overall Rating, Aspect Rating and review contents

import csv

TripAdvisor = []
with open( 'tripadvisor_reviews.csv' , 'r' ) as csvfile:
    Reviewreader = csv.reader( csvfile, delimiter=',' )
    for row in Reviewreader:
        hotel = row[1].decode('utf-8')
        title = row[2].decode('utf-8')
        rating = float( row[5] ) 
        aspects = row[6].decode('utf-8')
        content = row[7].decode('utf-8')
        TripAdvisor.append( Review( hotel, title, rating, aspects, content ) )
    print 'Tripadvisor loaded.'

Tripadvisor loaded.


In [104]:
# Get Preprocessing for all the reviews
i = 0
for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].Sentences()
    TripAdvisor[ idx ].pre_process()
    i += 1
    if i%10000 == 0:
        print i, 'reviews have been processed.'

10000 reviews have been processed.
20000 reviews have been processed.
30000 reviews have been processed.
40000 reviews have been processed.
50000 reviews have been processed.
60000 reviews have been processed.
70000 reviews have been processed.
80000 reviews have been processed.
90000 reviews have been processed.
100000 reviews have been processed.
110000 reviews have been processed.
120000 reviews have been processed.
130000 reviews have been processed.
140000 reviews have been processed.
150000 reviews have been processed.
160000 reviews have been processed.
170000 reviews have been processed.
180000 reviews have been processed.
190000 reviews have been processed.
200000 reviews have been processed.
210000 reviews have been processed.
220000 reviews have been processed.
230000 reviews have been processed.
240000 reviews have been processed.
250000 reviews have been processed.
260000 reviews have been processed.
270000 reviews have been processed.
280000 reviews have been processed.
2

In [105]:
# Count the frequency of each token

from collections import Counter

Tokens = []
for review in TripAdvisor:
    for tokens in review.tokens:
        Tokens += tokens
        
TermList = Counter( Tokens )

In [106]:
# Get terms occuring less than 5 in the corpus
RemoveTermList = []
for token in Tokens:
    if TermList[ token ] < 5:
        RemoveTermList.append( token )

RemoveTermList = list( set( RemoveTermList ) )
Tokens = list( set( Tokens ) )

In [107]:
# Build the vocabulary for the corpus
Vocabulary = []
i = 0
print len( Tokens ), len( RemoveTermList )

Vocabulary =  list( set( Tokens ).difference( set( RemoveTermList ) ) )
print len( Vocabulary )

158722 118896
39826


In [109]:
# Remove terms appearing less than 5 times
i=0
for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].removeTerms( Vocabulary )
    i += 1
    if i%10000 == 0:
        print i

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000


In [110]:
# Stemming each word to its root with Porter Stemmer
i = 0
for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].Stemmer()
    i += 1
    if i%10000 == 0:
        print i
        
# Preprocessing done

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000


In [112]:
# Save Variables into file
import pickle

with open( 'TripAdvisorReviews.pickle', 'w' ) as TripAdvisorfile:
    pickle.dump( [ TripAdvisor ] , TripAdvisorfile )

Aspect Segmentation -- Bootstrapping

In [113]:
# Load seed Topic List
T = {}
with open( 'hotel_bootstrapping.dat', 'r' ) as seedfile:
    Reader = csv.reader( seedfile, delimiter='\t' )
    for row in Reader:
        T[ row[0].decode('utf-8') ] = row[1:]
        print row[0], row[1:]

<value> ['value', 'price', 'quality', 'worth']
<room> ['room', 'suite', 'view', 'bed']
<location> ['location', 'traffic', 'minute', 'restaurant']
<cleanliness> ['clean', 'dirty', 'maintain', 'smell']
<front desk> ['stuff', 'check', 'help', 'reservation']
<service> ['service', 'food', 'breakfast', 'buffet']
<business service> ['business', 'center', 'computer', 'internet']


In [None]:
p = 5  # selection threshold
max_iter = 10 # iteration step limit


iter_num = 0 # Initialize iter_num and keyword list changed.
Flag = True

while iter_num < max_iter and Flag:
    Flag = False # set keyword list to unchanged
    
    # Match the aspect keywords in each sentence and record its matching
    for idx in range( len( TripAdvisor ) ):
        aspectCounts = [] # count(i) for each aspect
        sentence_aspects = []
        for sentence in TripAdvisor[ idx ].sentences_final:
            aspectcount = {}
            aspects = [] # aspects of this sentence
            countMax = 0 # maximum count(i)
            for key in T.keys():
                count = 0
                for keyword in T[ key ]:
                    count += sentence.count( keyword )
                aspectcount[ key ] = count
                if count > countMax:
                    countMax = count
            AspectCounts.append( aspectcount )
        
            for key in T.keys():
                if aspectcount[ key ] == countMax:
                    aspects.append( key )
            
            sentence_aspects.append( aspects )
                   
        TripAdvisor[ idx ].aspectCounts = aspectCounts
        TripAdvisor[ idx ].sentence_aspects = sentence_aspects
    # sentence annotation with aspect assignment done
    
    # Calculate chi^2 value for each word in Vocabulary under every aspect
    for key in T.keys():
        chi_list = []
        for word in Vocabulary:
            C_1, C_2, C_3, C_4, C = 0, 0, 0, 0, 0
            for review in TripAdvisor:
                for idx in range( len( review.sentences_final ) ):
                    C += review.sentences_final[idx].count( word )
                    if key in review.sentence_aspects[ idx ]:
                        C_1 += review.sentences_final[idx].count( word )
                        if not word in review.sentences_final[ idx ]:
                            C_3 += 1
                    else:
                        C_2 += review.sentences_final[idx].count( word )
                        if not word in review.sentences_final[ idx ]:
                            C_4 += 1
            denominator = (C_1 + C_3) * (C_2 + C_4) * (C_1 + C_2) * (C_3 + C_4)
            nominator = C * ( C_1*C_4 - C_2*C_3 ) * ( C_1*C_4 - C_2*C_3 )
            if denominator > 0:
                chi = nominator / float( denominator )
            else:
                chi = 0.0
            chi_list.append( ( word, chi ) )
        Chi_sorted = sorted( chi_list , key=lambda tup: tup[1] , reverse = True )
    
        # Joint top p words into aspect keywords list
        len_original = len( T[ key ] )
        for idx in range( p ):
            T[ key ].append( Chi_sorted[ idx ][0] )
        T[ key ] = list( set( T[ key ] ) )
    
        # Test whether the aspect keyword list has changed.
        if len( T[ key ] ) > len_original:
            Flag = True
    iter_num += 1
    print iter_num

## Create $W_{d}$ for each review d

In [None]:
import numpy as np

for idx in range( len( TripAdvisor ) ):
    TripAdvisor[ idx ].