## REVRANK: A Fully Unsupervised Algorithm for Selecting the Most Helpful Book Reviews
### by Oren Tsur and Ari Rappoport

In [1]:
import nltk
import math
import time
import itertools
import operator
import pandas as pd
from nltk.corpus import brown, stopwords
AllReviews = pd.read_csv('data/eletronic_sample_counts.csv.gz')
min_comments=30
min_votes=5

### build Virtual Core Review

In [2]:
def virtual_core(reviewsDf, external_freq, c, m):
    
    texts = [nltk.word_tokenize(text) for text in reviewsDf.reviewText]
    tokens = list(itertools.chain(*texts))
    reviews_freq = nltk.FreqDist(tokens)

    dominance = dict([])
    
    for word in dict(reviews_freq):
        if external_freq[word] > 0 and math.log(external_freq[word],2) > 0 and (word.lower() not in stopwords.words('english')) and (len(word) > 2):
            dominance[word] = reviews_freq[word] * c * ( 1 / math.log(external_freq[word],2) )
    
    
    return sorted(dominance.items(), key=operator.itemgetter(1), reverse=True)[:m]

In [3]:
brown_freq = nltk.FreqDist(brown.words(categories="news"))
grouped = AllReviews[AllReviews['tot'].astype(int)>min_votes].groupby('asin')

# evaluate virtual core function
for name, group in grouped:
    
    ProductReview = AllReviews[AllReviews['asin']==name]    
    review_core = virtual_core(ProductReview, brown_freq, 3, 200)
    
    break

### build Review Score

In [4]:
def review_score(text, core, mean):
    core_dict = dict(core)
    review_vector = nltk.FreqDist(nltk.word_tokenize(text))
    length = len(nltk.word_tokenize(text))
    score = 0

    for word in review_vector:
        if word in core_dict:
            score += review_vector[word]

    p = 1
    if length < mean:
        p = 20
            
    return (float(1) / p) * (float(score) / length)

In [5]:
# evaluate review_score function
mean = ProductReview.word_count.mean()
review_s = review_score(ProductReview.reviewText[0], review_core, mean)
review_s

0.005445544554455446

### run RevRank for all instances

In [6]:
def zeroFun(x): return 0

total = len(grouped)
run = 1
performance=[]

for name, group in grouped:
    
    ProductReview = AllReviews[AllReviews['asin']==name] 
    
    if ( len(ProductReview)>min_comments ):
    
        ProductReview['revRank'] = ProductReview.apply(zeroFun,axis=1)

        start = time.time()
        
        review_core = virtual_core(ProductReview, brown_freq, 3, 200)
        mean = ProductReview.word_count.mean()

        for t in ProductReview.T.to_dict().values():
            review_s = review_score(t['reviewText'], review_core, mean)
            AllReviews.loc[t.values()[0],'revRank'] = review_s

        end = time.time()
        elapsed = (end - start)
            
        tempo={}
        tempo['product']=name
        tempo['tot_product']= len(ProductReview)
        tempo['time']=elapsed
        performance.append(tempo)
            
        print("Run %d size %d for %d" % (run, len(ProductReview), total))
            
        #break

pd.DataFrame(performance).to_csv('data/performance_revRank.csv.gz', compression='gzip')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Run 1 size 40 for 383
Run 1 size 33 for 383
Run 1 size 31 for 383
Run 1 size 32 for 383
Run 1 size 35 for 383
Run 1 size 35 for 383
Run 1 size 34 for 383
Run 1 size 45 for 383
Run 1 size 32 for 383
Run 1 size 39 for 383
Run 1 size 68 for 383
Run 1 size 58 for 383
Run 1 size 40 for 383
Run 1 size 40 for 383
Run 1 size 31 for 383
Run 1 size 71 for 383
Run 1 size 32 for 383
Run 1 size 36 for 383
Run 1 size 41 for 383
Run 1 size 63 for 383
Run 1 size 32 for 383
Run 1 size 57 for 383
Run 1 size 62 for 383
Run 1 size 70 for 383
Run 1 size 49 for 383
Run 1 size 41 for 383
Run 1 size 32 for 383
Run 1 size 32 for 383
Run 1 size 60 for 383
Run 1 size 78 for 383
Run 1 size 62 for 383
Run 1 size 45 for 383
Run 1 size 47 for 383
Run 1 size 50 for 383
Run 1 size 32 for 383
Run 1 size 40 for 383
Run 1 size 40 for 383
Run 1 size 37 for 383
Run 1 size 33 for 383
Run 1 size 39 for 383
Run 1 size 49 for 383
Run 1 size 54 for 383
Run 1 size 34 for 383
Run 1 size 34 for 383
Run 1 size 50 for 383
Run 1 size

In [8]:
#AllReviews.to_csv('data/eletronic_sample_counts.csv.gz', compression='gzip')