## REVRANK: A Fully Unsupervised Algorithm for Selecting the Most Helpful Book Reviews
### by Oren Tsur and Ari Rappoport

In [81]:
import nltk
import math
import time
import itertools
import operator
import numpy as np
import pandas as pd
import NDCG as ndcg
from nltk.corpus import brown, stopwords
# Review table used by every cell below. The columns read in this notebook are
# asin, tot, reviewText, word_count and powerWithStar.
AllReviews = pd.read_csv('data/eletronic_sample_counts.csv.gz')
# A product is scored only when it has strictly more than this many reviews.
min_comments=30
# A product is visited only if at least one of its reviews has `tot` strictly
# above this value (presumably `tot` is the total vote count -- TODO confirm).
min_votes=5

### build Virtual Core Review

In [82]:
def virtual_core(reviewsDf, external_freq, c, m):
    """Build the "virtual core review" vocabulary for one set of reviews.

    Every text in ``reviewsDf.reviewText`` is tokenised with
    ``nltk.word_tokenize`` and the tokens are pooled into a single frequency
    distribution. Each token that survives the filters below is scored as

        dominance(w) = reviews_freq[w] * c * (1 / log2(external_freq[w]))

    A token is kept only when all of the following hold:
      * ``external_freq[w] > 0`` and ``log2(external_freq[w]) > 0``
        (i.e. the token occurs more than once in the reference counts),
      * its lower-cased form is not an NLTK English stopword,
      * it is longer than two characters.

    Parameters
    ----------
    reviewsDf : object exposing ``reviewText``, an iterable of review strings
        (the notebook passes a pandas DataFrame).
    external_freq : mapping from token to its count in a reference corpus.
        It is indexed with every review token, so it has to tolerate unknown
        tokens (the notebook passes an ``nltk.FreqDist``).
    c : numeric multiplier applied to every dominance score.
    m : maximum number of entries to return.

    Returns
    -------
    list of ``(token, dominance)`` tuples sorted by dominance, highest first,
    truncated to at most ``m`` entries.
    """
    # Loop-invariant: load the stopword list once and use a set so that each
    # membership test is O(1) instead of re-reading and scanning the list
    # for every vocabulary word.
    stop_words = set(stopwords.words('english'))

    tokens = itertools.chain.from_iterable(
        nltk.word_tokenize(text) for text in reviewsDf.reviewText
    )
    reviews_freq = nltk.FreqDist(tokens)
    # Plain-dict snapshot: iterated in the same way the original loop did.
    review_counts = dict(reviews_freq)

    dominance = {}
    for word, count in review_counts.items():
        reference_count = external_freq[word]
        if reference_count <= 0:
            continue
        log_reference = math.log(reference_count, 2)
        if log_reference <= 0:
            # A reference count of 1 gives log2 == 0 and would divide by zero.
            continue
        if word.lower() in stop_words or len(word) <= 2:
            continue
        dominance[word] = count * c * (1 / log_reference)

    return sorted(dominance.items(), key=operator.itemgetter(1), reverse=True)[:m]

### build Review Score

In [83]:
def review_score(text, core, mean, short_penalty=20):
    """Score one review by how much of it is made of core-vocabulary tokens.

    The score is ``(1 / p) * (core_hits / length)`` where ``core_hits`` is the
    number of tokens of ``text`` that appear in ``core`` (repeats counted),
    ``length`` is the total number of tokens, and ``p`` is ``short_penalty``
    when ``length < mean`` and 1 otherwise.

    Parameters
    ----------
    text : review text; tokenised with ``nltk.word_tokenize``.
    core : iterable of ``(token, weight)`` pairs, e.g. the list returned by
        ``virtual_core``. Only the tokens are used; the weights are ignored.
    mean : length threshold below which the review is penalised.
    short_penalty : divisor applied to reviews shorter than ``mean``.
        Defaults to 20, the value that was previously hard-coded.

    Returns
    -------
    float score; ``0.0`` when ``text`` produces no tokens (previously this
    case raised ``ZeroDivisionError``).
    """
    core_tokens = dict(core)

    # Tokenise once and reuse the result for both the counts and the length.
    tokens = nltk.word_tokenize(text)
    length = len(tokens)
    if length == 0:
        return 0.0

    review_vector = nltk.FreqDist(tokens)
    score = 0
    for word in review_vector:
        if word in core_tokens:
            score += review_vector[word]

    p = short_penalty if length < mean else 1

    return (float(1) / p) * (float(score) / length)

### run RevRank for all instances

In [84]:
def zeroFun(x):
    """Return 0 for any argument (kept for code that may still reference it)."""
    return 0


# Reference word counts taken from the "news" category of the Brown corpus.
brown_freq = nltk.FreqDist(brown.words(categories="news"))

# Every review starts at 0.0; rows that are never scored keep that value.
AllReviews['revRank'] = 0.0

# NOTE(review): the vote filter only decides WHICH products are visited.
# Inside the loop all reviews of the product are scored, including those with
# tot <= min_votes. Confirm that this is the intended selection.
grouped = AllReviews[AllReviews.tot > min_votes].groupby('asin')

total = len(grouped)
run = 0
performance = []

for name, _voted_rows in grouped:

    # A filtered view for reading only. Nothing is assigned to it, which is
    # what used to trigger pandas' SettingWithCopyWarning.
    ProductReview = AllReviews[AllReviews['asin'] == name]

    if len(ProductReview) <= min_comments:
        continue

    run += 1
    start = time.time()

    review_core = virtual_core(ProductReview, brown_freq, 3, 200)
    mean = ProductReview.word_count.mean()

    # Score every review of the product. The resulting Series carries the
    # rows' own index labels, so the write-back below lands on exactly the
    # rows that were scored (this relies on AllReviews having unique index
    # labels, as the default index produced by read_csv does).
    product_scores = ProductReview['reviewText'].apply(
        lambda text: review_score(text, review_core, mean)
    )
    AllReviews.loc[product_scores.index, 'revRank'] = product_scores

    elapsed = time.time() - start

    performance.append({
        'product': name,
        'tot_product': len(ProductReview),
        'time': elapsed,
    })

    print("Run %d size %d for %d" % (run, len(ProductReview), total))

# Optional: persist the per-product timings.
#pd.DataFrame(performance).to_csv('data/performance_revRank.csv.gz', compression='gzip')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Run 1 size 40 for 383
Run 2 size 33 for 383
Run 3 size 31 for 383
Run 4 size 32 for 383
Run 5 size 35 for 383
Run 6 size 35 for 383
Run 7 size 34 for 383
Run 8 size 45 for 383
Run 9 size 32 for 383
Run 10 size 39 for 383
Run 11 size 68 for 383
Run 12 size 58 for 383
Run 13 size 40 for 383
Run 14 size 40 for 383
Run 15 size 31 for 383
Run 16 size 71 for 383
Run 17 size 32 for 383
Run 18 size 36 for 383
Run 19 size 41 for 383
Run 20 size 63 for 383
Run 21 size 32 for 383
Run 22 size 57 for 383
Run 23 size 62 for 383
Run 24 size 70 for 383
Run 25 size 49 for 383
Run 26 size 41 for 383
Run 27 size 32 for 383
Run 28 size 32 for 383
Run 29 size 60 for 383
Run 30 size 78 for 383
Run 31 size 62 for 383
Run 32 size 45 for 383
Run 33 size 47 for 383
Run 34 size 50 for 383
Run 35 size 32 for 383
Run 36 size 40 for 383
Run 37 size 40 for 383
Run 38 size 37 for 383
Run 39 size 33 for 383
Run 40 size 39 for 383
Run 41 size 49 for 383
Run 42 size 54 for 383
Run 43 size 34 for 383
Run 44 size 34 for 3

In [85]:
#AllReviews.to_csv('data/eletronic_sample_counts.csv.gz', compression='gzip')

In [86]:
# Number of reviews with a positive score under each ranking column.
# Parenthesised print: same output on Python 2, and valid on Python 3.
print(len(AllReviews[AllReviews.revRank > 0]))
print(len(AllReviews[AllReviews.powerWithStar > 0]))

19680
19756


In [87]:
# Restrict the table to reviews whose revRank is strictly positive; the cells
# below evaluate both rankings on this reduced table.
has_rev_rank = AllReviews['revRank'] > 0
AllReviews = AllReviews.loc[has_rev_rank]
AllReviews.shape

(19680, 33)

In [88]:
# Mean NDCG of the revRank ordering. The third argument of calc_ndcg is the
# number shown in the printed label (presumably the NDCG cutoff k -- confirm
# in the NDCG module). After the loop ndcg_pr_hs_len holds the last result.
for top_k in (1, 3, 5):
    ndcg_pr_hs_len = ndcg.calc_ndcg(AllReviews, 'revRank', top_k)
    print("revRank ndcg " + str(top_k) + "=" + str(np.mean(ndcg_pr_hs_len)))
# Values recorded from an earlier run:
#revRank ndcg 1=0.914285714286
#revRank ndcg 3=0.896109540119
#revRank ndcg 5=0.862381460188

revRank ndcg 1=0.914285714286
revRank ndcg 3=0.896109540119
revRank ndcg 5=0.862381460188


In [89]:
# Mean NDCG of the powerWithStar ordering (labelled "MHR" in the output),
# computed with the same three calc_ndcg arguments as the revRank evaluation.
# After the loop ndcg_mhr holds the last result.
for top_k in (1, 3, 5):
    ndcg_mhr = ndcg.calc_ndcg(AllReviews, 'powerWithStar', top_k)
    print("MHR ndcg " + str(top_k) + "=" + str(np.mean(ndcg_mhr)))

MHR ndcg 1=0.751322751323
MHR ndcg 3=0.837819412086
MHR ndcg 5=0.82686792279
