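## An unsupervised approach to rank product reviews
### Jianwei Wu; Bing Xu; Sheng Li; 2011

For each product, review sentences are linked whenever they share a noun, verb or adjective. PageRank and HITS authority scores computed on that sentence graph are summed per review, and the running time is recorded next to the time taken by `mhr.executeFromDf`.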

In [7]:
import nltk
import pandas as pd
import numpy as np
import MHR as mhr
import networkx as nx
from nltk import word_tokenize

In [8]:
def zeroFun(x):
    """Return 0 whatever the input is.

    No longer needed to initialise the score columns below; kept so that any
    other cell still using it as an ``apply`` callback keeps working.
    """
    return 0


# Load the review sample; one row per review.
dfProducts = pd.read_csv('data/eletronic_sample_counts.csv.gz')

# Start both score columns at 0.0 with a scalar assignment. This avoids a
# Python-level row-by-row ``apply`` and gives the columns a float dtype from
# the start, which is what the sentence scores accumulated later require.
dfProducts['pageRank'] = 0.0
dfProducts['hits'] = 0.0
dfProducts.shape

(19756, 30)

In [9]:
def compute_tags(sentence):
    """Return the lower-cased nouns, verbs and adjectives of ``sentence``.

    The sentence is lower-cased, tokenised with ``nltk.word_tokenize`` and
    tagged with the universal tagset; tokens keep their original order and
    duplicates are preserved.
    """
    kept_tags = ['NOUN', 'VERB', 'ADJ']
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence.lower()), tagset="universal")
    return [word for word, tag in tagged_tokens if tag in kept_tags]

def compute_sentence(sentence1, sentence2):
    """Return True when the two token collections share at least one token."""
    shared_tokens = set(sentence1) & set(sentence2)
    return bool(shared_tokens)

def compute_sentences_graph(sentences):
    """Build the square adjacency matrix of the sentence graph.

    Entry ``[i, j]`` is 1.0 when ``compute_sentence`` reports that sentences
    ``i`` and ``j`` are linked and 0.0 otherwise. Every ordered pair is
    evaluated, including each sentence against itself.
    """
    size = len(sentences)
    adjacency = np.zeros((size, size))

    # np.ndindex walks the (row, col) pairs in row-major order.
    for i, j in np.ndindex(size, size):
        adjacency[i, j] = compute_sentence(sentences[i], sentences[j])

    return adjacency

In [10]:
import time

# Both thresholds are exclusive: a review needs MORE than `min_votes` in its
# 'tot' column, and a product needs MORE than `min_comments` such reviews.
min_comments=30
min_votes=5

grouped = dfProducts[dfProducts['tot'].astype(int) > min_votes].groupby('asin')
total = len(grouped)
run = 1
performance = []
for name, group in grouped:
    # `group` already holds exactly the rows of this product that passed the
    # vote filter, so its index identifies them in dfProducts without
    # rebuilding a boolean mask over the whole frame on every iteration.
    review_labels = group.index

    print("Run %d size %d for %d" % (run, len(review_labels), total))

    if len(review_labels) <= min_comments:
        continue

    start = time.time()
    print("computing")

    # Tokenise every review once, remembering how many sentences each review
    # contributed so the per-sentence scores can be mapped back afterwards.
    sentences = []
    sentences_per_review = []
    for label, text in zip(review_labels, group['reviewText']):
        review_sentences = nltk.sent_tokenize(text)
        sentences_per_review.append((label, len(review_sentences)))
        sentences.extend(compute_tags(s) for s in review_sentences)

    # NOTE(review): from_numpy_matrix / pagerank_numpy / hits_numpy were
    # removed in networkx 3.0, so this cell needs an older networkx --
    # confirm the pinned version before upgrading.
    sentences_graph = compute_sentences_graph(sentences)
    nx_graph = nx.from_numpy_matrix(sentences_graph)
    pg_sentences = nx.pagerank_numpy(nx_graph)
    hits_sentences = nx.hits_numpy(nx_graph)  # (hubs, authorities)

    # Graph nodes are numbered in the order the sentences were appended, so
    # each review owns a contiguous range of node ids. The review score is the
    # sum over its sentences. It is assigned (not accumulated) and addressed by
    # the row's real index label, so re-running the cell does not double the
    # scores and no dict-ordering assumption is involved.
    offset = 0
    for label, n_sentences in sentences_per_review:
        node_ids = range(offset, offset + n_sentences)
        dfProducts.loc[label, 'pageRank'] = sum(float(pg_sentences[node]) for node in node_ids)
        dfProducts.loc[label, 'hits'] = sum(float(hits_sentences[1][node]) for node in node_ids)  # authority score
        offset += n_sentences

    run += 1
    elapsed_phl = time.time() - start

    # Time the MHR run on the same rows, now carrying the updated scores.
    df = dfProducts.loc[review_labels]
    start = time.time()
    df, ndcg_mhr = mhr.executeFromDf(df)
    elapsed_mhr = time.time() - start

    performance.append({
        'product': name,
        'tot_comments': len(review_labels),
        'elapsed_phl': elapsed_phl,
        'elapsed_mhr': elapsed_mhr,
    })

pd.DataFrame(performance).to_csv('data/performance_PR_HS_LEN2.csv.gz', compression='gzip')

Run 1 size 40 for 383
computing
product=1400532655 ndcg=0.877750740057 (0.877750740057)
Run 2 size 33 for 383
computing
product=B00000J061 ndcg=0.91824167245 (0.91824167245)
Run 3 size 31 for 383
computing
product=B00001P4ZH ndcg=0.920164849108 (0.920164849108)
Run 4 size 32 for 383
computing
product=B00001WRSJ ndcg=0.827050241882 (0.827050241882)
Run 5 size 35 for 383
computing
product=B000031KIM ndcg=0.963799464675 (0.963799464675)
Run 6 size 35 for 383
computing
product=B00004SB92 ndcg=0.903093600759 (0.903093600759)
Run 7 size 34 for 383
computing
product=B00004THCZ ndcg=0.963300255778 (0.963300255778)
Run 8 size 45 for 383
computing


  hubs=dict(zip(G.nodes(),map(float,h)))
  authorities=dict(zip(G.nodes(),map(float,a)))


product=B00004XOM3 ndcg=0.951106619418 (0.951106619418)
Run 9 size 32 for 383
computing
product=B00004ZCJE ndcg=0.891576681671 (0.891576681671)
Run 10 size 39 for 383
computing
product=B000053HC5 ndcg=0.812907577829 (0.812907577829)
Run 11 size 68 for 383
computing
product=B000053HH5 ndcg=0.9906983077 (0.9906983077)
Run 12 size 58 for 383
computing
product=B00005LEN4 ndcg=0.990614357408 (0.990614357408)
Run 13 size 40 for 383
computing
product=B000062VUO ndcg=0.823525159716 (0.823525159716)
Run 14 size 40 for 383
computing
product=B00006B7DA ndcg=0.931199840467 (0.931199840467)
Run 15 size 31 for 383
computing
product=B00006HMPK ndcg=0.963623965675 (0.963623965675)
Run 16 size 71 for 383
computing
product=B00006I53S ndcg=0.969403110808 (0.969403110808)
Run 17 size 32 for 383
computing
product=B00006I53X ndcg=0.887928580418 (0.887928580418)
Run 18 size 36 for 383
computing
product=B00006IS4X ndcg=0.919748252254 (0.919748252254)
Run 19 size 41 for 383
computing
product=B00006RVPW ndcg=0.

In [11]:
# Show the per-product timing records collected in `performance` as a table.
pd.DataFrame(performance)

Unnamed: 0,elapsed_mhr,elapsed_phl,product,tot_comments
0,0.067545,30.825284,1400532655,40
1,0.043664,13.501017,B00000J061,33
2,0.117195,11.874941,B00001P4ZH,31
3,0.045773,11.753863,B00001WRSJ,32
4,0.052466,18.857997,B000031KIM,35
5,0.045874,14.917513,B00004SB92,35
6,0.131763,12.132542,B00004THCZ,34
7,0.136842,18.403767,B00004XOM3,45
8,0.098339,6.469578,B00004ZCJE,32
9,0.051245,17.789017,B000053HC5,39


In [12]:
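# Disabled on purpose: uncommenting the next line overwrites the input sample
# file that this notebook reads with the current contents of dfProducts.
#dfProducts.to_csv('data/eletronic_sample_counts.csv.gz', compression='gzip')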