## An unsupervised approach to rank product reviews
### Jianwei Wu; Bing Xu; Sheng Li; 2011

In [1]:
import nltk
import pandas as pd
import numpy as np
import MHR as mhr
import networkx as nx
from nltk import word_tokenize

In [2]:
def zeroFun(x):
    """Return 0 whatever ``x`` is.

    No longer used in this cell; kept in case code outside this view still
    passes it to ``apply``.
    """
    return 0


# Load the review sample.
dfProducts = pd.read_csv('data/eletronic_sample_counts.csv.gz')

# Per-review score accumulators. A scalar assignment broadcasts to every row,
# so there is no need for a row-wise ``apply`` (one Python call per row).
# They are created as floats because the scoring cell adds float
# PageRank / HITS values into them.
dfProducts['pageRank'] = 0.0
dfProducts['hits'] = 0.0
dfProducts.shape

(19756, 30)

In [3]:
def compute_tags(sentence):
    """Return the noun, verb and adjective tokens of ``sentence``.

    The sentence is lower-cased, split with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag`` using the "universal" tagset. Tokens tagged NOUN,
    VERB or ADJ are returned in their original order (duplicates included);
    every other token is dropped.
    """
    kept_classes = ['NOUN', 'VERB', 'ADJ']
    words = nltk.word_tokenize(sentence.lower())
    tagged_words = nltk.pos_tag(words, tagset="universal")
    return [word for word, word_class in tagged_words if word_class in kept_classes]

def compute_sentence(sentence1, sentence2):
    """Return True when the two token collections share at least one token.

    Both arguments are iterables of hashable tokens. Two empty collections
    (or any pair with nothing in common) give False.
    """
    first_tokens = set(sentence1)
    return not first_tokens.isdisjoint(sentence2)

def compute_sentences_graph(sentences):
    """Build the adjacency matrix of the sentence-overlap graph.

    ``sentences`` is a sequence of token lists. Entry ``[i, j]`` of the
    returned square float matrix is 1.0 when ``compute_sentence`` reports that
    sentences ``i`` and ``j`` share a token, and 0.0 otherwise. The diagonal is
    filled the same way, so a non-empty sentence is linked to itself.

    The overlap test does not depend on argument order, so only the upper
    triangle is evaluated and each result is mirrored. This yields the same
    matrix as testing every ordered pair with roughly half the calls.
    """
    sentences_count = len(sentences)
    matrix = np.zeros((sentences_count, sentences_count))

    for row in range(sentences_count):
        for col in range(row, sentences_count):
            linked = compute_sentence(sentences[row], sentences[col])
            matrix[row, col] = linked
            matrix[col, row] = linked

    return matrix

In [6]:
import time

# Score every product's reviews with sentence-level PageRank and HITS.
#
# Only reviews whose 'tot' value exceeds `min_votes` are considered, and a
# product is scored only when it has more than `min_comments` such reviews.
# For a scored product, every sentence of every review becomes a graph node;
# each node's PageRank and HITS authority score is added to the 'pageRank' /
# 'hits' cell of the review the sentence came from.
min_comments=30
min_votes=5

grouped=dfProducts[dfProducts['tot'].astype(int)>min_votes].groupby('asin')
total = len(grouped)
run = 1
# One timing record per product, in processing order.
performance=[]
for name, group in grouped:
    start = time.time()

    # `group` already holds exactly this product's reviews that passed the
    # vote filter, with their original row labels, so the full frame does not
    # need to be filtered again.
    comments_count = len(group)

    print("Run %d size %d for %d" % (run, comments_count, total))

    if comments_count > min_comments:
        print("computing")

        # Tokenize each review once, remembering which row every sentence
        # belongs to so the scores can be routed back afterwards.
        # NOTE(review): a missing reviewText (NaN) would make sent_tokenize
        # fail; the data is assumed to contain text for every kept review.
        sentences=[]
        owners=[]
        for label, text in zip(group.index, group['reviewText']):
            for s in nltk.sent_tokenize(text):
                sentences.append(compute_tags(s))
                owners.append(label)

        sentences_graph = compute_sentences_graph(sentences)
        # NOTE(review): from_numpy_matrix / pagerank_numpy / hits_numpy belong
        # to the older networkx API this notebook was written against; they
        # are not available in networkx 3.x.
        nx_graph = nx.from_numpy_matrix(sentences_graph)
        pg_sentences = nx.pagerank_numpy(nx_graph)
        hits_sentences = nx.hits_numpy(nx_graph)
        authorities = hits_sentences[1] #authority score

        # Node i of the graph is sentence i; credit its scores to the review
        # row it came from, addressed by the row's own index label.
        for node, label in enumerate(owners):
            dfProducts.loc[label,'pageRank'] += float(pg_sentences[node])
            dfProducts.loc[label,'hits'] += float(authorities[node])

    run += 1
    end = time.time()
    elapsed = end - start
    performance.append({'product': name,
                        'tot_comments': comments_count,
                        'elapsed': elapsed})
    # Rewritten after every product so the timings gathered so far survive an
    # interrupted run.
    pd.DataFrame(performance, columns=['product', 'tot_comments', 'elapsed']).to_csv('data/performance_PR_HS_LEN.csv.gz', compression='gzip')

Run 1 size 40 for 383
computing
Run 2 size 33 for 383
computing
Run 3 size 31 for 383
computing
Run 4 size 32 for 383
computing
Run 5 size 35 for 383
computing
Run 6 size 35 for 383
computing
Run 7 size 34 for 383
computing
Run 8 size 45 for 383
computing
Run 9 size 32 for 383
computing
Run 10 size 39 for 383
computing
Run 11 size 68 for 383
computing
Run 12 size 58 for 383
computing
Run 13 size 40 for 383
computing
Run 14 size 40 for 383
computing
Run 15 size 31 for 383
computing
Run 16 size 71 for 383
computing
Run 17 size 32 for 383
computing
Run 18 size 36 for 383
computing
Run 19 size 41 for 383
computing
Run 20 size 63 for 383
computing
Run 21 size 32 for 383
computing
Run 22 size 57 for 383
computing
Run 23 size 62 for 383
computing
Run 24 size 70 for 383
computing
Run 25 size 49 for 383
computing
Run 26 size 41 for 383
computing
Run 27 size 32 for 383
computing
Run 28 size 32 for 383
computing
Run 29 size 60 for 383
computing
Run 30 size 78 for 383
computing
Run 31 size 62 for 

In [None]:
#dfProducts.to_csv('data/eletronic_sample_counts.csv.gz', compression='gzip')