## An unsupervised approach to rank product reviews
### Jianwei Wu ; Bing Xu ; Sheng Li ; 2011

In [1]:
import nltk
import pandas as pd
import numpy as np
import MHR as mhr
import networkx as nx

In [2]:
def zeroFun(x):
    """Return 0 regardless of the argument.

    Kept so that any other cell still calling it keeps working; the score
    columns below are now initialised by plain scalar assignment instead.
    """
    return 0

# Load the review sample and keep only the rows whose `overall` value is 3.
# `.copy()` detaches the filtered frame from the one returned by read_csv, so
# adding columns below does not raise SettingWithCopyWarning.
dfProducts = pd.read_csv('data/eletronic_sample_counts.csv.gz')
dfProducts = dfProducts[dfProducts.overall==3].copy()

# Per-review score accumulators. A scalar assignment broadcasts to every row
# (and also works when the filter leaves no rows, unlike a row-wise apply);
# they start as floats because float scores are added to them later on.
dfProducts['pageRank'] = 0.0
dfProducts['hits'] = 0.0
dfProducts.shape

(2075, 21)

In [3]:
# Universal-tagset part-of-speech tags treated as content words.
_CONTENT_TAGS = ('NOUN', 'VERB', 'ADJ')


def _content_words(sentence):
    """Return the set of content words found in `sentence`.

    The sentence is lower-cased, tokenized with `nltk.word_tokenize` and tagged
    with `nltk.pos_tag` (universal tagset); only tokens tagged NOUN, VERB or
    ADJ are kept.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence.lower()), tagset="universal")
    return {word for word, tag in tagged if tag in _CONTENT_TAGS}


def compute_sentence(sentence1, sentence2):
    """Tell whether two sentences share at least one content word.

    Parameters
    ----------
    sentence1, sentence2 : str
        The sentences to compare.

    Returns
    -------
    bool
        True when the sets of NOUN/VERB/ADJ tokens of both sentences intersect.
    """
    return not _content_words(sentence1).isdisjoint(_content_words(sentence2))


def compute_sentences_graph(sentences):
    """Build the adjacency matrix of the sentence-similarity graph.

    Parameters
    ----------
    sentences : sequence of str
        Sentences that become the nodes of the graph, in order.

    Returns
    -------
    numpy.ndarray
        Square float matrix of shape (len(sentences), len(sentences)) where
        entry [row, col] is 1.0 when the two sentences share a content word
        (same criterion as `compute_sentence`) and 0.0 otherwise. The diagonal
        is 1.0 for every sentence that has at least one content word.
    """
    # Tag every sentence exactly once. Calling compute_sentence for each pair
    # would re-tokenize and re-tag each sentence 2*n times, and POS tagging is
    # by far the most expensive step here.
    word_sets = [_content_words(sentence) for sentence in sentences]
    sentences_count = len(word_sets)
    matrix = np.zeros((sentences_count, sentences_count))

    # The relation is symmetric, so only the upper triangle is evaluated and
    # each result is mirrored.
    for row in range(sentences_count):
        for col in range(row, sentences_count):
            linked = not word_sets[row].isdisjoint(word_sets[col])
            matrix[row, col] = linked
            matrix[col, row] = linked

    return matrix

In [9]:
# Selection thresholds: a review is considered only when its `tot` value is
# greater than `min_votes`; a product is processed when it has exactly
# `min_comments` such reviews.
min_comments=10
min_votes=5

grouped=dfProducts[dfProducts['tot'].astype(int)>min_votes].groupby('asin')
for name, group in grouped:

    # `group` already holds exactly this product's rows that passed the vote
    # filter, so dfProducts does not have to be filtered again per product.
    comments_count = group['tot'].values

    # NOTE(review): the comparison is `==` although the constant is named
    # "min_comments"; `>=` may have been intended. Left unchanged so the same
    # product is selected as before - confirm.
    if len(comments_count) == min_comments:
        print(len(comments_count))

        # Split every review into sentences once, remembering for each sentence
        # the index label of the review (row of dfProducts) it came from.
        sentences = []
        sentence_owner = []
        for review_index, review_text in zip(group.index, group['reviewText']):
            for s in nltk.sent_tokenize(review_text):
                sentences.append(s)
                sentence_owner.append(review_index)

        # Node i of the graph is sentences[i].
        sentences_graph = compute_sentences_graph(sentences)
        nx_graph = nx.from_numpy_matrix(sentences_graph)
        pg_sentences = nx.pagerank_numpy(nx_graph)
        hits_sentences = nx.hits_numpy(nx_graph)
        authorities = hits_sentences[1]  # authority score

        # A review's score is the sum of the scores of its sentences. The row
        # is addressed by its own index label rather than by a positional
        # lookup into a dict's values, which depends on dict ordering.
        for node, review_index in enumerate(sentence_owner):
            dfProducts.loc[review_index,'pageRank'] += float(pg_sentences[node])
            dfProducts.loc[review_index,'hits'] += float(authorities[node])

        # Only the first matching product is processed.
        break

10


In [10]:
# Show the reviews that have a positive pageRank score.
dfProducts.loc[dfProducts['pageRank'] > 0]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,...,helpfulness,tot,word_count,sentence_count,unigram_count,pos_tag,adj,noun,pageRank,hits
2866,2866,146724,B0007Y79B2,"[16, 18]",3.0,"When I am walking to work, listening to music ...","10 18, 2005",A3F1VMAU8SBNYA,"Amazon Customer ""baroque recorderist""",Itchy and Scratchy,...,0.888889,18,190,13,126,<FreqDist with 12 samples and 223 outcomes>,17,43,0.302139,0.312322
2873,2873,146739,B0007Y79B2,"[5, 7]",3.0,Excellent functionality and features. Plays so...,"01 16, 2006",A3EEBE82C3HYRD,David,"Sweet player, but pricey and lacks FM tuner",...,0.714286,7,253,10,150,<FreqDist with 12 samples and 283 outcomes>,30,62,0.196139,0.175263
2880,2880,146751,B0007Y79B2,"[7, 16]",3.0,Being the only person in the family without an...,"10 5, 2005",A1X2XT1XWY7NED,G. L. Scott,Nano - 5 Stars Apple - minus 5 Stars,...,0.4375,16,326,9,178,<FreqDist with 11 samples and 330 outcomes>,24,84,0.269959,0.257108
2882,2882,146754,B0007Y79B2,"[3, 8]",3.0,I decided to stay away from iPods for a long t...,"12 30, 2005",A3BT3PUYG13M69,I. Smarius,not impressed,...,0.375,8,321,6,171,<FreqDist with 11 samples and 339 outcomes>,32,62,0.198491,0.206985
2893,2893,146773,B0007Y79B2,"[0, 11]",3.0,I bought the shuffle first and wrestled with t...,"01 14, 2006",A3G9BWSC4KOGFM,"Marsha L. Mcginnis ""Marsha""",Looks good . . . but?,...,0.0,11,178,14,122,<FreqDist with 11 samples and 199 outcomes>,16,42,0.237781,0.228338
2900,2900,146780,B0007Y79B2,"[10, 10]",3.0,"Please note that this is the old version (""Gen...","10 27, 2006",A34ZDM4OFYT4SP,Peter Headland,Warning - this is the old version,...,1.0,10,60,2,45,<FreqDist with 11 samples and 74 outcomes>,7,15,0.061766,0.065627
2901,2901,146781,B0007Y79B2,"[5, 10]",3.0,Take the very nice IPOD MINI add to it a weak ...,"10 31, 2005",A17BT73RVRQYSP,Peter Ingemi,over rated,...,0.5,10,97,1,58,<FreqDist with 10 samples and 100 outcomes>,9,23,0.038307,0.039915
2907,2907,146788,B0007Y79B2,"[9, 18]",3.0,The Nano definitely has plenty of cool factor....,"10 25, 2005",A2I89SHNF2WTHH,Shamanjunior,"Cool Design, Mediocre Sound, Scratches Easily",...,0.5,18,154,10,100,<FreqDist with 10 samples and 176 outcomes>,11,43,0.24037,0.241095
2911,2911,146794,B0007Y79B2,"[11, 11]",3.0,This is the first iPod of any kind I've ever o...,"12 8, 2005",A3DIXCPG86IQNN,Steve R,one annoying glitch,...,1.0,11,293,12,178,<FreqDist with 11 samples and 320 outcomes>,23,52,0.325944,0.346487
2912,2912,146795,B0007Y79B2,"[11, 22]",3.0,"Pros...1) Hands-down, the very best UI of ANY ...","03 15, 2006",A1WM8DNWJ8SCEL,the_emperor_of_ice_cream,Most IPOD Fans Do NOT Understand the Financial...,...,0.5,22,181,5,127,<FreqDist with 12 samples and 211 outcomes>,17,52,0.148654,0.126862
