In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time
from sklearn import svm
from sklearn import cross_validation

import MHR as mhr

def simple_spearman(x, y):
    """Return the absolute value of Spearman's rank correlation of x and y.

    Only the correlation coefficient (first element of the `spearmanr`
    result) is used; the sign is discarded, so a perfectly inverted
    ranking scores the same as a perfectly aligned one.
    """
    rho = spearmanr(x, y)[0]
    return np.abs(rho)
# Scorer object built from simple_spearman using make_scorer's default settings.
spearmanr_scorer = make_scorer(simple_spearman)

In [2]:

def dcg_at_k(r, k, method=0):
    """Compute discounted cumulative gain (DCG) over the first k results.

    Relevance is positive real values.  Binary relevance can be used
    as well.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain; 0. when no results remain after the
        cut-off.
    Raises:
        ValueError: if `method` is neither 0 nor 1 (only checked when at
            least one result is scored).
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype performs the same conversion on every NumPy version.
    r = np.asarray(r, dtype=float)[:k]
    if not r.size:
        return 0.
    if method == 0:
        # The first two positions are undiscounted (log2(2) == 1).
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    if method == 1:
        # Every position i (1-based) is discounted by log2(i + 1).
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    raise ValueError('method must be 0 or 1.')


def ndcg_at_k(r, k, method=0):
    """Compute normalized discounted cumulative gain (NDCG) at cut-off k.

    The DCG of the given ranking is divided by the DCG of the same
    relevance values sorted in descending order (the ideal ranking).
    Relevance is positive real values; binary relevance works too.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain; 0. when the ideal DCG is
        zero.
    """
    ideal_ranking = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_ranking, k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    # Nothing relevant within the cut-off: avoid dividing by zero.
    return 0.


def calc_ndcg(df, column, k, min_votes=5, min_comments=30):
    """Average per-product NDCG@k of the ranking induced by `column`.

    Rows are first restricted to those whose 'tot' value (cast to int) is
    strictly greater than `min_votes`, then grouped by 'asin'.  Groups with
    more than `min_comments` remaining rows are ranked by descending
    `column`, and the 'helpfulness' values in that order are scored with
    `ndcg_at_k`.  Ties in `column` keep their original row order.

    Args:
        df: DataFrame with columns 'asin', 'tot', 'helpfulness' and `column`.
        column: Name of the column holding the predicted scores.
        k: Cut-off passed to `ndcg_at_k`.
        min_votes: A row is kept only if int('tot') > min_votes.
        min_comments: A group is scored only if it has more than this many
            kept rows.
    Returns:
        Mean NDCG@k over the scored groups, or NaN when no group qualifies.
    """
    # Filter once; each group below is exactly the row set the per-product
    # mask over the full frame would select.
    voted = df[df['tot'].astype(int) > min_votes]

    ndcg_global = []
    for _, group in voted.groupby('asin'):
        if len(group) <= min_comments:
            continue
        # Plain arrays keep row order and work on Python 2 and 3 alike
        # (dict.values() is a non-indexable view on Python 3).
        helpfulness = group['helpfulness'].values
        scores = np.asarray(group[column].values, dtype=float)
        # Stable sort so tied scores are ranked deterministically.
        order = np.argsort(-scores, kind='mergesort')
        ndcg_global.append(ndcg_at_k(helpfulness[order], k))

    if not ndcg_global:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return np.nan
    return np.mean(ndcg_global)

In [3]:
# Read the review feature table from a gzip-compressed CSV; the path is
# relative to the notebook's working directory.
reviews_features = pd.read_csv('data/eletronic_sample_counts.csv.gz')
# Show (rows, columns) as the cell output.
reviews_features.shape

(19756, 19)

In [4]:
# List the column names of the loaded table as the cell output.
reviews_features.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'asin', u'helpful', u'overall',
       u'reviewText', u'reviewTime', u'reviewerID', u'reviewerName',
       u'summary', u'unixReviewTime', u'helpfulness', u'tot', u'word_count',
       u'sentence_count', u'unigram_count', u'pos_tag', u'adj', u'noun'],
      dtype='object')

In [5]:
# Names of the columns whose dtype is float64 / int / int64.
df_columns = reviews_features.select_dtypes(include=['float64','int','int64']).columns
# Display the selected column names as the cell output.
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'overall', u'unixReviewTime',
       u'helpfulness', u'tot', u'word_count', u'sentence_count',
       u'unigram_count', u'adj', u'noun'],
      dtype='object')

In [6]:
# Columns used as SVR inputs; defined once so train and test cannot drift apart.
feature_columns = ['word_count', 'sentence_count', 'unigram_count', 'adj', 'noun']

clf = svm.SVR(epsilon=0.001)

# 10 random 60/40 train/test splits with a fixed seed.
rs = cross_validation.ShuffleSplit(len(reviews_features), n_iter=10, test_size=.4, random_state=0)
ndcg = []
for ind, (train_index, test_index) in enumerate(rs):
    # ShuffleSplit yields positional indices, so select rows with .iloc
    # (.ix is label-based for integer indexes and no longer exists in
    # current pandas).
    train_rows = reviews_features.iloc[train_index]
    features_train = train_rows[feature_columns].values
    labels_train = train_rows["helpfulness"].values
    clf.fit(features_train, labels_train)

    # Explicit copy: a column is added below and must not touch the source frame.
    dfTest = reviews_features.iloc[test_index].copy()
    features_test = dfTest[feature_columns].values
    dfTest['svm'] = clf.predict(features_test)

    # NDCG@5 of the ranking induced by the SVR predictions on this split.
    local_ndcg = calc_ndcg(dfTest, 'svm', 5)
    ndcg.append(local_ndcg)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("ndcg at fold "+str(ind)+" = "+ str(local_ndcg))


11853
fold=0 ndcg= 0.616566953143
11853
fold=1 ndcg= 0.644001272978
11853
fold=2 ndcg= 0.607666903717
11853
fold=3 ndcg= 0.655057569423
11853
fold=4 ndcg= 0.640695607425
11853
fold=5 ndcg= 0.620720575563
11853
fold=6 ndcg= 0.616819750379
11853
fold=7 ndcg= 0.596320987697
11853
fold=8 ndcg= 0.616838786607
11853
fold=9 ndcg= 0.616806797582


In [7]:
# Index of the split on which the SVR ranking scored the highest NDCG.
best_fold = int(np.argmax(ndcg))
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print("fold " + str(best_fold))
# Re-iterate the splitter to recover that split's test indices; this relies
# on `rs` having been built with a fixed random_state so the splits repeat.
for count, (train_index, test_index) in enumerate(rs):
    if count != best_fold:
        continue
    # Positional selection: the splitter yields row positions, not labels.
    best_test = reviews_features.iloc[test_index]
    print(len(best_test))
    # NOTE(review): the next cell reads a 'powerWithStar' column from `x`,
    # so executeFromDf presumably returns a DataFrame containing it; confirm.
    x = mhr.executeFromDf(best_test)
    break

fold 3
7903
0.569633838764
0.580706391559
0.637820846157
0.585474216706
0.627686360885
0.639304985259
0.633587411191
0.609644088007
0.674305980195
0.637374826324
0.573731185047
0.650075542027
0.629496477873
0.645655689558
0.596176575181
0.663012699066
0.64109822936
0.66239608391
0.62513974803
0.651172930406
0.592031922582
0.623093598298
0.570452588532
0.594575815155
0.63396975641
0.608747437436
0.707698419793
0.665680049833
0.667800689714
0.665753558631
0.733485996927
0.691667472017
0.6834898588
0.70084242092
0.659601583238
0.613108654619
0.690308456415
0.665002307197
0.658786658276
0.659846927228
0.619205389728


In [8]:
# NDCG@5 of the ranking induced by the 'powerWithStar' column of `x`.
mhr_ndcg = calc_ndcg(x, 'powerWithStar', 5)
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print("MHR ndcg=" + str(mhr_ndcg))

MHR ndcg=0.680939584302
