In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time
from sklearn import svm
from sklearn import cross_validation

import MHR as mhr

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)



In [12]:
def reload_package(root_module):
    package_name = root_module.__name__

    # get a reference to each loaded module
    loaded_package_modules = dict([
        (key, value) for key, value in sys.modules.items() 
        if key.startswith(package_name) and isinstance(value, types.ModuleType)])

    # delete references to these loaded modules from sys.modules
    for key in loaded_package_modules:
        del sys.modules[key]

    # load each of the modules again; 
    # make old modules share state with new modules
    for key in loaded_package_modules:
        print 'loading %s' % key
        newmodule = __import__(key)
        oldmodule = loaded_package_modules[key]
        oldmodule.__dict__.clear()
        oldmodule.__dict__.update(newmodule.__dict__)
        
def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


def calc_ndcg(df, column,k):
    min_votes=5
    min_comments=30

    ndcg_global=[]
    grouped=df.groupby('asin')

    for name, group in grouped:
        dffiltro = (df['asin']==name)
        
        values_test = df[dffiltro]['helpfulness'].values
        scores = df[dffiltro][column].values


        ind = (-np.array(scores)).argsort()
        a = np.array(values_test)[ind]	
        ndcg = ndcg_at_k(a, k)
        ndcg_global.append(ndcg)
    return ndcg_global

def calc_ndcg_mean(df, column,k):
    x = calc_ndcg(df,column,k)
    return np.mean(x)

def calc_corr(df, column):
    correlation=[]
    grouped=df.groupby('asin')

    for name, group in grouped:
        dffiltro = (df['asin']==name)
        
        helpfulness = df[dffiltro]['helpfulness'].values
        scores = df[dffiltro][column].values
        correlation.append(np.corrcoef(helpfulness,scores)[0][1])
    return correlation

In [3]:
reviews_features = pd.read_csv('data/eletronic_sample_counts.csv.gz')
reviews_features.shape

  interactivity=interactivity, compiler=compiler, result=result)


(40870, 34)

In [6]:
reviews_features.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'Unnamed: 0.1', u'Unnamed: 0.1.1',
       u'Unnamed: 0.1.1', u'MHRs', u'Unnamed: 0.1.1.1', u'Unnamed: 0.1.1.1',
       u'Unnamed: 0.1.1.1.1', u'adj', u'asin', u'helpful', u'helpfulness',
       u'hits', u'noun', u'overall', u'pageRank', u'pos_tag', u'powerWithStar',
       u'reviewText', u'reviewTime', u'reviewerID', u'reviewerName',
       u'sentence_count', u'summary', u'tot', u'unigram_count',
       u'unixReviewTime', u'word_count', u'pr_hs', u'pr_len', u'hs_len',
       u'pr_hs_len', u'revRank'],
      dtype='object')

In [7]:
df_columns = reviews_features.select_dtypes(include=['float64','int','int64']).columns
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'Unnamed: 0.1', u'Unnamed: 0.1.1',
       u'Unnamed: 0.1.1', u'MHRs', u'Unnamed: 0.1.1.1', u'Unnamed: 0.1.1.1',
       u'Unnamed: 0.1.1.1.1', u'adj', u'helpfulness', u'hits', u'noun',
       u'overall', u'pageRank', u'powerWithStar', u'sentence_count', u'tot',
       u'unigram_count', u'unixReviewTime', u'word_count', u'pr_hs', u'pr_len',
       u'hs_len', u'pr_hs_len', u'revRank'],
      dtype='object')

In [9]:
reviews_features[['word_count','sentence_count','unigram_count','adj','noun','helpfulness']].head()

Unnamed: 0,word_count,sentence_count,unigram_count,adj,noun,helpfulness
0,90,2,71,8,20,0.045455
1,604,26,292,53,128,0.733333
2,582,16,273,64,136,1.0
3,318,14,183,22,78,0.6
4,36,2,32,3,8,0.3125


In [14]:
len(products)


383

In [59]:
#CROSVALIDATION BY COMMENTS
clf = svm.SVR()
max_ndcg=0
bestSVMPrediction=pd.DataFrame()

rs = cross_validation.ShuffleSplit(len(reviews_features), n_iter=10, test_size=.1 , random_state=0)
ind=0
ndcg=[]
for train_index, test_index in rs:
    #train
    features_train = reviews_features.iloc[train_index][list(['word_count','sentence_count','unigram_count','adj','noun'])].values
    labels_train = reviews_features.iloc[train_index]["helpfulness"].values
    clf.fit(features_train, labels_train)
    
    #test
    features_test = reviews_features.iloc[test_index][list(['word_count','sentence_count','unigram_count','adj','noun'])].values
    labels_test = reviews_features.iloc[test_index]["helpfulness"].values
    x=clf.predict(features_test)
    
    dfTest= reviews_features.iloc[test_index]
    dfTest['svm']=x
    local_ndcg = calc_ndcg_mean(dfTest,'svm',5)
    ndcg.append(local_ndcg)
    print "ndcg at fold "+str(ind)+" = "+ str(local_ndcg)
    
    ind=ind+1

    if (local_ndcg>max_ndcg):
        bestSVMPrediction=dfTest
    

#save bestSVMPrediction on csv
bestSVMPrediction.to_csv('data/best_prediction_svm.csv')

ndcg at fold 0 = 0.913655280929
ndcg at fold 1 = 0.912583503636
ndcg at fold 2 = 0.923522562114
ndcg at fold 3 = 0.905105137255
ndcg at fold 4 = 0.91319186602
ndcg at fold 5 = 0.921244709246
ndcg at fold 6 = 0.915009124149
ndcg at fold 7 = 0.906041944008
ndcg at fold 8 = 0.923186490416
ndcg at fold 9 = 0.915358528432


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [60]:
print "SVM ndcg@1="+str(calc_ndcg_mean(bestSVMPrediction,'svm',1))
print "SVM ndcg@3="+str(calc_ndcg_mean(bestSVMPrediction,'svm',3))
print "SVM ndcg@5="+str(calc_ndcg_mean(bestSVMPrediction,'svm',5))

SVM ndcg@1=0.764161004389
SVM ndcg@3=0.873865776305
SVM ndcg@5=0.915358528432
