In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time
from sklearn import svm
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer

import MHR as mhr

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [2]:
def reload_package(root_module):
    package_name = root_module.__name__

    # get a reference to each loaded module
    loaded_package_modules = dict([
        (key, value) for key, value in sys.modules.items() 
        if key.startswith(package_name) and isinstance(value, types.ModuleType)])

    # delete references to these loaded modules from sys.modules
    for key in loaded_package_modules:
        del sys.modules[key]

    # load each of the modules again; 
    # make old modules share state with new modules
    for key in loaded_package_modules:
        print 'loading %s' % key
        newmodule = __import__(key)
        oldmodule = loaded_package_modules[key]
        oldmodule.__dict__.clear()
        oldmodule.__dict__.update(newmodule.__dict__)
        
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


def calc_ndcg(df, column,k):
    min_votes=5
    min_comments=30

    ndcg_global=[]
    grouped=df.groupby('asin')

    for name, group in grouped:
        dffiltro = (df['asin']==name)
        
        values_test = df[dffiltro]['helpfulness'].values
        scores = df[dffiltro][column].values


        ind = (-np.array(scores)).argsort()
        a = np.array(values_test)[ind]	
        ndcg = ndcg_at_k(a, k)
        ndcg_global.append(ndcg)
    return ndcg_global

def calc_ndcg_mean(df, column,k):
    x = calc_ndcg(df,column,k)
    return np.mean(x)

def calc_corr(df, column):
    correlation=[]
    grouped=df.groupby('asin')

    for name, group in grouped:
        dffiltro = (df['asin']==name)
        
        helpfulness = df[dffiltro]['helpfulness'].values
        scores = df[dffiltro][column].values
        correlation.append(np.corrcoef(helpfulness,scores)[0][1])
    return correlation

In [3]:
reviews_features = pd.read_csv('data/eletronic_sample_counts.csv.gz')
reviews_features.shape

  interactivity=interactivity, compiler=compiler, result=result)


(40870, 34)

In [4]:
#split by product
min_comments=30
min_votes=5

reviews_features=reviews_features.dropna()
grouped=reviews_features[reviews_features['tot'].astype(int)>min_votes].groupby('asin')
total = len(grouped)
performance=[]


In [14]:
run = 0
svmPrediction = pd.DataFrame()

for name, group in grouped:

    try:
    
        dffiltro = (reviews_features['asin']==name) & (reviews_features['tot'].astype(int)>min_votes)
        reviews_set = reviews_features[dffiltro]
        run += 1
        print("Run %d size %d for %d: %s" % (run, len(reviews_set), total, name))

        #if len(reviews_set) < 30: continue

        #CROSVALIDATION BY COMMENTS
        clf = svm.SVR()
        max_ndcg=0
        bestSVMPrediction=pd.DataFrame()

        features = TfidfVectorizer().fit_transform(reviews_set.reviewText)
        rs = cross_validation.ShuffleSplit(len(reviews_set), n_iter=10, test_size=.1 , random_state=0)
        ind=0
        ndcg=[]
        for train_index, test_index in rs:
            #train
            features_train = features[train_index]
            labels_train = reviews_set.iloc[train_index]["helpfulness"].values
            clf.fit(features_train, labels_train)

            #test
            features_test = features[test_index]
            labels_test = reviews_set.iloc[test_index]["helpfulness"].values
            x=clf.predict(features_test)

            dfTest= reviews_set.iloc[test_index]
            dfTest['svm']=x
            local_ndcg = calc_ndcg_mean(dfTest,'svm',5)
            ndcg.append(local_ndcg)
            #print "ndcg at fold "+str(ind)+" = "+ str(local_ndcg)

            ind=ind+1

            if (local_ndcg>max_ndcg):
                bestSVMPrediction=dfTest

        print "mean: " + str(np.mean(ndcg))
        svmPrediction = svmPrediction.append(bestSVMPrediction)

    except:
        
        print "mean: error"
        
    #if run > 4:
        #break

#save bestSVMPrediction on csv
svmPrediction.to_csv('data/best_prediction_svm_revisited.csv')
svmPrediction.shape

Run 1 size 40 for 383: 1400532655


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


mean: 0.961705052938
Run 2 size 33 for 383: B00000J061
mean: 0.982108409374
Run 3 size 31 for 383: B00001P4ZH
mean: 0.956283883991
Run 4 size 32 for 383: B00001WRSJ
mean: 0.9397043491
Run 5 size 35 for 383: B000031KIM
mean: 0.925188242331
Run 6 size 35 for 383: B00004SB92
mean: 0.965209767023
Run 7 size 34 for 383: B00004THCZ
mean: 0.970023208384
Run 8 size 45 for 383: B00004XOM3
mean: 0.97889846974
Run 9 size 32 for 383: B00004ZCJE
mean: 0.933281345404
Run 10 size 39 for 383: B000053HC5
mean: 0.981345325
Run 11 size 68 for 383: B000053HH5
mean: 0.91103966808
Run 12 size 57 for 383: B00005LEN4
mean: 0.872203066685
Run 13 size 40 for 383: B000062VUO
mean: 0.92397724065
Run 14 size 40 for 383: B00006B7DA
mean: 0.982626357541
Run 15 size 31 for 383: B00006HMPK
mean: 0.977584654494
Run 16 size 71 for 383: B00006I53S
mean: 0.938474192205
Run 17 size 32 for 383: B00006I53X
mean: 0.957578890095
Run 18 size 36 for 383: B00006IS4X
mean: 0.968720147146
Run 19 size 41 for 383: B00006RVPW
mean: 0.

(2117, 35)

In [15]:
print "SVM ndcg@1="+str(calc_ndcg_mean(svmPrediction,'svm',1))
print "SVM ndcg@3="+str(calc_ndcg_mean(svmPrediction,'svm',3))
print "SVM ndcg@5="+str(calc_ndcg_mean(svmPrediction,'svm',5))



SVM ndcg@1=0.889869669906
SVM ndcg@3=0.912010540576
SVM ndcg@5=0.94621064186


In [16]:
#salvando csv
pd.DataFrame(calc_ndcg(svmPrediction,'svm',1)).to_csv('data/ndcg_1_svm_revisited.csv')
pd.DataFrame(calc_ndcg(svmPrediction,'svm',3)).to_csv('data/ndcg_3_svm_revisited.csv')
pd.DataFrame(calc_ndcg(svmPrediction,'svm',5)).to_csv('data/ndcg_5_svm_revisited.csv')
