In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time
from sklearn import svm
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer

import MHR as mhr

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [2]:
def reload_package(root_module):
    package_name = root_module.__name__

    # get a reference to each loaded module
    loaded_package_modules = dict([
        (key, value) for key, value in sys.modules.items() 
        if key.startswith(package_name) and isinstance(value, types.ModuleType)])

    # delete references to these loaded modules from sys.modules
    for key in loaded_package_modules:
        del sys.modules[key]

    # load each of the modules again; 
    # make old modules share state with new modules
    for key in loaded_package_modules:
        print 'loading %s' % key
        newmodule = __import__(key)
        oldmodule = loaded_package_modules[key]
        oldmodule.__dict__.clear()
        oldmodule.__dict__.update(newmodule.__dict__)
        
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


def calc_ndcg(df, column,k):
    min_votes=5
    min_comments=30

    ndcg_global=[]
    grouped=df.groupby('asin')

    for name, group in grouped:
        dffiltro = (df['asin']==name)
        
        values_test = df[dffiltro]['helpfulness'].values
        scores = df[dffiltro][column].values


        ind = (-np.array(scores)).argsort()
        a = np.array(values_test)[ind]	
        ndcg = ndcg_at_k(a, k)
        ndcg_global.append(ndcg)
    return ndcg_global

def calc_ndcg_mean(df, column,k):
    x = calc_ndcg(df,column,k)
    return np.mean(x)

def calc_corr(df, column):
    correlation=[]
    grouped=df.groupby('asin')

    for name, group in grouped:
        dffiltro = (df['asin']==name)
        
        helpfulness = df[dffiltro]['helpfulness'].values
        scores = df[dffiltro][column].values
        correlation.append(np.corrcoef(helpfulness,scores)[0][1])
    return correlation

In [3]:
reviews_features = pd.read_csv('data/book_sample_counts.csv.gz')
reviews_features.shape

(24234, 30)

In [4]:
#split by product
min_comments=30
min_votes=5

reviews_features=reviews_features.dropna()
grouped=reviews_features[reviews_features['tot'].astype(int)>min_votes].groupby('asin')
total = len(grouped)
performance=[]


In [5]:
run = 0
svmPrediction = pd.DataFrame()

for name, group in grouped:

    try:
    
        dffiltro = (reviews_features['asin']==name) & (reviews_features['tot'].astype(int)>min_votes)
        reviews_set = reviews_features[dffiltro]
        run += 1
        print("Run %d size %d for %d: %s" % (run, len(reviews_set), total, name))

        #if len(reviews_set) < 30: continue

        #CROSVALIDATION BY COMMENTS
        clf = svm.SVR()
        max_ndcg=0
        bestSVMPrediction=pd.DataFrame()

        features = TfidfVectorizer().fit_transform(reviews_set.reviewText)
        rs = cross_validation.ShuffleSplit(len(reviews_set), n_iter=10, test_size=.1 , random_state=0)
        ind=0
        ndcg=[]
        for train_index, test_index in rs:
            #train
            features_train = features[train_index]
            labels_train = reviews_set.iloc[train_index]["helpfulness"].values
            clf.fit(features_train, labels_train)

            #test
            features_test = features[test_index]
            labels_test = reviews_set.iloc[test_index]["helpfulness"].values
            x=clf.predict(features_test)

            dfTest= reviews_set.iloc[test_index]
            dfTest['svm']=x
            local_ndcg = calc_ndcg_mean(dfTest,'svm',5)
            ndcg.append(local_ndcg)
            #print "ndcg at fold "+str(ind)+" = "+ str(local_ndcg)

            ind=ind+1

            if (local_ndcg>max_ndcg):
                bestSVMPrediction=dfTest

        print "mean: " + str(np.mean(ndcg))
        svmPrediction = svmPrediction.append(bestSVMPrediction)

    except:
        
        print "mean: error"
        
    #if run > 4:
        #break

#save bestSVMPrediction on csv
svmPrediction.to_csv('data/best_prediction_svm_books_revisited.csv')
svmPrediction.shape

Run 1 size 48 for 461: 0002051850


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


mean: 0.938286600655
Run 2 size 80 for 461: 000224053X
mean: 0.770809262602
Run 3 size 31 for 461: 0006393195
mean: 0.982271034688
Run 4 size 56 for 461: 0006530702
mean: 0.903290963145
Run 5 size 45 for 461: 0007157150
mean: 0.975875434275
Run 6 size 58 for 461: 0007167040
mean: 0.937902400361
Run 7 size 35 for 461: 000721278X
mean: 0.985050313877
Run 8 size 54 for 461: 0007228023
mean: 0.940610030461
Run 9 size 40 for 461: 0007231601
mean: 0.923602285467
Run 10 size 131 for 461: 000779021X
mean: 0.862946139837
Run 11 size 60 for 461: 0026045702
mean: 0.899328206575
Run 12 size 40 for 461: 0029109752
mean: 0.98211765971
Run 13 size 36 for 461: 006000942X
mean: 0.969676745011
Run 14 size 40 for 461: 0060083816
mean: 0.959615218144
Run 15 size 46 for 461: 0060084332
mean: 0.959885140614
Run 16 size 74 for 461: 0060175869
mean: 0.880115283196
Run 17 size 36 for 461: 0060192119
mean: 0.88858511191
Run 18 size 53 for 461: 006019491X
mean: 0.912389428021
Run 19 size 43 for 461: 0060393491
m

(2645, 31)

In [6]:
print "SVM ndcg@1="+str(calc_ndcg_mean(svmPrediction,'svm',1))
print "SVM ndcg@3="+str(calc_ndcg_mean(svmPrediction,'svm',3))
print "SVM ndcg@5="+str(calc_ndcg_mean(svmPrediction,'svm',5))



SVM ndcg@1=0.855399342981
SVM ndcg@3=0.891195545384
SVM ndcg@5=0.931191789478


In [7]:
#salvando csv
pd.DataFrame(calc_ndcg(svmPrediction,'svm',1)).to_csv('data/ndcg_1_svm_books_revisited.csv')
pd.DataFrame(calc_ndcg(svmPrediction,'svm',3)).to_csv('data/ndcg_3_svm_books_revisited.csv')
pd.DataFrame(calc_ndcg(svmPrediction,'svm',5)).to_csv('data/ndcg_5_svm_books_revisited.csv')
