In [None]:
# Validation method for recommender systems.
# Computes the prediction error of the recommender method on a held-out part of the same dataset
# (note: evaluate() currently reports the mean absolute error rather than the RMMSE-style
# root mean squared error), splitting every user's ratings into training/testing data by timestamp.
# Might be used to evaluate how the method improves with an increasing volume of data.

import logging
import math
import pandas as pd
import numpy as np
# from dummy_recommender import MeanRatingRecommender as Recommender
# from knn_recommender_v2 import KnnRecommender as Recommender
from recommender.doc2vec_recommender import Doc2VecRecommender as Recommender

# DATA_FILE_PATH="/home/michal/Documents/Misc/recommenders/vcs/book-recommender/data/ratings_Books.csv"
# NOTE: machine-specific absolute path - point it at the local copy of the ratings CSV
DATA_FILE_PATH = '/home/kvassay/data/book-recommender/ratings_Books.csv'

# number of users randomly drawn for the evaluation
SAMPLED_USERS = 100000
# users having at least this many ratings are considered robots and are skipped
USER_IS_ROBOT_THRESHOLD = 100

# slices to use for testing methods improvements on increasing amount of testing data
# (also the minimal number of ratings a user needs in order to be sampled)
SLICING_INTERVAL = 5

# select how many times the evaluation will split data and test
# selecting 1 means one split with fold on SLICING_INTERVAL-1/SLICING_INTERVAL timestamp for every user
# can automatically test a development of model performance on increasing amount of training data
SLICING_RUNS = 1
# quantile of a user's rating timestamps: older ratings go to training, the rest to testing
SPLIT_Q = 0.8
method_name = Recommender.__module__

# logging init:
logger = logging.getLogger()
logger.setLevel(logging.INFO)

FORMAT = '%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s'
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
LOG_HANDLER_NAME = 'recommender-validation'
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FORMAT)
handler = logging.StreamHandler()
handler.set_name(LOG_HANDLER_NAME)
handler.setFormatter(formatter)
# The root logger outlives a single cell execution: drop the handler installed by a previous
# run of this cell first, otherwise every message would be emitted once per execution.
for stale_handler in [h for h in logger.handlers if h.get_name() == LOG_HANDLER_NAME]:
    logger.removeHandler(stale_handler)
logger.addHandler(handler)

data_file = DATA_FILE_PATH

date_format = "%Y-%m-%d %H:%M:%S"

logger.info("Starting application on %s dataset" % data_file)
logger.info("Testing recommender implementation of %s " % method_name)

# read_csv opens and closes the file on its own, no explicit open() is needed
df = pd.read_csv(data_file, names=["user", "item", "rating", "timestamp"])

grouped_users = df.groupby(["user"]).count()
# only users having at least SLICING_INTERVAL rated items are eligible for the evaluation
n_review_users = grouped_users[grouped_users["item"] >= SLICING_INTERVAL]["item"].keys()
# Sample WITHOUT replacement so no user is drawn (and later evaluated) more than once;
# the sample size is capped by the number of eligible users so small datasets work as well.
n_sampled_users = min(SAMPLED_USERS, len(n_review_users))
eval_users = np.random.choice(n_review_users.unique(), n_sampled_users, replace=False)
logger.info("Selected %s users to evaluate their ratings" % len(eval_users))

eval_dataframe = df[df['user'].isin(eval_users)]
logger.info("Selected dataframe of random %s users containing %s entries" %
            (len(eval_users), len(eval_dataframe)))



quantile = SPLIT_Q
logger.info("training on users dataset divides on quantile %s" % quantile)

# TODO: later compare recommender results with using dataset having only data newer than from 2013

# The split is computed for all users at once instead of filtering the whole frame and
# appending to the result once per user (quadratic, and DataFrame.append no longer exists
# in pandas 2.x). Every review of an evaluated user lands in the result at most once.

# number of reviews of the row's user, aligned with the rows of eval_dataframe
reviews_per_user = eval_dataframe["user"].map(eval_dataframe["user"].value_counts())
# do not include users having at least USER_IS_ROBOT_THRESHOLD ratings
human_reviews = eval_dataframe[reviews_per_user < USER_IS_ROBOT_THRESHOLD]

# value dividing reviews of a user to training and testing, one per user ...
slicing_timestamps = human_reviews.groupby("user")["timestamp"].quantile(q=quantile)
# ... broadcast back so every review row carries the slicing timestamp of its own user
row_slicing_timestamp = human_reviews["user"].map(slicing_timestamps)

# both comparisons are spelled out so rows with a missing timestamp end up in neither frame
training_frame = human_reviews[human_reviews["timestamp"] < row_slicing_timestamp]
testing_frame = human_reviews[human_reviews["timestamp"] >= row_slicing_timestamp]

logger.info("training dataframe size: %s" % len(training_frame))
logger.info("testing dataframe size: %s" % len(testing_frame))

2016-11-23 19:30:46.383 - INFO - Starting application on /home/kvassay/data/book-recommender/ratings_Books.csv dataset
2016-11-23 19:30:46.384 - INFO - Testing recommender implementation of recommender.doc2vec_recommender 
2016-11-23 19:31:44.032 - INFO - Selected 100000 users to evaluate their ratings
2016-11-23 19:31:47.480 - INFO - Selected dataframe of random 100000 users containing 1381510 entries
2016-11-23 19:31:47.483 - INFO - training on users dataset divides on quantile 0.8


In [None]:
def evaluate(recommender_instance):
    """Fit the recommender on the training split and score it on the testing split.

    Reads the module-level ``training_frame`` and ``testing_frame`` and reports progress
    through the module-level ``logger``.

    :param recommender_instance: object providing ``fit(frame)`` and ``predict(user, item)``;
        ``predict`` may return ``None`` when it cannot produce a rating
    :return: mean absolute difference between the real and the predicted rating, taken over
        the testing entries that received a prediction; ``nan`` when none of them did
    """
    recommender_instance.fit(training_frame)
    logger.info("Recommender method has fit on %s entries" % len(training_frame))

    # aggregated difference of recommender predicted rating against the real rating
    delta_sum = 0.0
    # number of testing entries the recommender gave no prediction for
    len_diff = 0
    # iterate plain column values - much cheaper than building a Series per row with iterrows()
    test_entries = zip(testing_frame["user"], testing_frame["item"], testing_frame["rating"])
    for user, item, expected_score in test_entries:
        actual_score = recommender_instance.predict(user, item)

        if actual_score is not None:
            delta_sum += math.fabs(expected_score - actual_score)
        else:
            len_diff += 1

    predicted_count = len(testing_frame) - len_diff
    logger.info("Recommender method has predicted %s ratings" % predicted_count)
    logger.info('Recommender failed to predict %s ratings', len_diff)

    mean_rating = float(testing_frame["rating"].mean())
    logger.info("Testing data mean rating: %s" % mean_rating)

    if predicted_count == 0:
        # nothing to average over: report an undefined score instead of dividing by zero
        logger.warning("Method %s predicted no rating, error delta is undefined" % method_name)
        return float("nan")

    delta = float(delta_sum) / predicted_count

    logger.info("")
    logger.info("Method %s average error delta %s" % (method_name, delta))
    logger.info("")
    return delta

In [None]:
def get_random_cfg(cfgs):
    """Build a random configuration by drawing one option for every key.

    The option is selected through a random index, so the returned value is the very object
    stored in ``cfgs`` - it is not converted to a NumPy scalar and lists mixing several types
    are not coerced to a common type.

    :param cfgs: dict mapping a configuration key to a non-empty sequence of candidate values
    :return: dict with the same keys, each mapped to one uniformly drawn candidate
    :raises ValueError: when a key has no candidate values
    """
    cfg = dict()
    for key, candidates in cfgs.items():
        # materialise so any iterable with a length-less or lazy form (e.g. range) can be indexed
        options = list(candidates)
        cfg[key] = options[np.random.randint(len(options))]
    return cfg

In [None]:
# Candidate values for every configuration key; get_random_cfg draws one value per key.
CFGS = {
    'num_epochs': [1, 3, 5, 7, 10, 20],
    'alpha': [0.1, 0.025, 0.15, 0.3],
    'min_alpha': [0.025, 0.01, 0.001, 0.5],
    'dm': [1],
    'size': [10, 30, 50, 70, 100, 150, 200, 300],
    'window': range(1, 15, 1),
    'min_count': [1],
    'negative': [0, 5, 10],
}


In [None]:
import time
import matplotlib.pyplot as plt
%matplotlib inline  

TIME_LIMIT_S=3600
logger.setLevel(40)
best_cfg=None
best_score=10000
start=time.time()

n_iter=1
score_improvements=list()
while time.time()-start < TIME_LIMIT_S:
    cfg=get_random_cfg(CFGS)
    rec=Recommender(cfg)
    score=evaluate(rec)
    if score<best_score:
        best_cfg=cfg
        best_score=score
        score_improvements.append(best_score)
        print('Found new best config in iter No. '+str(n_iter)+' ___________________________')
        print('Best cfg: '+str(best_cfg))
        print('Best score: '+str(best_score))
    n_iter+=1
print('Score improvemetns: '+str(score_improvements))

In [None]:
# One point per accepted improvement of the best score during the random configuration search.
fig, ax = plt.subplots()
ax.plot(score_improvements)
# configurations are drawn at random (get_random_cfg), so this is a random search, not a grid search
ax.set_title('Random search score improvements')
ax.set_xlabel('Improvement No.')
ax.set_ylabel('Best score (average error delta)')
plt.show()

In [None]:
# Display the best configuration found by the search (None if no configuration was accepted).
best_cfg

In [None]:
# Display the lowest evaluate() score reached (stays at the initial 10000 if nothing beat it).
best_score