In [8]:
import pandas as pd
from datetime import datetime
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from time import sleep
%matplotlib inline

### Reading and sorting data

In [5]:
CSV_PATH='/home/kvassay/data/book-recommender/ratings_Books.csv'
raw_data=pd.read_csv(CSV_PATH,header=0,names=['user','item','rating','timestamp'])


In [10]:
raw_data_mini=raw_data.sample(frac=0.03)

In [66]:
class Doc2VecRecommender():
    def __init__(self):
        self.model = None
        self.user_items_dict = None

    @staticmethod
    def get_user_items_dict(data, min_ratings, max_ratings):
        user_item_dict = dict()
        for row in data.itertuples():
            if row.user in user_item_dict:
                user_item_dict[row.user].append(row.item)
            else:
                user_item_dict[row.user] = [row.item]
        user_item_dict_clean = {key: value for key, value in user_item_dict.iteritems() if
                                len(value) >= min_ratings and len(value) < max_ratings}
        return user_item_dict_clean

    @staticmethod
    def get_user_items_list_lengths(user_item_dict):
        return pd.DataFrame(sorted([len(item_list) for _, item_list in user_item_dict.iteritems()]))

    def fit(self, raw_data):
        self.user_items_dict = Doc2VecRecommender.get_user_items_dict(raw_data, min_ratings=3, max_ratings=100)
        train_data = [TaggedDocument(words, [user_id]) for user_id, words in self.user_items_dict.iteritems()]
        self.model = Doc2Vec(train_data, dm=0, size=30, window=8, min_count=1, workers=4)

    def get_recommendations(self, user, item, k):
        similar_list = self.model.most_similar(item, topn=k)
        asin_list = [item[0] for item in similar_list]
        return asin_list

    @staticmethod
    def cosine_sim_to_rating(sim):
        #cosine sim gets values from -1 to 1, 1 meaning exactly the same
        if sim<-0.6:
            return 1
        if sim<-0.2:
            return 2
        if sim<0.2:
            return 3
        if sim<0.6:
            return 4
        return 5
    
    # expects one entry's user and item
    # will return predicted rating which is the only attribute to be used for performance evaluation
    def get_rating(self, user, item):
        if user in self.user_items_dict:
            users_items=[item for item in self.user_items_dict[user] if item in self.model.vocab]
            if len(users_items)>0:
                if item in self.model.vocab:
                    cos_sim = self.model.n_similarity(users_items, [item])
                    return Doc2VecRecommender.cosine_sim_to_rating(cos_sim)
                else:
                    return None
            else:
                return None
        else:
            return None

    def predict(self, user, item):
        return self.get_rating(user,item)

In [67]:
rec=Doc2VecRecommender()

In [68]:
rec.fit(raw_data_mini)

In [69]:
sample_users=[
 'A2U8J69ASRX1NQ',
 'A32PW3Z8XYOZUM',
 'A3GWE80SUGORJD',
 'ATBXNEJTSIUQ2',
 'A2UTKYA2U8542F',
 'A3DCKL0EPIQ9LD',
 'A177CDBTES35IY',
 'A1S1O3YT8NS68A',
]

In [70]:
sample_books=[
    '0679446958',
 '045141912X',
 '0800719875',
 '0099453193',
 '0140430032',
 'B004UWU9IM',
 '014241977X',
 '0736915141'    
]

In [71]:
samples=zip(sample_users,sample_books)

In [72]:
samples

[('A2U8J69ASRX1NQ', '0679446958'),
 ('A32PW3Z8XYOZUM', '045141912X'),
 ('A3GWE80SUGORJD', '0800719875'),
 ('ATBXNEJTSIUQ2', '0099453193'),
 ('A2UTKYA2U8542F', '0140430032'),
 ('A3DCKL0EPIQ9LD', 'B004UWU9IM'),
 ('A177CDBTES35IY', '014241977X'),
 ('A1S1O3YT8NS68A', '0736915141')]

In [73]:
for item in samples:
    print(rec.predict(item[0], item[1]))

4
4
3
5
3
4
4
5
