In [28]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [29]:

def merge_datasets(path):
    """Load the movies, ratings and users CSV files and merge them into one frame.

    Parameters
    ----------
    path : str or os.PathLike
        Directory containing ``movies.csv``, ``ratings.csv`` and ``users.csv``.
        A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        ``movies`` merged with ``ratings``, and that result merged with
        ``users``. Both merges use ``pd.merge`` defaults, i.e. an inner join
        on whatever column names the two frames have in common.
    """
    # os.path.join keeps 'data/' working and also accepts 'data' (no trailing
    # separator), which plain string concatenation turned into 'datamovies.csv'.
    data_m = pd.read_csv(os.path.join(path, 'movies.csv'))
    data_r = pd.read_csv(os.path.join(path, 'ratings.csv'))
    data_u = pd.read_csv(os.path.join(path, 'users.csv'))

    movie_ratings = pd.merge(data_m, data_r)
    lens = pd.merge(movie_ratings, data_u)

    return lens

def create_user_item_pair(data, n_users=600, n_items=400):
    """Build a dense user x item rating matrix from the most active users/items.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``userId``, ``itemId`` and ``rating``.
        The frame is not modified.
    n_users : int, default 600
        Keep only the ``n_users`` users with the most rows in ``data``.
    n_items : int, default 400
        Keep only the ``n_items`` items with the most rows in ``data``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per kept user and one column per kept item
        (both in the order produced by ``DataFrame.pivot``). Cells hold the
        rating; (user, item) pairs without a rating are filled with 0.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` if a (userId, itemId) pair occurs more
        than once in the selected subset.
    """
    # Rank users and items by how many rating rows they appear in.
    ratings_per_user = data.groupby('userId').size().sort_values(ascending=False)
    mru_keys = ratings_per_user.index[:n_users]

    ratings_per_item = data.groupby('itemId').size().sort_values(ascending=False)
    mrm_keys = ratings_per_item.index[:n_items]

    # Keep only rows where both the user and the item made the cut.
    subset = data.loc[data['userId'].isin(mru_keys) & data['itemId'].isin(mrm_keys)]
    ratings = subset.pivot(index='userId', columns='itemId', values='rating').to_numpy()

    # pivot leaves NaN for missing (user, item) pairs; downstream code expects 0.
    return np.where(np.isnan(ratings), 0, ratings)

def create_matrix_I(data):
    """Return the 0/1 indicator matrix of which cells of ``data`` hold a rating.

    Parameters
    ----------
    data : array-like
        Rating matrix. A cell counts as "unrated" when it is NaN or 0.

    Returns
    -------
    numpy.ndarray
        Integer array of the same shape as ``data``: 1 where a rating is
        present, 0 where it is missing.
    """
    values = np.asarray(data, dtype=float)

    # create_user_item_pair replaces missing ratings with 0 before this is
    # called, so testing for NaN alone would mark every cell as rated.
    # Treat 0 as missing as well, matching the `> 0` test used by
    # create_episode_matrix.
    unrated = np.isnan(values) | (values == 0)
    return np.where(unrated, 0, 1)


In [30]:
def create_user_user_similarity_matrix(data):
    """Return the pairwise cosine similarity between the rows of ``data``.

    ``data`` is handed unchanged to ``sklearn``'s ``cosine_similarity``; with a
    single argument that function compares every row against every other row,
    so the result is square with one row/column per row of ``data``.
    """
    return cosine_similarity(data)
    
def create_prediction_matrix(A, uu_matrix):
    """Predict a rating for every (user, item) cell from user-user similarity.

    For user ``a`` and item ``i`` the prediction is::

        mean[a] + sum_u(uu_matrix[a, u] * (A[u, i] - mean[u])) / sum_u(uu_matrix[a, u])

    where ``mean[u]`` is the sum of row ``u`` of ``A`` divided by the number of
    non-zero entries in that row.

    Parameters
    ----------
    A : array-like, shape (K, w)
        Rating matrix with one row per user and one column per item.
    uu_matrix : array-like, shape (K, K)
        User-user similarity weights.

    Returns
    -------
    numpy.ndarray, shape (K, w)
        Predicted ratings (float).

    Notes
    -----
    * A user whose row has no non-zero entry gets a mean of 0 instead of a
      0/0 NaN, which would otherwise spread to every prediction through the
      weighted sum.
    * A user whose similarity weights sum to 0 gets no neighbourhood
      adjustment, i.e. the prediction is just that user's mean.
    * NOTE(review): zero cells are excluded from ``mean[u]`` but still enter the
      weighted sum as a deviation of ``0 - mean[u]``. That matches the formula
      above; confirm it is the intended treatment of unrated items.
    """
    A = np.asarray(A, dtype=float)
    uu_matrix = np.asarray(uu_matrix, dtype=float)

    # Per-user mean over non-zero entries; rows with none keep the 0 from `out`.
    rated_counts = np.count_nonzero(A, axis=1)
    rating_totals = A.sum(axis=1)
    mean_ratings_of_users = np.divide(
        rating_totals,
        rated_counts,
        out=np.zeros_like(rating_totals),
        where=rated_counts > 0,
    )
    user_means_col = mean_ratings_of_users[:, np.newaxis]

    # Row a of the product is sum_u uu_matrix[a, u] * (A[u, :] - mean[u]),
    # i.e. the three nested Python loops collapsed into one matrix product.
    weighted_deviation = uu_matrix @ (A - user_means_col)

    # Normalise by each user's total similarity; skip rows whose total is 0.
    similarity_totals_col = uu_matrix.sum(axis=1)[:, np.newaxis]
    adjustment = np.divide(
        weighted_deviation,
        similarity_totals_col,
        out=np.zeros_like(weighted_deviation),
        where=similarity_totals_col != 0,
    )

    return user_means_col + adjustment
        

def create_episode_matrix(user_item_matrix):
    """Collect, for each row, the strictly positive entries in column order.

    Returns a list with one inner list per row of ``user_item_matrix``; the
    inner lists can have different lengths (and may be empty).
    """
    n_rows = user_item_matrix.shape[0]
    n_cols = user_item_matrix.shape[1]
    return [
        [
            user_item_matrix[row][col]
            for col in range(n_cols)
            if user_item_matrix[row][col] > 0
        ]
        for row in range(n_rows)
    ]


def reward(j, i, episode_matrix, pred_matrix):
    """Return ``episode_matrix[i][j + 2]`` minus ``pred_matrix[i][j]``.

    NOTE(review): the reason for the offset of 2 into the episode row is not
    stated anywhere in this file -- confirm against the intended algorithm.
    """
    observed = episode_matrix[i][j + 2]
    predicted = pred_matrix[i][j]
    return observed - predicted


def train_rlcf():
    """Placeholder: not implemented yet. Does nothing and returns None."""
    return None

In [34]:
def main():
    """Run the pipeline: load data, build matrices, get predictions, print a summary.

    The prediction matrix is cached in ``pred_matrix.pickle`` in the current
    working directory. If that file exists it is loaded; otherwise the matrix
    is computed from the similarity matrix and written to the file, so a run
    on a fresh checkout works without a pre-existing pickle.

    Prints the prediction matrix and the episode lengths of the first two
    users.
    """
    path = 'data/'
    cache_file = 'pred_matrix.pickle'
    data = merge_datasets(path)

    A = create_user_item_pair(data)
    I = create_matrix_I(A)  # not used further in this function yet

    if os.path.exists(cache_file):
        # SECURITY: pickle.load can execute arbitrary code; only load a cache
        # file that this notebook wrote itself.
        with open(cache_file, 'rb') as pickle_in:
            pred_matrix = pickle.load(pickle_in)
    else:
        uu_matrix = create_user_user_similarity_matrix(A)
        pred_matrix = create_prediction_matrix(A, uu_matrix)
        with open(cache_file, 'wb') as pickle_out:
            pickle.dump(pred_matrix, pickle_out)

    print(pred_matrix)
    episode_matrix = create_episode_matrix(A)
    print(len(episode_matrix[0]), len(episode_matrix[1]))

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()

[[3.3101449  2.223722   2.94294414 ... 2.07473945 2.40824106 1.66081698]
 [2.50215427 1.43050395 2.11143066 ... 1.25743752 1.58533693 0.90029451]
 [3.70870635 2.5905515  3.26310058 ... 2.44923261 2.74436332 2.02727955]
 ...
 [2.7150667  1.62727352 2.31855343 ... 1.4990172  1.82925543 1.11158886]
 [2.56758441 1.42875382 2.05419027 ... 1.29457637 1.62343191 0.82242377]
 [2.69110883 1.55924108 2.19356741 ... 1.41982295 1.72470732 0.96499373]]
196 258


In [None]:
5:59:46