In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from random import shuffle
import operator

In [2]:
! git clone https://github.com/tusharsircar95/CF-MovieLens

Cloning into 'CF-MovieLens'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 6 (delta 0), reused 6 (delta 0), pack-reused 0
Unpacking objects: 100% (6/6), done.


In [3]:
df_orig = pd.read_csv('CF-MovieLens/ratings.csv')
df_orig.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [4]:
print(len(df_orig.user_id.unique()))
print((df_orig.user_id.min()))
print(df_orig.user_id.max())

53424
1
53424


In [5]:
print(len(df_orig.rating.unique()))
print((df_orig.rating.min()))
print(df_orig.rating.max())

5
1
5


In [6]:
print(len(df_orig.book_id.unique()))
print((df_orig.book_id.min()))
print(df_orig.book_id.max())

10000
1
10000


In [0]:
# Number of users
N = 100 

# Number of books
M = 100

def select_user_ids(df_orig, N):
  df = pd.DataFrame(df_orig.pivot_table(index='user_id',values='rating',aggfunc='count').to_records())
  df = df.sort_values(by='rating',ascending=False)
  return np.array(df.user_id[:N])

def select_book_ids(df_orig, M):
  df = pd.DataFrame(df_orig.pivot_table(index='book_id',values='rating',aggfunc='count').to_records())
  df = df.sort_values(by='rating',ascending=False)
  return np.array(df.book_id[:M])



def shrink_and_relabel_dataset(df_orig, N,M):
  
  df_orig = df_orig.sample(frac=1).reset_index(drop=True)
  
  user_ids = select_user_ids(df_orig,N)
  book_ids = select_book_ids(df_orig,M)
  
  user_id_to_idx = {}
  book_id_to_idx = {}
  for idx,user_id in enumerate(user_ids):
    user_id_to_idx[user_id] = idx
  for idx,book_id in enumerate(book_ids):
    book_id_to_idx[book_id] = idx
  
  df = df_orig.copy()
  df = df[df.user_id.isin(user_ids)]
  df = df[df.book_id.isin(book_ids)]
  df.loc[:,'user_id'] = df.apply(lambda x: user_id_to_idx[x['user_id']],axis=1)
  df.loc[:,'book_id'] = df.apply(lambda x: book_id_to_idx[x['book_id']],axis=1)
  
  return df
  
def generate_train_val_test_sets(df):
  n = len(df)
  
  train_ratio = 0.80
  val_ratio = 0.10
  test_ratio = 0.10
  
  df_train = df.iloc[:int(n*train_ratio)]
  df_val = df.iloc[int(n*train_ratio):int(n*(train_ratio+val_ratio))]
  df_test = df.iloc[int(n*(train_ratio+val_ratio)):]
  
  return df_train, df_val, df_test
  
  
def generate_mapping(df):
  user_to_book = {}
  book_to_user = {}
  rating_matrix = {}
  mean_ratings = {}
  
  for index,row in df.iterrows():
    user_id = row['user_id']
    book_id = row['book_id']
    rating = row['rating']
    
    if not user_id in user_to_book:
      user_to_book[user_id] = []
    if not book_id in book_to_user:
      book_to_user[book_id] = []
    if not user_id in rating_matrix:
      rating_matrix[user_id] = {}
    user_to_book[user_id].append(book_id)
    book_to_user[book_id].append(user_id)
    rating_matrix[user_id][book_id] = rating
    
  for user_id in user_to_book.keys():
    ratings = []
    for book_id in user_to_book[user_id]:
      ratings.append(rating_matrix[user_id][book_id])
    mean_ratings[user_id] = np.mean(ratings)
    
  return user_to_book, book_to_user, rating_matrix, mean_ratings


def calculate_correlation(data, user_id1, user_id2):
  
  pass

def get_mse(training_data=None, eval_data=None, limit=5, K = 20, params=None):
  user_to_book, book_to_user, rating_matrix, mean_ratings = generate_mapping(training_data)
  print('Calculated mappings...')
  
  predictions = []
  target_ratings = []
  no_means = 0
  
  print('Proceeding to evaluate %d datapoints...'%len(eval_data))
  # Calculate estimated rating for each datapoint in eval_data
  datapoints = 0
  for index,row in eval_data.iterrows():
    user_id = row['user_id']
    book_id = row['book_id']
    rating = row['rating']
    
    if user_id in mean_ratings:
      default_rating = mean_ratings[user_id]
    else: 
      default_rating = 3
      no_means = no_means + 1
    
    if datapoints % 500 == 0:
      print('Evaluated data points: ', datapoints)
    datapoints = datapoints + 1
    
    if book_id in book_to_user:
      numerator = 0.0
      denominator = 0.0
      weights_and_ratings = []
      
      for id in book_to_user[book_id]:
        if id == user_id:
          continue

        if user_id in user_to_book:
          books_to_consider = np.intersect1d(user_to_book[user_id],user_to_book[id],assume_unique=True)
        else: books_to_consider = []

        # Skip this user if enough common books are not available
        if len(books_to_consider) < limit:
          continue


        ratings_1 = []
        ratings_2 = []
        for book in books_to_consider:
          ratings_1.append(rating_matrix[user_id][book] - default_rating)
          ratings_2.append(rating_matrix[id][book] - mean_ratings[id])


        w = np.corrcoef(ratings_1,ratings_2)[0][0]
        if np.isnan(w):
          continue

        weights_and_ratings.append((w,rating_matrix[id][book_id]-mean_ratings[id]))

         
      weights_and_ratings.sort(key = operator.itemgetter(0))
      weights_and_ratings.reverse()
      select = min(K,len(weights_and_ratings))

      for i in range(select):
        numerator += weights_and_ratings[i][0] * weights_and_ratings[i][1]
        denominator += abs(weights_and_ratings[i][0])


      if denominator == 0:
        predicted_rating = default_rating
      else: predicted_rating = default_rating + numerator / denominator

    else: predicted_rating = default_rating
      
    if predicted_rating < 0:
      predicted_rating = 0
    if predicted_rating > 5:
      predicted_rating = 5
    predictions.append(predicted_rating)
    target_ratings.append(rating)
    
  print('New users: ',no_means)
  return np.array(predictions), np.array(target_ratings)
      


In [0]:
def evaluate_hyperparams(training_data=None, eval_data=None, K=None,limit=None):
  if training_data is None or eval_data is None:
    print('Invalid data...')
  if K is None or limit is None:
    print('Invalid hyperparameters...')
  predictions, target_ratings = get_mse(training_data=training_data, eval_data=eval_data, K=K, limit=limit)
  print('Validation with K = %d, limit = %d'%(K,limit))
  print('Error with User CF: ', np.mean(abs(predictions-target_ratings)))
  print('Error Baseline: ',np.mean(abs(target_ratings-3.83)))


In [0]:
np.random.seed(20110)
df_shrinked = shrink_and_relabel_dataset(df_orig,1000,1000)
df_train, df_val, df_test = generate_train_val_test_sets(df_shrinked)

In [18]:
evaluate_hyperparams(K=10,limit=5, training_data=df_train, eval_data=df_val)
evaluate_hyperparams(K=20,limit=5, training_data=df_train, eval_data=df_val)
evaluate_hyperparams(K=30,limit=5, training_data=df_train, eval_data=df_val)
evaluate_hyperparams(K=40,limit=5, training_data=df_train, eval_data=df_val)

Calculated mappings...
Proceeding to evaluate 11404 datapoints...
Evaluated data points:  0


  c /= stddev[:, None]
  c /= stddev[None, :]


Evaluated data points:  500
Evaluated data points:  1000
Evaluated data points:  1500
Evaluated data points:  2000
Evaluated data points:  2500
Evaluated data points:  3000
Evaluated data points:  3500
Evaluated data points:  4000
Evaluated data points:  4500
Evaluated data points:  5000
Evaluated data points:  5500
Evaluated data points:  6000
Evaluated data points:  6500
Evaluated data points:  7000
Evaluated data points:  7500
Evaluated data points:  8000
Evaluated data points:  8500
Evaluated data points:  9000
Evaluated data points:  9500
Evaluated data points:  10000
Evaluated data points:  10500
Evaluated data points:  11000
New users:  0
Validation with K = 10, limit = 5
Error with User CF:  0.7242223947997671
Error Baseline:  0.843042792002806
Calculated mappings...
Proceeding to evaluate 11404 datapoints...
Evaluated data points:  0
Evaluated data points:  500
Evaluated data points:  1000
Evaluated data points:  1500
Evaluated data points:  2000
Evaluated data points:  2500
E

In [20]:
evaluate_hyperparams(K=40,limit=5, training_data=df_train, eval_data=df_test)

Calculated mappings...
Proceeding to evaluate 11405 datapoints...
Evaluated data points:  0


  c /= stddev[:, None]
  c /= stddev[None, :]


Evaluated data points:  500
Evaluated data points:  1000
Evaluated data points:  1500
Evaluated data points:  2000
Evaluated data points:  2500
Evaluated data points:  3000
Evaluated data points:  3500
Evaluated data points:  4000
Evaluated data points:  4500
Evaluated data points:  5000
Evaluated data points:  5500
Evaluated data points:  6000
Evaluated data points:  6500
Evaluated data points:  7000
Evaluated data points:  7500
Evaluated data points:  8000
Evaluated data points:  8500
Evaluated data points:  9000
Evaluated data points:  9500
Evaluated data points:  10000
Evaluated data points:  10500
Evaluated data points:  11000
New users:  0
Validation with K = 40, limit = 5
Error with User CF:  0.6933369825601169
Error Baseline:  0.8382069267864971
