In [None]:
import numpy as np
import collections 
import pandas as pd
import os
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
from scipy import sparse
from typing import List
import scipy.sparse as sp

# Print the full path of every file found under the thesis directory.
# NOTE(review): the absolute path is specific to one machine.
for dirname, _, filenames in os.walk('/home/ebcffhh/Documents/personal/Masters/Thesis'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the ratings CSV; the four column names are supplied explicitly via `names`.
# NOTE(review): the absolute path is specific to one machine.
data=pd.read_csv("/home/ebcffhh/Documents/personal/Masters/Thesis/ratings_Beauty.csv", names = ["userId", "ProductId", "Ratings", "Timestamp"])


In [None]:
# Keep only the ratings written by users who rated at least 25 items.
counts=data.userId.value_counts()
dataset_final=data[data.userId.isin(counts[counts>=25].index)]
# len(dataset_final) is the number of rating rows that survive the filter, not
# the number of users, so the label now says so (the user count is printed next).
print('Number of ratings from users who have rated 25 or more items =', len(dataset_final))
print('Number of unique users in the final data = ', dataset_final['userId'].nunique())
print('Number of unique products in the final data = ', dataset_final['ProductId'].nunique())

In [None]:
# Count how many ratings each user has given.
rated_products = data.groupby(by='userId',as_index=False)['Ratings'].count()
print(rated_products)
# Narrow the table to the users with fewer than 20 ratings: these are excluded below.
# NOTE(review): the previous cell filtered with a threshold of 25; this cell uses 20.
rated_products = rated_products[rated_products['Ratings'] < 20]
# new_dataset keeps every rating whose user is not in the exclusion table.
new_dataset = data.loc[~((data.userId.isin(rated_products['userId']))),:]
# Ratings per remaining user, most active first.
no_of_rated_products_per_user = new_dataset.groupby(by='userId')['Ratings'].count().sort_values(ascending=False)
print(no_of_rated_products_per_user)
print(new_dataset.ProductId.nunique())
print(new_dataset.userId.nunique())

In [None]:
# Distinct user and product counts of the filtered dataset.
# get_ncf_model reads n_users and n_products as the sizes of its embedding tables.
n_users = new_dataset.userId.unique().shape[0]
n_products = new_dataset.ProductId.unique().shape[0]
# Distinct product ids as a plain list (later cells rebind product_list).
product_list = new_dataset.ProductId.unique().tolist()
print(n_users)

In [None]:
from collections import defaultdict
def GetTopN(predictions, n=10, minimumRating=4.0):
    """Collect, per user, the best-estimated items at or above a rating floor.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        (user id, product id, actual rating, estimated rating, extra); only the
        user id, product id and estimated rating are used.
    n : int
        Maximum number of items kept per user.
    minimumRating : float
        Estimates below this value are ignored.

    Returns
    -------
    defaultdict(list)
        user id -> list of (product id, estimated rating), highest estimate first.
        Users without any estimate at or above the floor do not appear.
    """
    per_user = defaultdict(list)
    for userID, productId, _actual, estimatedRating, _extra in predictions:
        if not (estimatedRating >= minimumRating):
            continue
        per_user[userID].append((productId, estimatedRating))

    # Rank each user's candidates by estimate and keep the first n.
    for userID in list(per_user):
        ranked = sorted(per_user[userID], key=lambda pair: pair[1], reverse=True)
        per_user[userID] = ranked[:n]

    return per_user

In [None]:
def get_customer_satisfaction(pred_u,k):
  """Sum of (prediction - actual) over each user's k highest-predicted rows.

  Parameters
  ----------
  pred_u : pandas.DataFrame
      Must contain the columns 'userId', 'prediction' and 'actual'.
  k : int
      Number of top-predicted rows considered per user.

  Returns
  -------
  dict
      user id -> signed error summed over that user's top-k rows. Users whose
      top-k slice is empty (e.g. k == 0) are omitted.
  """
  edt = {}
  # Grouping by a scalar key yields scalar user ids on every pandas version and
  # removes the second, redundant groupby the function used to do per user.
  for user_id, user_rows in pred_u.groupby('userId'):
    top_k = user_rows.sort_values(['prediction'], ascending = False)[:k]
    if top_k.empty:
      continue
    signed_errors = (top_k['prediction'] - top_k['actual']).to_numpy()
    edt[user_id] = np.sum(signed_errors)
  return edt

In [None]:
def prediction_coverage(predicted: List[list], catalog: list) -> float:
  """Percentage of the catalog that appears in at least one recommendation list.

  The number of distinct recommended items is divided by len(catalog), scaled
  to a percentage and rounded to two decimals.
  """
  distinct_items = set()
  for recommendations in predicted:
    distinct_items.update(recommendations)
  share = len(distinct_items) / float(len(catalog))
  return round(share * 100, 2)

In [None]:
def recommender_precision(predicted: List[list], actual: List[list]) -> tuple:
  """Per-user and mean precision of recommendation lists.

  Parameters
  ----------
  predicted : list of lists
      One recommendation list per user.
  actual : list of lists
      The relevant items per user, aligned with ``predicted``.

  Returns
  -------
  (precision, precision_list)
      ``precision_list`` holds one value per user: the share of that user's
      recommended items that occur in their ``actual`` list, rounded to four
      decimals. ``precision`` is the mean of that list.
  """
  def calc_precision(predicted, actual):
      # A user with no recommendations has no hits; score 0 instead of dividing by zero.
      if len(predicted) == 0:
          return 0.0
      hits = [value for value in predicted if value in actual]
      return np.round(float(len(hits)) / float(len(predicted)), 4)

  precision_list = list(map(calc_precision, predicted, actual))
  precision = np.mean(precision_list)
  return precision, precision_list


def recommender_recall(predicted: List[list], actual: List[list]) -> tuple:
  """Per-user and mean recall of recommendation lists.

  Parameters
  ----------
  predicted : list of lists
      One recommendation list per user.
  actual : list of lists
      The relevant items per user, aligned with ``predicted``.

  Returns
  -------
  (recall, recall_list)
      ``recall_list`` holds one value per user: the number of recommended items
      that occur in ``actual`` divided by len(actual), rounded to four decimals.
      ``recall`` is the mean of that list.
  """
  def calc_recall(predicted, actual):
      # With no relevant items there is nothing to recall; score 0 instead of dividing by zero.
      if len(actual) == 0:
          return 0.0
      hits = [value for value in predicted if value in actual]
      return np.round(float(len(hits)) / float(len(actual)), 4)

  recall_list = list(map(calc_recall, predicted, actual))
  recall = np.mean(recall_list)
  return recall, recall_list

In [None]:
def personalization(predicted: List[list]) -> tuple:
    """
    Personalization measures recommendation similarity across users.
    A high score indicates good personalization (user's lists of recommendations are different).
    A low score indicates poor personalization (user's lists of recommendations are very similar).
    A model is "personalizing" well if the set of recommendations for each user is different.
    Parameters:
    ----------
    predicted : a list of lists
        Ordered predictions
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    Returns:
    -------
    avg_sim :
        Mean over axis 1 of the list-by-list cosine-similarity matrix, one entry
        per recommendation list. The mean is taken over the full matrix, so each
        list's similarity with itself is included.
    personalization :
        1 minus the mean of the similarities above the diagonal (every pair of
        different lists counted once).
    """

    def make_rec_matrix(predicted: List[list]) -> sp.csr_matrix:
        """Build a sparse indicator matrix: one row per list, one column per item, 1 where the item is in the list."""
        df = pd.DataFrame(data=predicted).reset_index().melt(
            id_vars='index', value_name='item',
        )
        df = df[['index', 'item']].pivot(index='index', columns='item', values='item')
        df = pd.notna(df)*1
        rec_matrix = sp.csr_matrix(df.values)
        return rec_matrix

    # create matrix for recommendations
    # NOTE(review): presumably every list has the same length so this becomes a 2-D array - TODO confirm
    predicted = np.array(predicted)
    rec_matrix_sparse = make_rec_matrix(predicted)

    # calculate similarity for every user's recommendation list
    similarity = cosine_similarity(X=rec_matrix_sparse, dense_output=False)
  

    # row-wise mean similarity, returned to the caller as the per-list score
    avg_sim = similarity.mean(axis=1)

    # get indices for upper right triangle w/o diagonal
    upper_right = np.triu_indices(similarity.shape[0], k=1)

    # average similarity over all pairs of different recommendation lists
    ils_single_user = np.mean(similarity[upper_right])
    return avg_sim, (1 - ils_single_user)

In [None]:
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

# Wrap the filtered ratings in a Surprise dataset and hold out 20% for testing;
# random_state=100 pins the split.
# train_test_split here is Surprise's function (imported in this cell), which
# rebinds the scikit-learn name imported at the top of the notebook.
reader = Reader()
rating_data = Dataset.load_from_df(new_dataset[['userId', 'ProductId', 'Ratings']], reader)
trainset, testset = train_test_split(rating_data, test_size=0.2,random_state=100)

In [None]:
from surprise import SVD
from surprise import KNNWithMeans

from surprise import accuracy

# Numbers of latent factors to sweep for SVD.
k = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Test-set MAE for each factor count, in the order of k.
mae_svd = list()
for i in k:
  algo = SVD(n_factors=i, n_epochs=200)
  algo.fit(trainset)
  test_pred = algo.test(testset)
  # Evaluate once and reuse the value: accuracy.mae used to be called twice per
  # model, recomputing the metric and emitting its own report line twice.
  current_mae = accuracy.mae(test_pred)
  mae_svd.append(current_mae)
  print("Mean Absolute Error for value k {} is ".format(i), current_mae)

In [None]:
from surprise import SVD
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy


# Neighbourhood sizes to sweep for user-based KNN.
k = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# Rebuild the split with the same data, test_size and random_state as the SVD sweep.
reader = Reader()
rating_data = Dataset.load_from_df(new_dataset[['userId', 'ProductId', 'Ratings']], reader)
trainset, testset = train_test_split(rating_data, test_size=0.2,random_state=100)


# Test-set MAE for each neighbourhood size, in the order of k.
mae_knn = list()
for i in k:
  algo = KNNWithMeans(k=i, sim_options={'name':'pearson','user_based': True})
  algo.fit(trainset)
  test_pred = algo.test(testset)
  # Evaluate once and reuse the value: accuracy.mae used to be called twice per
  # model, recomputing the metric and emitting its own report line twice.
  current_mae = accuracy.mae(test_pred)
  mae_knn.append(current_mae)
  print("Mean Absolute Error for value k {} is ".format(i), current_mae)

In [None]:
def get_f1_score(predictions, k):
    """Per-user F1 score of the k highest-predicted items.

    An item is "relevant" when its actual rating is >= 4 and "recommended" when
    its predicted rating is >= 4.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain the columns 'userId', 'prediction' and 'actual'.
    k : int
        Number of top-predicted items considered per user.

    Returns
    -------
    dict
        user id -> F1 score. Precision, recall and F1 are each 0 when their
        denominator is 0.
    """
    threshold = 4
    # First map the predictions to each user.
    user_est_rating = defaultdict(list)
    for uid, est, true_r in zip(predictions['userId'],
                                predictions['prediction'],
                                predictions['actual']):
        user_est_rating[uid].append((est, true_r))

    # Then sort the predictions for each user and retrieve the k highest ones.
    f1_score = dict()
    for uid, user_ratings in user_est_rating.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]
        # Number of relevant items (over all of the user's rows, not just the top k)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)
        # The three counts are sums of booleans, so they are always integers;
        # the former NaN checks on them could never fire and were removed.
        precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
        f1_score[uid] = 2 * ((precision * recall)/(precision+recall)) if (precision + recall) != 0 else 0
    return f1_score

def get_cus(predictions, k):
    """Per-user sum of (actual - prediction) over the top-k predictions, divided by k.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain the columns 'userId', 'prediction' and 'actual'.
    k : int
        Number of top-predicted items considered per user.

    Returns
    -------
    dict
        user id -> score. Users with no item in their top-k slice are omitted.
    """
    # First map the predictions to each user.
    user_est_rating = defaultdict(list)
    for uid, est, r_ui in zip(predictions['userId'],
                              predictions['prediction'],
                              predictions['actual']):
        user_est_rating[uid].append((est, r_ui))

    # Then sort the predictions for each user and retrieve the k highest ones.
    customerSatisfaction = {}
    for uid, user_ratings in user_est_rating.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        diffs = [r_ui - est for est, r_ui in user_ratings[:k]]
        if diffs:
            # NOTE(review): the divisor is k even when the user has fewer than
            # k rows, which is the existing behaviour - confirm that is intended.
            customerSatisfaction[uid] = np.sum(diffs)/k
    return customerSatisfaction

def get_f1_score_nn(predictions, k):
    """Per-user F1 score of the k highest-predicted items.

    This function was a line-for-line copy of get_f1_score; it now delegates to
    it so the metric is defined in one place.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain the columns 'userId', 'prediction' and 'actual'.
    k : int
        Number of top-predicted items considered per user.

    Returns
    -------
    dict
        user id -> F1 score, exactly as returned by get_f1_score.
    """
    return get_f1_score(predictions, k)

def cal_f1(test_pred, k):
  """Return (per-user F1 dict from get_f1_score, mean of those scores over users)."""
  per_user_f1 = get_f1_score(test_pred, k)
  mean_f1 = sum(per_user_f1.values()) / len(per_user_f1)
  return per_user_f1, mean_f1

def get_accuracy(predictions,k):
  """Per-user mean absolute error over the k highest-predicted rows.

  ``predictions`` must provide the columns 'userId', 'prediction' and 'actual'.
  Returns a dict mapping user id -> mean of |actual - prediction| over that
  user's top-k rows (ranked by prediction, highest first).
  """
  ratings_by_user = defaultdict(list)
  for _, row in predictions.iterrows():
    ratings_by_user[row['userId']].append((row['prediction'], row['actual']))

  accuracy_scores = {}
  for uid, pairs in ratings_by_user.items():
    ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
    abs_errors = [abs(actual - est) for est, actual in ranked[:k]]
    accuracy_scores[uid] = sum(abs_errors) / len(abs_errors)
  return accuracy_scores

def cal_accuracy(test_pred, k):
  """Return (per-user error dict from get_accuracy, mean of those values over users)."""
  per_user_error = get_accuracy(test_pred, k)
  mean_error = sum(per_user_error.values()) / len(per_user_error)
  return per_user_error, mean_error

def get_shannon_entropy(predictions, product_list, no_of_recommendations):
  """Entropy-style score over how often each item is recommended.

  For every distinct recommended item, w = (times recommended) / (number of
  distinct products in ``product_list``); the result is -sum(w * ln(w)).

  NOTE(review): the weights are normalised by the catalog size rather than by
  the total number of recommendations, so they do not generally sum to 1.
  get_shannon_entropy_new normalises by the total instead.

  Parameters
  ----------
  predictions : list of lists
      One recommendation list per user.
  product_list : iterable
      The product catalog; duplicates are ignored.
  no_of_recommendations : int
      Unused; kept so existing call sites keep working.

  Returns
  -------
  float
      The score; 0.0 when there are no recommendations at all.
  """
  count_recommendation_items = collections.Counter(
      item for sublist in predictions for item in sublist)
  if not count_recommendation_items:
    return 0.0
  catalog_size = len(set(product_list))
  pi = np.array([count / catalog_size for count in count_recommendation_items.values()])
  return -np.sum(pi * np.log(pi))
def get_shannon_entropy_new(predictions, product_list, no_of_recommendations):
  """Shannon entropy (in bits) of the distribution of recommended items.

  Each distinct item's probability is its share of all recommendations made
  across every list in ``predictions``.

  Parameters
  ----------
  predictions : list of lists
      One recommendation list per user.
  product_list, no_of_recommendations :
      Unused; kept so existing call sites keep working.

  Returns
  -------
  float
      -sum(p * log2(p)) over the distinct recommended items.
  """
  count_recommendation_items = collections.Counter(
      item for sublist in predictions for item in sublist)
  n_rec = sum(count_recommendation_items.values())
  c = np.fromiter(count_recommendation_items.values(), dtype=int)
  pi = c/n_rec
  shannon_entropy = -np.sum(pi * np.log2(pi))
  return shannon_entropy
    

In [None]:
import re
def sorted_nicely( l ):
    """Return the strings of ``l`` in natural order (digit runs compare as numbers).

    Required arguments:
    l -- The iterable of strings to be sorted.
    """
    digit_runs = re.compile('([0-9]+)')

    def natural_key(item):
        # Split into digit and non-digit pieces; digit pieces become ints.
        return [int(piece) if piece.isdigit() else piece
                for piece in digit_runs.split(item)]

    return sorted(l, key=natural_key)

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
import recmetrics
import csv

# Fresh 80/20 split for the top-N evaluation (no random_state is passed here,
# so the split differs between runs).
ratings_dataset = Dataset.load_from_df(new_dataset[['userId', 'ProductId', 'Ratings']],reader)

trainset, testset = train_test_split(ratings_dataset, test_size=.2)

# Raw ids of every product present in the training split.
product_list = set()

for inner_pid in trainset.ir.keys():
  product_list.add(trainset.to_raw_iid(inner_pid))

algo = SVD(n_factors= 80, n_epochs=200)
algo.fit(trainset)
predictions = algo.test(testset)
# One row per test-set prediction, with the column names the metric helpers expect.
test = pd.DataFrame(predictions)
test = test.rename(columns={'uid':'userId', 'iid': 'productId', 
                            'r_ui':'actual', 'est':'prediction'})
# Distinct test products per user, indexed by userId in a single 'ratings' column.
# A plain aggregation followed by to_frame replaces the dict "renamer" form of
# SeriesGroupBy.agg, which pandas has deprecated.
pred_user = test.groupby('userId')['productId'].agg(lambda x: list(set(x))).to_frame('ratings')
# User x product table of predicted ratings; pairs without a prediction become 0.
cf_model = test.pivot_table(index='userId', 
                            columns='productId', values='prediction').fillna(0)

def get_users_predictions(user_id, n, model):
    """Return the labels of the n highest-valued entries in ``model``'s row for ``user_id``.

    ``model`` is a DataFrame indexed by user id with one column per item; the
    returned list is ordered from the highest value downwards.
    """
    ranked = (
        model.loc[user_id]
        .to_frame(name="predicted_rating")
        .sort_values('predicted_rating', ascending=False)
    )
    return ranked.head(n).index.tolist()

def get_recs(model, k):
    """Top-k item labels for every user in ``model.index``, in index order."""
    return [get_users_predictions(user, k, model) for user in model.index]

# Rating count per product over the filtered dataset, and per-user row counts of the test frame.
productId_counts = dict(new_dataset.ProductId.value_counts())
userId_counts = test['userId'].value_counts()

# One aggregate value per recommendation-list size.
diversity_svd = []
novelty_svd = []
coverage_svd = []
f1_score_svd = []
accuracy_svd = []

# get_recs walks cf_model.index, so any per-user sequence derived from `recs`
# is in this user order.
rec_users_svd = list(cf_model.index)

# Top-n recommendations for each user
no_of_recommendations = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
for k in no_of_recommendations:
  recs = get_recs(cf_model, k)
  pred_user[f'Top-{k} Recommendation'] = recs

  # To calculate the f1_score
  f1_scores_list_svd, average_f1_score_svd = cal_f1(test.copy() ,k)
  print("The f1 score for {} recommendation is {} \n".format(k, average_f1_score_svd))
  f1_score_svd.append(average_f1_score_svd)

  # To calculate accuracy
  accuracy_scores_svd, average_accuracy_svd = cal_accuracy(test.copy() ,k)
  print("The accuracy score for {} recommendation is {} \n".format(k, average_accuracy_svd))
  accuracy_svd.append(average_accuracy_svd)

  # To calculate the diversity
  diversity_scores_svd, average_diversity_svd = personalization(list(recs))
  print("The diversity score for {} recommendation is {} \n".format(k, average_diversity_svd))
  diversity_svd.append(average_diversity_svd)

  # To calculate the novelty
  cf_novelty_svd, novelty_list_svd = recmetrics.novelty(recs, productId_counts, len(userId_counts), k)
  print("The novelty score for {} recommendation is {} \n".format(k, cf_novelty_svd))
  novelty_svd.append(cf_novelty_svd)

  # To calculate the coverage
  cf_coverage = recmetrics.catalog_coverage(list(recs), product_list, 100)
  print("The coverage score for {} recommendation is {} \n".format(k, cf_coverage))
  coverage_svd.append(cf_coverage)

  # To calculate the customer satisfaction
  edt_svd = get_customer_satisfaction(test, k)
  print("The cusotmer satisfaction for {} recommendation is {}".format(k,  np.mean(list(edt_svd.values()))))

  # Key the list-shaped per-user results by user id. They used to be zipped
  # positionally with the accuracy/F1 dicts, whose key order is first appearance
  # in `test`, so diversity and novelty were written next to the wrong user.
  # Flattening the similarity column also yields plain numbers, not 1x1 matrices.
  # NOTE(review): assumes recmetrics.novelty returns one value per list in `recs` order.
  avg_similarity_by_user = dict(zip(rec_users_svd, np.asarray(diversity_scores_svd).ravel()))
  novelty_by_user = dict(zip(rec_users_svd, novelty_list_svd))

  filename = "/home/ebcffhh/thesis/sorted_svd/metrics_svd_%s_recommendations.csv" % k
  # newline='' is the csv-module recommendation for files handed to csv.writer.
  with open(filename, 'w', newline='') as csv_file:  
    writer = csv.writer(csv_file)
    writer.writerow(['userId', 'accuracy', 'f1_score', 'diversity', 'novelty', 'customer_satisfaction'])
    for uid, acc_score in accuracy_scores_svd.items():
      # Only users that have a customer-satisfaction value get a row.
      if uid not in edt_svd:
        continue
      writer.writerow([uid, acc_score, f1_scores_list_svd[uid],
                       1 - avg_similarity_by_user[uid], novelty_by_user[uid], edt_svd[uid]])

print(accuracy_svd)

In [None]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
import recmetrics

#ratings_dataset = Dataset.load_from_df(new_dataset[['userId', 'ProductId', 'Ratings']],reader)

#trainset, testset = train_test_split(ratings_dataset, test_size=.2)

# This cell reuses trainset/testset from the SVD evaluation cell above.
train_product_list_count = len(trainset.ir.keys())
# Rating count per product and per user over the filtered dataset.
productId_counts = dict(new_dataset.ProductId.value_counts())
userId_counts = new_dataset['userId'].value_counts()
# Raw ids of every product present in the training split.
product_list = set()

for inner_pid in trainset.ir.keys():
  product_list.add(trainset.to_raw_iid(inner_pid))

algo = KNNWithMeans(k=100, sim_options={'name':'pearson','user_based': True})
algo.fit(trainset)
predictions = algo.test(testset)
# One row per test-set prediction, with the column names the metric helpers expect.
test = pd.DataFrame(predictions)
test = test.rename(columns={'uid':'userId', 'iid': 'productId', 
                            'r_ui':'actual', 'est':'prediction'})
# Distinct test products per user, indexed by userId in a single 'ratings' column.
# A plain aggregation followed by to_frame replaces the dict "renamer" form of
# SeriesGroupBy.agg, which pandas has deprecated.
pred_user = test.groupby('userId')['productId'].agg(lambda x: list(set(x))).to_frame('ratings')
# User x product table of predicted ratings; pairs without a prediction become 0.
cf_model = test.pivot_table(index='userId', 
                            columns='productId', values='prediction').fillna(0)

def get_users_predictions(user_id, n, model):
    """Return the n item labels with the highest values in ``model``'s row for ``user_id``.

    ``model`` is a DataFrame indexed by user id with one column per item.
    """
    user_row = model.loc[user_id].to_frame(name="predicted_rating")
    best_first = user_row.sort_values('predicted_rating', ascending=False)
    top_n = best_first.head(n)
    return top_n.index.tolist()

def get_recs(model, k):
    """Build one top-k recommendation list per user, following ``model.index`` order."""
    return list(map(lambda user: get_users_predictions(user, k, model), model.index))
# One aggregate value per recommendation-list size.
diversity_knn = []
novelty_knn = []
coverage_knn = []
f1_score_knn = []
accuracy_knn = []

# get_recs walks cf_model.index, so any per-user sequence derived from `recs`
# is in this user order.
rec_users_knn = list(cf_model.index)

# Top-n recommendations for each user
no_of_recommendations = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
for k in no_of_recommendations:
  
  recs = get_recs(cf_model, k)
  pred_user[f'Top-{k} Recommendation'] = recs

  # To calculate the f1_score
  f1_scores_knn, average_f1_score = cal_f1(test ,k)
  print("The f1 score for {} recommendation is {} \n".format(k, average_f1_score))
  f1_score_knn.append(average_f1_score)

  # To calculate accuracy
  accuracy_scores_knn, average_accuracy_knn = cal_accuracy(test.copy() ,k)
  print("The accuracy score for {} recommendation is {} \n".format(k, average_accuracy_knn))
  accuracy_knn.append(average_accuracy_knn)

  # To calculate the diversity
  diversity_scores_knn, average_diversity_knn = personalization(list(recs))
  print("The diversity score for {} recommendation is {} \n".format(k, average_diversity_knn))
  diversity_knn.append(average_diversity_knn)

  # To calculate the novelty
  cf_novelty_knn, novelty_list_knn = recmetrics.novelty(list(recs), productId_counts, len(userId_counts), k)
  print("The novelty score for {} recommendation is {} \n".format(k, cf_novelty_knn))
  novelty_knn.append(cf_novelty_knn)

  # To calculate the coverage
  cf_coverage_knn = recmetrics.catalog_coverage(list(recs), product_list, 100)
  print("The coverage score for {} recommendation is {} \n".format(k, cf_coverage_knn))
  coverage_knn.append(cf_coverage_knn)
  
  # To calculate the customer satisfaction
  edt_knn = get_customer_satisfaction(test, k)
  print("The cusotmer satisfaction for {} recommendation is {}".format(k,  np.mean(list(edt_knn.values()))))

  # Key the list-shaped per-user results by user id. They used to be zipped
  # positionally with the accuracy/F1 dicts, whose key order is first appearance
  # in `test`, so diversity and novelty were written next to the wrong user.
  # Flattening the similarity column also yields plain numbers, not 1x1 matrices.
  # NOTE(review): assumes recmetrics.novelty returns one value per list in `recs` order.
  avg_similarity_by_user = dict(zip(rec_users_knn, np.asarray(diversity_scores_knn).ravel()))
  novelty_by_user = dict(zip(rec_users_knn, novelty_list_knn))

  filename = "/home/ebcffhh/thesis/sorted_knn/metrics_knn_%s_recommendations.csv" % k
  # newline='' is the csv-module recommendation for files handed to csv.writer.
  with open(filename, 'w', newline='') as csv_file:  
    writer = csv.writer(csv_file)
    writer.writerow(['userId', 'accuracy', 'f1_score', 'diversity', 'novelty', 'customer_satisfaction'])
    for uid, acc_score in accuracy_scores_knn.items():
      # Only users that have a customer-satisfaction value get a row.
      if uid not in edt_knn:
        continue
      writer.writerow([uid, acc_score, f1_scores_knn[uid],
                       1 - avg_similarity_by_user[uid], novelty_by_user[uid], edt_knn[uid]])
  

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import one_hot
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(new_dataset, test_size = 0.2)

# Give every user and product its own contiguous integer index.
# The ids used to be passed through keras' one_hot(text, 10), which hashes text
# into fewer than 10 buckets, so all users (and all products) shared a handful
# of embedding rows. The encoders are fitted on the whole filtered dataset so
# ids that occur only in the test split are known, and the resulting index
# range matches n_users / n_products used for the embedding tables.
user_encoder = LabelEncoder().fit(new_dataset['userId'])
product_encoder = LabelEncoder().fit(new_dataset['ProductId'])

# Column vectors of shape (rows, 1), matching the model's Input(shape=[1]).
train_user_ids = user_encoder.transform(train_data['userId']).reshape(-1, 1)
train_product_ids = product_encoder.transform(train_data['ProductId']).reshape(-1, 1)
test_product_ids = product_encoder.transform(test_data['ProductId']).reshape(-1, 1)
test_user_ids = user_encoder.transform(test_data['userId']).reshape(-1, 1)

num_users = len(user_encoder.classes_)
num_products = len(product_encoder.classes_)
print(num_users)

In [None]:
from keras.layers.normalization.batch_normalization import BatchNormalization
import tensorflow as tf
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate, dot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate, dot, Multiply, Dropout
from keras.preprocessing.text import one_hot,Tokenizer
import keras.layers
from keras.optimizers import adam_v2
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import tensorflow as tf

def get_ncf_model(no_of_factors):
  """Build and compile a two-branch (GMF + MLP) rating model.

  Both branches embed the user and product inputs into ``no_of_factors``
  dimensions. The GMF branch multiplies its two embeddings element-wise; the
  MLP branch concatenates its embeddings and passes them through
  Dropout -> Dense -> BatchNormalization stages of width 64, 32 and 16,
  followed by Dropout and Dense(8). Both branch outputs are concatenated and
  fed to a single ReLU output unit.

  The embedding table sizes come from the notebook-level ``n_products`` and
  ``n_users``.

  Returns the compiled Keras model; inputs are ordered [user, product].
  """
  product_input = Input(shape = [1], name = "Product-Input")
  user_input = Input(shape = [1], name = "User-Input")

  def embed(vocabulary_size, layer_name, source):
    # Every embedding table shares the width and the initializer.
    table = Embedding(vocabulary_size, no_of_factors, name=layer_name,
                      embeddings_initializer="he_normal")
    return table(source)

  # GMF branch: element-wise product of the flattened embeddings.
  gmf_product_embedding = embed(n_products, "GMF-Product-Embedding", product_input)
  gmf_user_embedding = embed(n_users, "GMF-User-Embedding", user_input)
  gmf_product_vec = Flatten(name = "GMF-Flatten-Products")(gmf_product_embedding)
  gmf_user_vec = Flatten(name = "GMF-Flatten-Users")(gmf_user_embedding)
  gmf_output = Multiply()([gmf_user_vec, gmf_product_vec])

  # MLP branch: concatenated embeddings pushed through a shrinking dense tower.
  mlp_product_embedding = embed(n_products, "MLP-Product-Embedding", product_input)
  mlp_user_embedding = embed(n_users, "MLP-User-Embedding", user_input)
  mlp_product_vec = Flatten(name = "MLP-Flatten-Products")(mlp_product_embedding)
  mlp_user_vec = Flatten(name = "MLP-Flatten-Users")(mlp_user_embedding)

  hidden = Concatenate()([mlp_product_vec, mlp_user_vec])
  for width in (64, 32, 16):
    hidden = Dropout(0.2)(hidden)
    hidden = Dense(width, activation='relu')(hidden)
    hidden = BatchNormalization()(hidden)
  hidden = Dropout(0.2)(hidden)
  hidden = Dense(8, activation='relu')(hidden)

  # Merge both branches into the single rating output.
  final_conc = Concatenate()([gmf_output, hidden])
  output = Dense(1, activation='relu')(final_conc)

  # Create model and compile it.
  # NOTE(review): 'accuracy' is tracked although the loss is a regression loss - confirm it is wanted.
  opt = keras.optimizers.adam_v2.Adam(learning_rate=0.001)
  model = Model([user_input, product_input], output)
  model.compile(loss='mean_absolute_error', optimizer=opt, metrics=['accuracy'] )
  return model


In [None]:
from IPython.display import SVG

# Build a 10-factor model and render its layer graph as an inline SVG.
model = get_ncf_model(10)
SVG(model_to_dot( model,  show_shapes=True, show_layer_names=True).create(prog='dot', format='svg'))

In [None]:
from sklearn.metrics import mean_absolute_error
# Embedding sizes to sweep for the neural model.
no_of_factors = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Test-set MAE for each embedding size, in the order of no_of_factors.
mae_ncf = list()
for k in no_of_factors:
  model = get_ncf_model(k)
  model.fit([train_user_ids, train_product_ids], train_data['Ratings'], epochs=3)
  prediction = model.predict([test_user_ids, test_product_ids])
  # Compute the metric once and reuse it for both the list and the report line.
  current_mae = mean_absolute_error(test_data['Ratings'], prediction)
  mae_ncf.append(current_mae)
  print("Mean Absolute Error for value k {} is ".format(k), current_mae)

In [None]:
# Train the 40-factor model and predict the held-out ratings.
model = get_ncf_model(40)
model.fit([train_user_ids, train_product_ids], train_data['Ratings'], epochs=3)
prediction = model.predict([test_user_ids, test_product_ids])
# One row per test rating, with the column names the metric helpers expect.
predicted_df = pd.DataFrame({'userId': test_data['userId'], 'productId': test_data['ProductId'], 'actual': test_data['Ratings']})
# Flatten the model output so the column holds one scalar per row.
predicted_df['prediction'] = np.ravel(prediction)
# Distinct test products per user, indexed by userId in a single 'actual' column.
# A plain aggregation followed by to_frame replaces the dict "renamer" form of
# SeriesGroupBy.agg, which pandas has deprecated.
pred_user = predicted_df.groupby('userId')['productId'].agg(lambda x: list(set(x))).to_frame('actual')
# User x product table of predicted ratings; pairs without a prediction become 0.
cf_model = predicted_df.pivot_table(index='userId', 
                            columns='productId', values='prediction').fillna(0)

# Per-user row counts of the test split. This was built from the 'ProductId'
# column, so len(userId_counts) reported the number of products instead of the
# number of users to the novelty metric.
userId_counts = test_data['userId'].value_counts()
product_list = train_data.ProductId.unique().tolist()
productId_counts = dict(new_dataset.ProductId.value_counts())

def get_users_predictions(user_id, n, model):
    """Return the n item labels with the highest values in ``model``'s row for ``user_id``.

    ``model`` is a DataFrame indexed by user id with one column per item; the
    labels come back ordered from the highest value downwards.
    """
    scores = model.loc[user_id].to_frame(name="prediction")
    return (
        scores.sort_values('prediction', ascending=False)
        .head(n)
        .index
        .tolist()
    )

def get_recs(model, k):
    """Collect the top-k item labels of every user, in ``model.index`` order."""
    per_user_lists = []
    for user in model.index:
        per_user_lists.append(get_users_predictions(user, k, model))
    return per_user_lists

# One aggregate value per recommendation-list size.
diversity_nn = []
novelty_nn = []
coverage_nn = []
f1_score_nn = []
accuracy_nn = []

# get_recs walks cf_model.index, so any per-user sequence derived from `recs`
# is in this user order.
rec_users_nn = list(cf_model.index)

# Top-n recommendations for each user
no_of_recommendations = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
for k in no_of_recommendations:
  recs = get_recs(cf_model, k)
  preds = pd.DataFrame(index=cf_model.index)
  preds[f'Top-{k} Recommendation'] = recs

  # To calculate the f1_score
  f1_scores, f1_score = cal_f1(predicted_df ,k)
  print("The f1 score for {} recommendation is {}".format(k, f1_score))
  f1_score_nn.append(f1_score)

  # To calculate accuracy
  # The mean is kept in average_accuracy_nn; binding it to `accuracy` replaced
  # the Surprise `accuracy` module used by the MAE sweep cells.
  accuracy_scores, average_accuracy_nn = cal_accuracy(predicted_df ,k)
  print("The accuracy score for {} recommendation is {} \n".format(k, average_accuracy_nn))
  accuracy_nn.append(average_accuracy_nn)

  # To calculate the diversity
  diversity_scores, diversity = personalization(list(recs))
  print("The diversity score for {} recommendation is {} \n".format(k, diversity))
  diversity_nn.append(diversity)

  # To calculate the novelty
  cf_novelty, novelty_list = recmetrics.novelty(list(recs), productId_counts, len(userId_counts), k)
  print("The novelty score for {} recommendation is {} \n".format(k, cf_novelty))
  novelty_nn.append(cf_novelty)

  # To calculate the coverage
  cf_coverage = recmetrics.catalog_coverage(list(recs), product_list, 100)
  print("The coverage score for {} recommendation is {} \n".format(k, cf_coverage))
  coverage_nn.append(cf_coverage)
  
  # To calculate the customer satisfaction
  edt = get_customer_satisfaction(predicted_df, k)
  print("The cusotmer satisfaction for {} recommendation is {}".format(k,  np.mean(list(edt.values()))))

  # Key the list-shaped per-user results by user id. They used to be zipped
  # positionally with the accuracy/F1 dicts, whose key order is first appearance
  # in predicted_df, so diversity and novelty were written next to the wrong user.
  # Flattening the similarity column also yields plain numbers, not 1x1 matrices.
  # NOTE(review): assumes recmetrics.novelty returns one value per list in `recs` order.
  avg_similarity_by_user = dict(zip(rec_users_nn, np.asarray(diversity_scores).ravel()))
  novelty_by_user = dict(zip(rec_users_nn, novelty_list))

  filename = "/home/ebcffhh/thesis/dnn/metrics_dnn_%s_recommendations.csv" % k
  # newline='' is the csv-module recommendation for files handed to csv.writer.
  with open(filename, 'w', newline='') as csv_file:  
    writer = csv.writer(csv_file)
    writer.writerow(['userId', 'accuracy', 'f1_score', 'diversity', 'novelty', 'customer_satisfaction'])
    for uid, acc_score in accuracy_scores.items():
      # Only users that have a customer-satisfaction value get a row.
      if uid not in edt:
        continue
      writer.writerow([uid, acc_score, f1_scores[uid],
                       1 - avg_similarity_by_user[uid], novelty_by_user[uid], edt[uid]])

In [None]:
# Shared x-axis values: factor / neighbour counts for the MAE sweep and
# recommendation-list sizes for every other figure.
k = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]


def plot_metric_curves(x_values, series, xlabel, ylabel, y_bins,
                       marker_size=50, marker_zorder=2, annotate=False):
    """Draw one line-plus-markers figure and show it.

    Parameters
    ----------
    x_values : sequence
        Shared x coordinates.
    series : list of (label, values, marker_color)
        One entry per curve; lines are drawn first, markers on top.
    xlabel, ylabel : str
        Axis labels.
    y_bins : int
        ``nbins`` passed to the y-axis tick locator (the x axis always uses 20).
    marker_size, marker_zorder :
        ``s`` and ``zorder`` of the scatter markers.
    annotate : bool
        When True, every point is labelled with its value rounded to 5 decimals.
    """
    fig, ax = plt.subplots()
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.locator_params(axis='y', nbins=y_bins)
    ax.locator_params(axis='x', nbins=20)
    if annotate:
        for _, values, _ in series:
            for x, value in zip(x_values, values):
                ax.annotate(str(round(value, 5)), (x, value), fontsize=8)
    for label, values, _ in series:
        ax.plot(x_values, values, label=label)
    for _, values, marker_color in series:
        ax.scatter(x_values, values, s=marker_size, color=marker_color, zorder=marker_zorder)
    ax.legend()
    plt.show()


# MAE against model size for the three models.
plot_metric_curves(
    k,
    [("KNN", mae_knn, 'red'), ("SVD", mae_svd, 'green'), ("DNN", mae_ncf, 'brown')],
    'No of factors', 'Mean Absolute Error', y_bins=10)

print("mae")
print("mae_svd",mae_svd)
print("mae_knn",mae_knn)
print("mae_ncf",mae_ncf)

# Top-k mean absolute error against the number of recommendations.
plot_metric_curves(
    k,
    [("KNN", accuracy_knn, 'red'), ("SVD", accuracy_svd, 'green'), ("DNN", accuracy_nn, 'brown')],
    'No of Recommendations', 'Mean Absolute Error', y_bins=10)

print("accuracy")
print("accuracy_svd",accuracy_svd)
print("accuracy_knn",accuracy_knn)
print("accuracy_dnn",accuracy_nn)

# F1 score against the number of recommendations.
plot_metric_curves(
    k,
    [("KNN", f1_score_knn, 'red'), ("SVD", f1_score_svd, 'green'), ("DNN", f1_score_nn, 'brown')],
    'No of Recommendations', 'f1_Score', y_bins=15)

print("f1_Score")
print("f1_Score_nn", f1_score_nn)
print("f1_Score_knn", f1_score_knn)
print("f1_Score_svd", f1_score_svd)

# Diversity gets one annotated figure per model.
plot_metric_curves(
    k, [("DNN", diversity_nn, 'brown')],
    'No of Recommendations', 'Diversity', y_bins=25,
    marker_size=20, marker_zorder=1, annotate=True)

plot_metric_curves(
    k, [("KNN", diversity_knn, 'red')],
    'No of Recommendations', 'Diversity', y_bins=25,
    marker_size=20, annotate=True)

plot_metric_curves(
    k, [("SVD", diversity_svd, 'red')],
    'No of Recommendations', 'Diversity', y_bins=25,
    marker_size=20, annotate=True)

print("diversity")
print("diversity_knn", diversity_knn)
print("diversity_nn",diversity_nn)
print("diversity_svd",diversity_svd)

# Novelty against the number of recommendations.
plot_metric_curves(
    k,
    [("KNN", novelty_knn, 'red'), ("SVD", novelty_svd, 'green'), ("DNN", novelty_nn, 'black')],
    'No of Recommendations', 'Novelty', y_bins=15)

print("novelty")
print("novelty_knn", novelty_knn)
print("novelty_nn",novelty_nn)
print("novelty_svd",novelty_svd)

# Catalog coverage against the number of recommendations.
plot_metric_curves(
    k,
    [("KNN", coverage_knn, 'blue'), ("SVD", coverage_svd, 'grey'), ("DNN", coverage_nn, 'orange')],
    'No of Recommendations', 'Coverage', y_bins=15)

