In [1]:
from pathlib import Path
import sys  
import os
import pandas as pd 
from datetime import datetime
import scipy.sparse as sps
from numpy import linalg as LA

import numpy as np
from operator import itemgetter

In [2]:
# Make the project's local packages importable from this notebook: the sibling
# "libs" directory, the parent directory itself and the sibling "src" directory
# are appended to sys.path, in that order, as absolute paths.
sys.path.extend(
    os.path.abspath(os.path.join(os.getcwd(), *relative_parts))
    for relative_parts in (("..", "libs"), ("..",), ("..", "src"))
)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from Utils.load_URM import load_URM
from Utils.load_ICM import load_ICM

from scipy.sparse import hstack, vstack
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Load the full interaction matrix (URM) and the item metadata matrix (ICM)
# through the project's loader helpers.
# NOTE(review): ICM_all is not referenced again in the visible part of this notebook.
URM_all = load_URM("../data/data_train.csv")
ICM_all = load_ICM("../data/data_ICM_metadata.csv")

In [5]:
from libs.Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender 
from libs.Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from libs.Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from libs.Recommenders.KNN.ItemKNNCustomSimilarityRecommender import ItemKNNCustomSimilarityRecommender

In [6]:
import optuna as op

In [7]:
# Alternative: reload a previously cached split instead of re-sampling.
# URM_train = sps.load_npz('URM_train.npz')
# URM_validation = sps.load_npz('URM_validation.npz')

# Fix the seed before sampling so the hold-out splits, every score below and the
# tuned hybrid weights can be reproduced after Restart & Run All.
# NOTE(review): assumes the split helper draws from NumPy's global RNG — confirm.
np.random.seed(42)

# Nested hold-out: 85% / 15% -> (train + validation) / test, then the first part is
# split 85% / 15% again -> train / validation.
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train_validation, train_percentage = 0.85)



In [8]:
# One hold-out evaluator per split, both scoring at cutoff 10.
# The validation evaluator is constructed first, then the test evaluator.
evaluator_validation, evaluator_test = (
    EvaluatorHoldout(held_out_urm, cutoff_list=[10])
    for held_out_urm in (URM_validation, URM_test)
)

EvaluatorHoldout: Ignoring 1008 ( 2.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 527 ( 1.5%) Users that have less than 1 test interactions


In [9]:
def AP(recommended_items, relevant_items):
    """Average precision of one ranked recommendation list.

    Precision@k is accumulated only at the ranks k that hold a relevant item, and
    the sum is normalised by ``min(#relevant, #recommended)``.

    :param recommended_items: ranked item ids (array-like); expected to be free of
        duplicates because the membership test runs with ``assume_unique=True``
    :param relevant_items: ground-truth item ids (array-like), also duplicate-free
    :return: the average precision, or ``0.0`` when either input is empty
        (the normaliser would otherwise be zero and the result NaN)
    """
    recommended_items = np.asarray(recommended_items).ravel()
    relevant_items = np.asarray(relevant_items).ravel()

    normaliser = min(relevant_items.shape[0], recommended_items.shape[0])
    if normaliser == 0:
        # Nothing to recommend or nothing relevant: define AP as 0 instead of 0/0.
        return 0.0

    # np.isin replaces the deprecated np.in1d; both inputs are 1-D here.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    # Cumulative sum: precision at 1, at 2, at 3 ... kept only where a hit occurs
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    ap_score = np.sum(p_at_k) / normaliser

    return ap_score

In [10]:
# The fitted recommender object (not its class) is passed as a parameter

def evaluate_algorithm(URM_test, recommender_object, at=10):
    """Mean of ``AP`` over the users that have at least one interaction in ``URM_test``.

    ``AP`` normalises by ``min(#relevant, #recommended)``. In this notebook's saved
    outputs the values differ from the ``"MAP"`` column of ``EvaluatorHoldout``
    (e.g. 0.0630 here vs 0.0306 there for the same model), presumably because that
    metric uses a different normalisation — TODO confirm.

    :param URM_test: SciPy sparse user-item matrix holding the held-out interactions
    :param recommender_object: fitted recommender exposing ``recommend(user_id, cutoff=...)``
    :param at: length of the recommendation list requested for every user
    :return: the mean average precision, or ``0.0`` when no user has test interactions
    """
    # Row slicing through indptr/indices below is only valid for CSR storage.
    URM_test = URM_test.tocsr()

    cumulative_AP = 0.0
    num_eval = 0

    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id + 1]]

        # Users without held-out interactions are excluded from the average.
        if len(relevant_items) == 0:
            continue

        recommended_items = recommender_object.recommend(user_id, cutoff=at)
        cumulative_AP += AP(recommended_items, relevant_items)
        num_eval += 1

    if num_eval == 0:
        # No evaluable user: avoid a ZeroDivisionError.
        return 0.0

    MAP = cumulative_AP / num_eval

    return MAP


In [11]:
# SLIM ElasticNet fitted on the innermost training split; it is scored against
# URM_validation in the following cells. The recorded run took about 21 minutes.
# NOTE(review): the hyper-parameters presumably come from an earlier tuning run not shown here.
slim_model_train = SLIMElasticNetRecommender(URM_train)
slim_model_train.fit(l1_ratio=0.11006885790633625, alpha=0.0002551115306127753, topK = 307)

# Alternative: load previously saved weights instead of refitting.
#slim_model_train.load_model('result_experiments/SLIM/', 'slim_300_train_weights')
# slim_model_all = SLIMElasticNetRecommender(URM_train)
# slim_model_all.load_model('result_experiments/SLIM/', 'slim_300_2_weights')

SLIMElasticNetRecommender: Processed 8931 (23.4%) in 5.00 min. Items per second: 29.76
SLIMElasticNetRecommender: Processed 18026 (47.3%) in 10.00 min. Items per second: 30.04
SLIMElasticNetRecommender: Processed 27272 (71.5%) in 15.00 min. Items per second: 30.30
SLIMElasticNetRecommender: Processed 36640 (96.1%) in 20.00 min. Items per second: 30.53
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 20.85 min. Items per second: 30.47


In [12]:
# evaluateRecommender returns a pair; only the first element (the metrics table,
# indexed by cutoff) is kept.
slim_train_res, _ = evaluator_validation.evaluateRecommender(slim_model_train)

EvaluatorHoldout: Processed 34728 (100.0%) in 13.72 sec. Users per second: 2532


In [13]:
# Validation MAP at cutoff 10 for the SLIM model fitted on URM_train.
slim_train_res["MAP"]

cutoff
10    0.030621
Name: MAP, dtype: object

In [14]:
# Cross-check with the hand-written metric. AP divides by min(#relevant, #recommended),
# so this value is not expected to match the evaluator's "MAP" column shown above.
evaluate_algorithm(URM_validation, slim_model_train)

0.06297588460871925

In [15]:
# Same SLIM configuration refitted on train + validation; this model is scored on URM_test.
slim_model_train_val = SLIMElasticNetRecommender(URM_train_validation)
slim_model_train_val.fit(l1_ratio=0.11006885790633625, alpha=0.0002551115306127753, topK = 307)

SLIMElasticNetRecommender: Processed 7900 (20.7%) in 5.00 min. Items per second: 26.33
SLIMElasticNetRecommender: Processed 15817 (41.5%) in 10.00 min. Items per second: 26.36
SLIMElasticNetRecommender: Processed 23548 (61.8%) in 15.00 min. Items per second: 26.16
SLIMElasticNetRecommender: Processed 25736 (67.5%) in 20.00 min. Items per second: 21.44
SLIMElasticNetRecommender: Processed 33919 (89.0%) in 25.00 min. Items per second: 22.61
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 27.65 min. Items per second: 22.98


In [16]:
# Score the train+validation SLIM model on the test split; the second return value is discarded.
slim_train_val_res, _ = evaluator_test.evaluateRecommender(slim_model_train_val)

EvaluatorHoldout: Processed 35209 (100.0%) in 15.07 sec. Users per second: 2337


In [17]:
# Test MAP at cutoff 10 for the SLIM model fitted on train + validation.
slim_train_val_res["MAP"]

cutoff
10    0.047714
Name: MAP, dtype: object

In [18]:
# Hand-written MAP (normalised by min(#relevant, #recommended)) on the test split.
evaluate_algorithm(URM_test, slim_model_train_val)

0.08366541182230076

In [19]:
# RP3beta with one shared configuration, fitted twice:
#   - on URM_train            -> evaluated on the validation split
#   - on URM_train_validation -> evaluated on the test split
rp3_beta_train = RP3betaRecommender(URM_train)
rp3_beta_train.fit(alpha=0.34989902568351894, beta=0.1817338725671425, topK=12, min_rating=0.0, implicit=False, normalize_similarity=True)

rp3_beta_train_val = RP3betaRecommender(URM_train_validation)
rp3_beta_train_val.fit(alpha=0.34989902568351894, beta=0.1817338725671425, topK=12, min_rating=0.0, implicit=False, normalize_similarity=True)

# Alternative: load previously saved weights instead of refitting.
#rp3_beta_train.load_model('result_experiments/RP3beta/', 'rp3beta_10_train_weights')
# rp3_beta_all = RP3betaRecommender(URM_train)
# rp3_beta_all.load_model('result_experiments/RP3beta/', 'rp3beta_10_weights')

RP3betaRecommender: Similarity column 38121 (100.0%), 4374.34 column/sec. Elapsed time 8.71 sec
RP3betaRecommender: Similarity column 38121 (100.0%), 3967.24 column/sec. Elapsed time 9.61 sec


In [29]:
# Hand-written MAP for RP3beta: (validation score of the train model, test score of the train+validation model).
evaluate_algorithm(URM_validation, rp3_beta_train), evaluate_algorithm(URM_test, rp3_beta_train_val)

(0.05821489564907338, 0.07579670317181916)

In [22]:
# Evaluator metrics for both RP3beta models; the second return value of each call is discarded.
rp3_beta_train_res, _ = evaluator_validation.evaluateRecommender(rp3_beta_train)
rp3_beta_train_val_res, _ = evaluator_test.evaluateRecommender(rp3_beta_train_val)

EvaluatorHoldout: Processed 34728 (100.0%) in 8.72 sec. Users per second: 3982
EvaluatorHoldout: Processed 35209 (100.0%) in 8.99 sec. Users per second: 3915


In [23]:
# RP3beta MAP at cutoff 10: (validation, test).
rp3_beta_train_res["MAP"], rp3_beta_train_val_res["MAP"]

(cutoff
 10    0.028152
 Name: MAP, dtype: object,
 cutoff
 10    0.042773
 Name: MAP, dtype: object)

In [24]:
# Test MAP only; this repeats the second element displayed by the previous cell.
rp3_beta_train_val_res["MAP"]

cutoff
10    0.042773
Name: MAP, dtype: object

In [25]:
# Stored ItemKNNCF hyper-parameter search results (one row per tried configuration).
# NOTE(review): presumably produced by a separate tuning run — confirm the source.
ItemKNNCFRecommender_results = pd.read_csv("result_experiments/ItemKNNCFRecommender/results.csv")

In [26]:
# Best MAP first; only the first 15 columns are displayed.
ItemKNNCFRecommender_results.sort_values('MAP', ascending=False).iloc[:, : 15]

Unnamed: 0,similarity,topK,shrink,normalize,time,cutoff,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE
1388,jaccard,5,12,False,2024-12-05 05:49:14,10,0.097692,0.146128,0.124175,0.051709,0.075515,0.273210,0.142178,0.109353,0.522025
1387,jaccard,5,12,True,2024-12-05 05:48:58,10,0.097692,0.146128,0.124175,0.051709,0.075515,0.273210,0.142178,0.109353,0.522025
2072,tanimoto,5,12,False,2024-12-05 10:34:02,10,0.097692,0.146128,0.124175,0.051709,0.075515,0.273210,0.142178,0.109353,0.522025
2071,tanimoto,5,12,True,2024-12-05 10:33:46,10,0.097692,0.146128,0.124175,0.051709,0.075515,0.273210,0.142178,0.109353,0.522025
1392,jaccard,5,14,False,2024-12-05 05:50:18,10,0.097816,0.146319,0.124356,0.051682,0.075437,0.272863,0.142174,0.109501,0.522391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,pearson,11,0,True,2024-12-05 03:53:43,10,0.000793,0.001144,0.000934,0.000252,0.000371,0.002408,0.001007,0.000858,0.007590
916,pearson,11,0,False,2024-12-05 03:53:57,10,0.000793,0.001144,0.000934,0.000252,0.000371,0.002408,0.001007,0.000858,0.007590
917,pearson,11,5,True,2024-12-05 03:54:11,10,0.000793,0.001144,0.000934,0.000252,0.000371,0.002408,0.001007,0.000858,0.007590
918,pearson,11,5,False,2024-12-05 03:54:26,10,0.000793,0.001144,0.000934,0.000252,0.000371,0.002408,0.001007,0.000858,0.007590


In [42]:
# The stored search table above ranks similarity='jaccard', topK=5, shrink=12, normalize=False first,
# but the configuration actually fitted below is a Tversky similarity with topK=4 and shrink=18.
# NOTE(review): the origin of the Tversky hyper-parameters is not shown in this notebook.

knn_train = ItemKNNCFRecommender(URM_train)
knn_train.fit(similarity='tversky', topK=4, shrink=18,
                  feature_weighting='none', tversky_alpha=0.1263621,
                  tversky_beta=1.72181042,
                  normalize=False)

# Same configuration refitted on train + validation for the test-split evaluation.
knn_train_val = ItemKNNCFRecommender(URM_train_validation)
knn_train_val.fit(similarity='tversky', topK=4, shrink=18,
                  feature_weighting='none', tversky_alpha=0.1263621,
                  tversky_beta=1.72181042,
                  normalize=False)


# Alternative: load previously saved weights instead of refitting.
# ItemKNNCF_train = ItemKNNCFRecommender(URM_train)
# ItemKNNCF_train.load_model("result_experiments/ItemKNNCFRecommender/", "ItemKNNCF_5_train_weights")

Similarity column 38121 (100.0%), 5021.78 column/sec. Elapsed time 7.59 sec
Similarity column 38121 (100.0%), 4593.81 column/sec. Elapsed time 8.30 sec


In [43]:
# Hand-written MAP for ItemKNN: (validation score of the train model, test score of the train+validation model).
evaluate_algorithm(URM_validation, knn_train), evaluate_algorithm(URM_test, knn_train_val)

(0.057754182629180306, 0.07549006255404529)

In [44]:
# Evaluator metrics for both ItemKNN models; the second return value of each call is discarded.
knn_train_res, _ = evaluator_validation.evaluateRecommender(knn_train)
knn_train_val_res, _ = evaluator_test.evaluateRecommender(knn_train_val)

EvaluatorHoldout: Processed 34728 (100.0%) in 8.80 sec. Users per second: 3948
EvaluatorHoldout: Processed 35209 (100.0%) in 8.79 sec. Users per second: 4006


In [45]:
# ItemKNN MAP at cutoff 10: (validation, test).
knn_train_res["MAP"], knn_train_val_res["MAP"]

(cutoff
 10    0.027949
 Name: MAP, dtype: object,
 cutoff
 10    0.042742
 Name: MAP, dtype: object)

# Optuna

In [46]:
# Per-trial log filled by the Optuna objective: one independent, initially empty
# list per column.
results_dict = {
    column: []
    for column in ('Iteration', 'SLIM_w', 'RP3b_w', 'ItemKNN_w', 'MAP_val', 'MAP_test')
}

In [47]:
# Best configuration seen so far on the test split; every field starts at 0 and is
# overwritten by the Optuna objective whenever a trial beats the stored 'MAP_test'.
new_best_on_test = dict.fromkeys(('SLIM_w', 'RP3b_w', 'ItemKNN_w', 'MAP_test'), 0)

In [48]:
# Fraction of best_on_val that a trial's validation MAP must reach before the objective
# also evaluates the hybrid on the test split. (The name is spelled "treshold"; it is
# read under that exact name inside objective_function_weight_hybrid.)
improving_treshold=0.85 

In [49]:
import optuna as op

# Reference validation MAP for the objective's test-evaluation gate.
# NOTE(review): nothing in the visible code reassigns it, so the gate stays fixed at
# 0.03 * improving_treshold for every trial.
best_on_val = 0.03

def objective_function_weight_hybrid(trial):
    """Optuna objective: validation MAP@10 of a weighted SLIM / RP3beta / ItemKNN hybrid.

    Three values ``x_i`` are suggested in [0, 1]; the weights are ``-log(x_i)``
    normalised to sum to 1. The hybrid similarity is the weighted sum of the three
    models' ``W_sparse`` matrices.

    When the validation MAP reaches ``best_on_val * improving_treshold`` the same
    weights are also applied to the train+validation models and scored on the test
    split; otherwise ``0.0`` is logged as the test MAP.

    Reads and mutates notebook globals: ``results_dict`` (one entry per column per
    completed trial) and ``new_best_on_test`` (overwritten when the test MAP improves).
    It also reads the fitted ``*_train`` / ``*_train_val`` models, both evaluators,
    ``best_on_val`` and ``improving_treshold``.

    NOTE(review): ``best_on_val`` is never updated here, so the gate is constant.
    NOTE(review): picking the final weights by test MAP makes that test score an
    optimistic estimate.

    :param trial: the Optuna trial supplying the suggestions
    :return: MAP at cutoff 10 on the validation split
    """
    weight_keys = ('SLIM_w', 'RP3b_w', 'ItemKNN_w')
    n = len(weight_keys)

    # Clamp the suggestions away from 0 so that -log never produces inf
    # (which would turn the normalised weights into NaN).
    smallest = np.finfo(float).tiny
    x = [-np.log(max(trial.suggest_float(f"x_{i}", 0, 1), smallest)) for i in range(n)]

    total = sum(x)
    if total > 0:
        weights = [x_i / total for x_i in x]
    else:
        # Every suggestion was exactly 1, so every -log is 0: fall back to equal weights.
        weights = [1.0 / n] * n

    for i in range(n):
        trial.set_user_attr(f"weights_{i}", weights[i])

    #PRINT WEIGHTS CHOSEN ###################################################
    print("__________Iteration " + str(trial.number) + "______________")
    print("SLIM weight: "+str(weights[0]))
    print("RP3b weight: "+str(weights[1]))
    print("ItemKNN weight: "+str(weights[2]))

    #BUILD HYBRID ###########################################################
    new_similarity_train = weights[0] * slim_model_train.W_sparse + weights[1] * rp3_beta_train.W_sparse + weights[2] * knn_train.W_sparse
    recommender_object_train = ItemKNNCustomSimilarityRecommender(URM_train)
    recommender_object_train.fit(new_similarity_train)
    res_on_val, _ = evaluator_validation.evaluateRecommender(recommender_object_train)
    map_val = res_on_val['MAP'][10]

    # 0.0 marks "not evaluated on test" in the log.
    map_test = 0.0
    if map_val >= best_on_val * improving_treshold:
        new_similarity_train_val = weights[0] * slim_model_train_val.W_sparse + weights[1] * rp3_beta_train_val.W_sparse + weights[2] * knn_train_val.W_sparse
        recommender_object_train_val = ItemKNNCustomSimilarityRecommender(URM_train_validation)
        recommender_object_train_val.fit(new_similarity_train_val)
        res_on_test, _ = evaluator_test.evaluateRecommender(recommender_object_train_val)
        map_test = res_on_test["MAP"][10]

        if map_test > new_best_on_test['MAP_test']:
            new_best_on_test.update(zip(weight_keys, weights))
            new_best_on_test['MAP_test'] = map_test
            print("New best on test found! Score is "+ str(map_test))
            print(f"weights: SLIM_w: {weights[0]}, RP3b_w: {weights[1]}, ItemKNN_w: {weights[2]}")

    # Log the whole row only after the trial has completed, so a failure above cannot
    # leave the results_dict columns with different lengths.
    results_dict['Iteration'].append(trial.number)
    for key, weight in zip(weight_keys, weights):
        results_dict[key].append(weight)
    results_dict['MAP_val'].append(map_val)
    results_dict['MAP_test'].append(map_test)

    return map_val

In [None]:
# Create the study in this cell so it also runs on a fresh kernel (previously `study`
# was only defined in a commented-out line). The sampler is seeded so the sequence of
# suggested weights is repeatable.
# Re-run the results_dict / new_best_on_test cells first to start from clean logs.
study = op.create_study(direction="maximize", sampler=op.samplers.TPESampler(seed=42))
study.optimize(objective_function_weight_hybrid, n_trials=400)

In [59]:
# Weights and test MAP of the best trial recorded by the objective.
new_best_on_test

{'SLIM_w': 0.8672976014690289,
 'RP3b_w': 0.10507625421076075,
 'ItemKNN_w': 0.027626144320210273,
 'MAP_test': 0.04792289534556976}

In [52]:
# slim_model_train.save_model("result_experiments/SLIM/slim_307_train_ens")
# slim_model_train_val.save_model("result_experiments/SLIM/slim_307_train_val_ens")

SLIMElasticNetRecommender: Saving model in file 'result_experiments/SLIM/slim_307_train_ensSLIMElasticNetRecommender'
SLIMElasticNetRecommender: Saving complete
SLIMElasticNetRecommender: Saving model in file 'result_experiments/SLIM/slim_307_train_val_ensSLIMElasticNetRecommender'
SLIMElasticNetRecommender: Saving complete


In [53]:
# knn_train.save_model("result_experiments/ItemKNNCFRecommender/knn_train_ens")
# knn_train_val.save_model("result_experiments/ItemKNNCFRecommender/knn_train_val_ens")

ItemKNNCFRecommender: Saving model in file 'result_experiments/ItemKNNCFRecommender/knn_train_ensItemKNNCFRecommender'
ItemKNNCFRecommender: Saving complete
ItemKNNCFRecommender: Saving model in file 'result_experiments/ItemKNNCFRecommender/knn_train_val_ensItemKNNCFRecommender'
ItemKNNCFRecommender: Saving complete


In [55]:
# rp3_beta_train.save_model("result_experiments/RP3beta/rp3beta_train_ens")
# rp3_beta_train_val.save_model("result_experiments/RP3beta/rp3beta_train_val_ens")

RP3betaRecommender: Saving model in file 'result_experiments/RP3beta/rp3beta_train_ensRP3betaRecommender'
RP3betaRecommender: Saving complete
RP3betaRecommender: Saving model in file 'result_experiments/RP3beta/rp3beta_train_val_ensRP3betaRecommender'
RP3betaRecommender: Saving complete


In [57]:
# results_df=pd.DataFrame(results_dict)
# results_df.to_csv("result_experiments/ensemble_SLIM_RP3_KNNCFR/results_prof_eval_11_12_24.csv",sep=',',index=False)

In [63]:
# sps.save_npz('URM_train_ens.npz', URM_train)
# sps.save_npz('URM_train_val_ens.npz', URM_train_validation)
# sps.save_npz('URM_test_ens.npz', URM_test)

In [58]:
# Print the three best hybrid weights, one per line, in SLIM / RP3beta / ItemKNN order.
print(
    *(new_best_on_test[weight_key] for weight_key in ('SLIM_w', 'RP3b_w', 'ItemKNN_w')),
    sep="\n",
)

0.8672976014690289
0.10507625421076075
0.027626144320210273


In [70]:
# Build the per-trial table in this cell: results_df was otherwise only defined in a
# commented-out cell, so this line raised NameError on Restart & Run All.
results_df = pd.DataFrame(results_dict)
results_df.sort_values('MAP_val', ascending=False)

Unnamed: 0,Iteration,SLIM_w,RP3b_w,ItemKNN_w,MAP_val,MAP_test
267,267,0.000138,0.999847,0.000015,0.128963,0.0
257,257,0.000048,0.999820,0.000133,0.128963,0.0
297,297,0.000019,0.999837,0.000144,0.128961,0.0
260,260,0.000519,0.999391,0.000089,0.128960,0.0
253,253,0.000198,0.999592,0.000210,0.128958,0.0
...,...,...,...,...,...,...
9,9,0.415903,0.133992,0.450105,0.089874,0.0
1,1,0.752678,0.061503,0.185818,0.089587,0.0
237,237,0.052045,0.195269,0.752685,0.086758,0.0
30,30,0.013971,0.168956,0.817073,0.084429,0.0


In [70]:
def write_submission_lib_model(
    trained_model,
    filename: str = "submission.csv",
    target_users_path: str = "../data/data_target_users_test.csv",
    output_dir: str = "../submissions",
    cutoff: int = 10,
) -> None:
    """Builds the submission file from a trained recommender model. The file is saved in a CSV format.

    The output has a ``user_id,item_list`` header followed by one line per target
    user, with the recommended item ids separated by single spaces.

    :param trained_model: A fitted recommender model exposing ``recommend(user_id, cutoff=...)``
    :type trained_model: RecommenderModel
    :param filename: The filename of the submission for this particular recommender model
    :type filename: str
    :param target_users_path: CSV file (with a header row) listing the target user ids
    :type target_users_path: str
    :param output_dir: Directory the submission is written to; created if missing
    :type output_dir: str
    :param cutoff: Number of items requested for every user
    :type cutoff: int
    """
    target_users_test = pd.read_csv(target_users_path).to_numpy().ravel()

    # Keep the per-user results as plain lists: wrapping them in np.array fails as soon
    # as one user receives fewer items than the others (ragged rows).
    # All recommendations are computed before the file is opened, so a failing
    # recommend() call does not leave a partial submission behind.
    recommendations = [
        trained_model.recommend(user_id, cutoff=cutoff) for user_id in target_users_test
    ]

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, filename), "w") as f:
        f.write("user_id,item_list\n")
        for user_id, recs in zip(target_users_test, recommendations):
            f.write(f"{user_id},{' '.join(map(str, recs))}\n")

In [65]:
# Final SLIM model for the submission: same configuration as above, fitted on all
# available interactions (URM_all).
SLIM_all = SLIMElasticNetRecommender(URM_all)
SLIM_all.fit(l1_ratio=0.11006885790633625, alpha=0.0002551115306127753, topK = 307)

SLIMElasticNetRecommender: Processed 7098 (18.6%) in 5.00 min. Items per second: 23.66
SLIMElasticNetRecommender: Processed 14067 (36.9%) in 10.00 min. Items per second: 23.44
SLIMElasticNetRecommender: Processed 21113 (55.4%) in 15.00 min. Items per second: 23.46
SLIMElasticNetRecommender: Processed 28084 (73.7%) in 20.00 min. Items per second: 23.40
SLIMElasticNetRecommender: Processed 35160 (92.2%) in 25.00 min. Items per second: 23.44
SLIMElasticNetRecommender: Processed 38121 (100.0%) in 27.20 min. Items per second: 23.36


In [66]:
# Final RP3beta model for the submission, fitted on URM_all with the configuration used above.
RP3_beta_all = RP3betaRecommender(URM_all)
RP3_beta_all.fit(alpha=0.34989902568351894, beta=0.1817338725671425, topK=12, min_rating=0.0, implicit=False, normalize_similarity=True)

RP3betaRecommender: Similarity column 38121 (100.0%), 3771.53 column/sec. Elapsed time 10.11 sec


In [67]:
# Final ItemKNN model for the submission, fitted on URM_all with the Tversky configuration used above.
KNN_all = ItemKNNCFRecommender(URM_all)
KNN_all.fit(similarity='tversky', topK=4, shrink=18,
                  feature_weighting='none', tversky_alpha=0.1263621,
                  tversky_beta=1.72181042,
                  normalize=False)

Similarity column 38121 (100.0%), 4234.02 column/sec. Elapsed time 9.00 sec


In [68]:
# Earlier candidate kept for reference — iter 239, score 0.04801274656744886:
# 0.7380796311636464 * SLIM_all.W_sparse + 0.21602197286950797 * RP3_beta_all.W_sparse + 0.04589839596684567 * KNN_all.W_sparse

# Final hybrid: weighted sum of the three similarity matrices fitted on URM_all.
# The weights are hard-coded copies of the SLIM_w / RP3b_w / ItemKNN_w values recorded
# in new_best_on_test in this notebook's saved output, so re-running the study does
# not change the submission unless these literals are edited.
similarity_all = 0.8672976014690289 * SLIM_all.W_sparse + 0.10507625421076075 * RP3_beta_all.W_sparse + 0.027626144320210273 * KNN_all.W_sparse
recommender_best = ItemKNNCustomSimilarityRecommender(URM_all)
recommender_best.fit(similarity_all)

In [126]:
# NOTE(review): recommender_best is fitted on URM_all, the matrix URM_test was split
# from, so this is not a held-out evaluation. The recorded run reports 0.0 for every
# accuracy metric — presumably because the recommender filters out items already seen
# in URM_all, which include all test interactions — TODO confirm.
evaluator_test.evaluateRecommender(recommender_best)

EvaluatorHoldout: Processed 35171 (100.0%) in 15.52 sec. Users per second: 2266


(       PRECISION PRECISION_RECALL_MIN_DEN RECALL  MAP MAP_MIN_DEN  MRR NDCG  \
 cutoff                                                                        
 10           0.0                      0.0    0.0  0.0         0.0  0.0  0.0   
 
          F1 HIT_RATE ARHR_ALL_HITS  ... COVERAGE_USER COVERAGE_USER_HIT  \
 cutoff                              ...                                   
 10      0.0      0.0           0.0  ...       0.98419               0.0   
 
        USERS_IN_GT DIVERSITY_GINI SHANNON_ENTROPY RATIO_DIVERSITY_HERFINDAHL  \
 cutoff                                                                         
 10         0.98419       0.263098       13.599568                   0.999901   
 
        RATIO_DIVERSITY_GINI RATIO_SHANNON_ENTROPY RATIO_AVERAGE_POPULARITY  \
 cutoff                                                                       
 10                 0.430335              0.921023                 1.229809   
 
        RATIO_NOVELTY  
 cutoff             

In [71]:
# Write the final hybrid's recommendations to ../submissions/ensemble_SLIM_RP3beta_KNN_7.csv
write_submission_lib_model(recommender_best, "ensemble_SLIM_RP3beta_KNN_7.csv")