In [44]:
from datetime import datetime
import settings.config as cfg
import pandas as pd
import numpy as np
import pickle

preprocessed_dataset_folder = "preprocessed_dataset" #cfg.preprocessed_dataset_folder
individual_rs_strategy = "CB_KNN" #cfg.individual_rs_strategy
group_rs_evaluation_folds_k = 5 #cfg.group_rs_evaluation_folds_k

In [45]:
ratings_df = pd.read_csv(preprocessed_dataset_folder+"/ratings.csv")
movies_df = pickle.load(open(preprocessed_dataset_folder+"/movies.pkl", "rb"))

In [46]:
from sklearn.model_selection import StratifiedKFold
import itertools
import warnings

warnings.filterwarnings('ignore')

skf = StratifiedKFold(n_splits=group_rs_evaluation_folds_k, random_state=42, shuffle=True)

print(datetime.now(), "Folds created!")

iteration = 1

for train_index, test_index in skf.split(ratings_df, ratings_df['user']):
    if iteration == 1:
        # split train and test df
        train_df = ratings_df.iloc[train_index]
        test_df = ratings_df.iloc[test_index]
        display("train_df", train_df, "test_df", test_df)

        # getting user-items pairs in the training set
        train_set_pairs = set(list(zip(train_df['user'].values,train_df['item'].values)))

        # create test_complete_df with all the possible user-items pairs in the test_df
        user_set = set(test_df['user'].values)
        item_set = set(test_df['item'].values)

        all_ui_values = list(itertools.product(user_set, item_set))
        test_pred_df = pd.DataFrame(all_ui_values, columns=['user', 'item'])
        display("test_pred_df", test_pred_df)

        iteration += 1

2023-01-31 01:23:44.670636 Folds created!


'train_df'

Unnamed: 0,user,item,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
942216,6040,2028,5
942220,6040,1091,1
942222,6040,562,5
942223,6040,1096,4


'test_df'

Unnamed: 0,user,item,rating
5,1,1197,3
8,1,594,4
11,1,938,4
13,1,2918,4
15,1,2791,4
...,...,...,...
942191,6040,2745,3
942217,6040,1080,4
942218,6040,1089,4
942219,6040,1090,3


'test_pred_df'

Unnamed: 0,user,item
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
12194755,6040,3937
12194756,6040,3946
12194757,6040,3948
12194758,6040,3949


In [47]:
display("movies_df", movies_df)

'movies_df'

Unnamed: 0,item,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,1995,0,1,1,1,1,0,0,0,...,0.04050,0.01425,0.03050,0.03500,0.14125,0.05775,0.03900,0.02975,0.08475,0.02200
1,2,1995,0,1,0,1,0,0,0,0,...,0.05250,0.01575,0.01250,0.02000,0.12225,0.03275,0.02100,0.01100,0.10525,0.01975
2,3,1995,0,0,0,0,1,0,0,0,...,0.06275,0.01950,0.02225,0.02300,0.12200,0.03475,0.01700,0.01800,0.09100,0.01775
3,4,1995,0,0,0,0,1,0,0,1,...,0.05325,0.02800,0.01675,0.03875,0.18200,0.07050,0.01625,0.01425,0.08850,0.01500
4,5,1995,0,0,0,0,1,0,0,0,...,0.05350,0.02050,0.01425,0.02550,0.19225,0.02675,0.01625,0.01300,0.08700,0.01600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,209157,2018,0,0,0,0,0,0,0,1,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
62419,209159,2001,0,0,0,0,0,0,1,0,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
62420,209163,2018,0,0,0,0,1,0,0,1,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
62421,209169,2001,0,0,0,0,0,0,0,0,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000


In [48]:
from sklearn.neighbors import NearestNeighbors
from abc import ABC, abstractmethod

class IndividualRS(ABC):

    @staticmethod
    def train_individual_rs_and_get_predictions(recommender, training_df, test_df):
        cfg.individual_rs_strategy = recommender

        if cfg.individual_rs_strategy == "CB_KNN":
            print(cfg.individual_rs_strategy)
            rs = ContentBasedKNN()
            return rs.train_and_predict(training_df, test_df)  
        return None    
    
    @abstractmethod
    def train_and_predict(self, training_df, test_df):
        pass

class ContentBasedKNN():
    def train_and_predict(self, training_df, test_df):
        if cfg.individual_rs_validation_folds_k <=0:
            sim = similarity_matrix(test_pred_df, movies_df)
            
            print("evaluating predictions")
            test_df = predict(sim, training_df, test_pred_df)
            print("Done!")
            return test_df
        return None  


In [49]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

def similarity_matrix(test_pred_df, movies_df):
    items = movies_df["item"]
    test_items = test_pred_df["item"].unique()

    csr_movies = csr_matrix(movies_df.values)
    sim = np.zeros((len(items), len(items)))
    
    #zhruba od 800 záznamů začíná být ukládání hodnot do csr_matrix() výrazně pomalé, převod csr_matrix(sim) také pomalý
    #sim = csr_matrix((len(items), len(items)))

    #print(len(test_items))
    test_items = np.intersect1d(items, test_items)
    #print(len(test_items))

    print("Building similarity matrix...")

    for i in test_items:
        index = movies_df.loc[movies_df["item"] == i].index
        item = csr_movies[index]

        cos = cosine_similarity(item, csr_movies)
        sim[i, :] = cos[0, :]

        #print(i)
    print("Done!")

    return sim

def predict(sim, training_df, test_pred_df):
    result = pd.DataFrame(columns=test_pred_df.columns)
    pred_rating_df = pd.DataFrame(data=test_pred_df["item"].unique(), columns=["item"])
    pred_items = pred_rating_df["item"]

    users = 100
    
    #for user in test_pred_df["user"].unique():    
    for user in test_pred_df.loc[test_pred_df["user"] <= users, "user"].unique():
        user_rating_df = training_df.loc[training_df['user'] == user]
        items = user_rating_df["item"].tolist()

        simId = []
        simVal = []
        rating = []

        # čistě přes np.amax(sim[:, items]), np.argmax() bez použití cyklu -> výrazné zpomalení
        for i in pred_items:
            similarities = sim[i, items]
            sim_item = np.argmax(similarities)
            sim_val = similarities[sim_item]
            simId.append(items[sim_item])
            simVal.append(sim_val)

        user_pred_rating_df = pd.DataFrame({"user": user, "item":pred_items, "simId": simId, "simVal": simVal})

        result = pd.concat([result, user_pred_rating_df])

        #print("User", user, "Done")

    rating = result.merge(training_df, left_on=['user','simId'], right_on=['user','item'])
    result["rating"] = rating["rating"]

    result['predicted_rating'] = result['rating'] * result['simVal']

    result.reset_index(drop=True, inplace=True)
    result.drop(columns=["simId", "simVal", "rating"], axis=1, inplace=True)
    return result  

In [50]:
result = IndividualRS.train_individual_rs_and_get_predictions(individual_rs_strategy, train_df, test_pred_df)
#result.to_csv(preprocessed_dataset_folder+"/results_CB.csv", index=False)

display(result)

CB_KNN
Building similarity matrix...
Done!
evaluating predictions
Done!


Unnamed: 0,user,item,predicted_rating
0,1,1,4.999979
1,1,2,5.000000
2,1,3,4.999981
3,1,4,4.999977
4,1,5,4.999975
...,...,...,...
201895,100,3937,3.999939
201896,100,3946,3.999878
201897,100,3948,3.999884
201898,100,3949,3.999883
