# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] utwórz wirtualne środowisko:
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas scipy matplotlib`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety
import numpy as np
import pandas as pd

from scipy.sparse.linalg import svds
from collections import defaultdict
from random import sample
from statistics import mean, stdev

In [2]:
# ścieżka do datasetu
PATH = 'ml-latest-small'

In [3]:
# wczytujemy oceny użytkowników i obliczamy (za pomocą dekompozycji macierzy) 
# wszystkie przewidywane oceny filmów
def read_ratings(k=600, scale_factor=2.0, print_stats=True):
    # idea: https://www.kaggle.com/code/indralin/movielens-project-1-2-collaborative-filtering
    reviews = pd.read_csv(
        f'{PATH}/ratings.csv',
        names=['userId', 'movieId', 'rating', 'time'],
        delimiter=',',
        engine='python',
        skiprows=1
    )
    
    reviews.drop(['time'], axis=1, inplace=True)
    reviews_no, _ = reviews.shape
    reviews_matrix = reviews.pivot(index='userId', columns='movieId', values='rating')
    movies = reviews_matrix.columns
    users = reviews_matrix.index
    users_no, movies_no = reviews_matrix.shape
    print(f'Got {reviews_no} reviews for {movies_no} movies and {users_no} users.')

    user_ratings_mean = np.nanmean(reviews_matrix.values, axis=1)
    normalized_reviews_matrix = np.nan_to_num(
        reviews_matrix.values - user_ratings_mean.reshape(-1, 1), 0.0
    )

    U, sigma, Vt = svds(normalized_reviews_matrix, k=k)
    sigma = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1).clip(0.5, 5.0)
    mean_square_error = np.nanmean(np.square(predicted_ratings - reviews_matrix.values))
    std_square_error = np.nanstd(np.square(predicted_ratings - reviews_matrix.values))
    print(f'Reviews prediction mean square error = {mean_square_error}')
    print(f'Reviews prediction standard deviation of square error = {std_square_error}')

    if print_stats:
        stats = [
            ('metric', 'dataset', 'prediction'),
            ('avg', np.nanmean(reviews_matrix), np.mean(predicted_ratings)),
            ('st_dev', np.nanstd(reviews_matrix), np.std(predicted_ratings)),
            ('median', np.nanmedian(reviews_matrix), np.median(predicted_ratings)),
            ('p25', np.nanquantile(reviews_matrix, 0.25), np.quantile(predicted_ratings, 0.25)),
            ('p75', np.nanquantile(reviews_matrix, 0.75), np.quantile(predicted_ratings, 0.75))
        ]

        print('Stats (for ratings in original range [0.5, 5.0]):')
        print('\n'.join([str(s) for s in stats]))

    # zmieniamy zakres wartości na {1, 2, ..., 10}
    rounded_predictions = np.rint(scale_factor * predicted_ratings)

    return pd.DataFrame(data=rounded_predictions, index=list(users), columns=list(movies))


ratings = read_ratings()

# dostęp do danych:
# ratings[movieId][userId] pobiera 1 wartość
# ratings.loc[:, movieId] pobiera wektor dla danego filmu
# ratings.loc[userId, :] pobiera wektor dla danego użytkownika

ratings

Got 100836 reviews for 9724 movies and 610 users.
Reviews prediction mean square error = 1.657778784292441e-05
Reviews prediction standard deviation of square error = 0.0007928950536518454
Stats (for ratings in original range [0.5, 5.0]):
('metric', 'dataset', 'prediction')
('avg', np.float64(3.501556983616962), np.float64(3.657222337747399))
('st_dev', np.float64(1.042524069618056), np.float64(0.49546237560971024))
('median', np.float64(3.5), np.float64(3.705224008811769))
('p25', np.float64(3.0), np.float64(3.357451718316402))
('p75', np.float64(4.0), np.float64(3.999981626883001))


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,8.0,9.0,8.0,9.0,9.0,8.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
2,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
3,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
4,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
5,8.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,7.0,7.0,7.0,7.0,7.0,5.0,7.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
607,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
608,5.0,4.0,4.0,6.0,6.0,6.0,6.0,6.0,6.0,8.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
609,6.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [4]:
# wczytujemy nazwy filmów i kategorie
movies_metadata = pd.read_csv(f'{PATH}/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# wczytujemy przykładowe grupy użytkowników
groups = pd.read_csv('groups.csv').values.tolist()
groups

[[111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [6]:
# przygotowujemy funkcję pomocniczą
def describe_group(group, N=10):
    print(f'\nUser ids: {group}')
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()

    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [
        (movies_metadata['title'][movie_id], average_scores[movie_id])
        for movie_id in list(average_scores[-N:].index)
    ]

    worst_movies = [
        (movies_metadata['title'][movie_id], average_scores[movie_id])
        for movie_id in list(average_scores[:N].index)
    ]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')


describe_group(groups[5])


User ids: [269, 360, 469, 287, 308]

Mean ratings deviation: 1.1259149574579788
Median ratings deviation: 1.0954451150103321
Standard deviation of ratings deviation: 0.17836724055768716

Best movies:
Forrest Gump (1994), 8.2*
Toy Story (1995), 8.2*
Braveheart (1995), 8.0*
Willy Wonka & the Chocolate Factory (1971), 8.0*
Terminator 2: Judgment Day (1991), 7.8*
Schindler's List (1993), 7.8*
Shawshank Redemption, The (1994), 7.6*
Twelve Monkeys (a.k.a. 12 Monkeys) (1995), 7.6*
Nixon (1995), 7.6*
Dances with Wolves (1990), 7.6*

Worst movies:
Broken Arrow (1996), 5.2*
Cable Guy, The (1996), 5.4*
Sleepy Hollow (1999), 5.4*
The Devil's Advocate (1997), 5.4*
Nutty Professor, The (1996), 5.6*
Mission: Impossible (1996), 5.6*
Matrix Revolutions, The (2003), 5.8*
Cheech & Chong's The Corsican Brothers (1984), 5.8*
Hellbound: Hellraiser II (1988), 5.8*
Masquerade (1988), 5.8*


## Część 2. - algorytmy proste

In [7]:
# zdefiniujmy interfejs dla wszystkich algorytmów rekomendacyjnych
class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porównania
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
    
    
    def recommend(self, movies, ratings, group, size):
        return sample(sorted(movies), size)

In [8]:
# algorytm rekomendujący filmy o najwyższej średniej ocen
class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group, :]
        average_ratings = group_ratings.mean(axis=0)
        average_ratings = average_ratings.sort_values()
        recommendation = [movies[movie_id] for movie_id in list(average_ratings[-size:].index)]

        return recommendation

In [9]:
# algorytm rekomendujący filmy o najwyższej średniej ocen,
# ale równocześnie wykluczający te filmy, które otrzymały choć jedną ocenę poniżej thresholdu
class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
    
     
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group, :]

        # wykluczenie filmów z oceną poniżej thresholdu
        group_ratings = group_ratings[group_ratings >= self.score_threshold]

        average_ratings = group_ratings.mean(axis=0)
        average_ratings = average_ratings.sort_values()
        recommendation = [movies[movie_id] for movie_id in list(average_ratings[-size:].index)]

        return recommendation

In [10]:
# algorytm uwzględniający preferencje tylko jednego użytkownika w każdej iteracji
class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
    
     
    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.loc[group, :].copy()
        recommendation = []
        n = len(group)

        for i in range(size):
            user = group[i%n]
            user_ratings = group_ratings.loc[user, :]
            user_ratings = user_ratings.sort_values()
            best_movie_id = list(user_ratings[-1:].index)[0]
            recommendation.append(movies[best_movie_id])
            group_ratings.drop(best_movie_id, axis=1, inplace=True)
        
        return recommendation

In [11]:
# wybrany algorytm wyborczy (dyktatura, Borda, Copeland)
class VotingRecommender(Recommender):
    def __init__(self):
        # nazwa wybranego algorytmu
        self.name = 'borda'
    
    
    def recommend(self, movies, ratings, group, size):
        N = ratings.shape[1] - 1
        movies_points = pd.Series(0.0, index=ratings.columns)

        for user in group:
            user_ratings = ratings.loc[user, :]
            user_ratings = user_ratings.sort_values(ascending=False)

            user_points = pd.Series(
                [N - i for i in range(N+1)],
                index=user_ratings.index
            )

            movies_points += user_points

        movies_points = movies_points.sort_values()
        recommendation = [movies[movie_id] for movie_id in list(movies_points[-size:].index)]

        return recommendation

In [12]:
# algorytm zachłanny, aproksymujący metodę Proportional Approval Voting
# w każdej iteracji wybieramy ten film, który najbardziej zwiększa zadowolenie zgodnie z punktacją PAV
class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'
    
     
    def recommend(self, movies, ratings, group, size):
        user_accepted = {user: 1 for user in group}
        group_ratings = ratings.loc[group, :].copy()
        recommendation = [None] * size

        for i in range(size):
            movie_points = defaultdict(int)

            for user in group:
                for movie_id in list(group_ratings.loc[user, :].index):
                    if ratings.loc[user, movie_id] >= self.threshold:
                        movie_points[movie_id] += 1. / user_accepted[user]
            
            best_movie_id = sorted(movie_points.items(), key=lambda item: -item[1])[0][0]
            recommendation[i] = movies[best_movie_id]

            for user in group:
                if ratings.loc[user, best_movie_id] >= self.threshold:
                    user_accepted[user] += 1

            group_ratings.drop(best_movie_id, axis=1, inplace=True)

        return recommendation

## Część 3. - funkcje celu

In [13]:
# dwie funkcje pomocnicze:
# - znajdująca ulubione filmy danego użytkownika
# - obliczająca sumę ocen wystawionych przez użytkownika wszystkim filmom w rekomendacji
def top_n_movies_for_user(ratings, movies, user, n):
    user_ratings = ratings.loc[user, :]
    user_ratings = user_ratings.sort_values()

    return [movies[movie_id] for movie_id in list(user_ratings[-n:].index)]


def total_score(recommendation, user, movies, ratings):
    movies_idxs = [idx for idx in movies.index if movies[idx] in recommendation]
    user_ratings = ratings.loc[user, movies_idxs]

    return user_ratings.sum(axis=0)

In [14]:
# funkcja obliczająca zadowolenie pojedynczego użytkownika - iloraz zadowolenia
# z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user, movies, ratings):
    recommendation_size = len(recommendation)
    ideal_recommendation = top_n_movies_for_user(ratings, movies, user, recommendation_size)
    numerator = total_score(recommendation, user, movies, ratings)
    denominator = total_score(ideal_recommendation, user, movies, ratings)
    
    return numerator / denominator


# funkcja celu - średnia z zadowolenia wszystkich użytkowników w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    n = len(group)
    users_satisfaction = [None] * n

    for i, user in enumerate(group):
        users_satisfaction[i] = overall_user_satisfaction(recommendation, user, movies, ratings)

    return sum(users_satisfaction) / n


# funkcja celu - różnica między maksymalnym i minimalnym zadowoleniem w grupie
def group_disagreement(recommendation, group, movies, ratings):
    n = len(group)
    users_satisfaction = [None] * n

    for i, user in enumerate(group):
        users_satisfaction[i] = overall_user_satisfaction(recommendation, user, movies, ratings)

    return max(users_satisfaction) - min(users_satisfaction)

## Część 4. - Sequential Hybrid Aggregation

In [15]:
# algorytm balansujący pomiędzy wyborem elementów o najwyższej średniej ocen i o najwyższej 
# minimalnej ocenie, wyliczający w każdej iteracji parametr alfa - jak na wykładzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    

    def recommend(self, movies, ratings, group, size):
        alpha = 0
        recommendation = []
        group_ratings = ratings.loc[group, :].copy()

        for _ in range(size):
            movie_scores = {}

            for movie_id in group_ratings.columns:
                movie_ratings = group_ratings.loc[:, movie_id]
                average_score = movie_ratings.mean(axis=None)
                least_score = movie_ratings.min(axis=None)
                movie_scores[movie_id] = (1 - alpha) * average_score + alpha * least_score

            best_movie_id = sorted(movie_scores.items(), key=lambda item: -item[1])[0][0]
            recommendation.append(movies[best_movie_id])
            group_ratings.drop(best_movie_id, axis=1, inplace=True)
            alpha = group_disagreement(recommendation, group, movies, ratings)
        
        return recommendation

## Część 5. - porównanie algorytmów

In [16]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# pomocnicze określenie filmów na potrzeby rekomendacji i funkcji celu
movies = movies_metadata['title']

# dla każdego algorytmu:
# - wygenerujmy jedną rekomendację dla każdej grupy
# - obliczmy wartości obu funkcji celu dla każdej rekomendacji
# - wypiszmy wyniki na konsolę
for i, recommender in enumerate(recommenders):
    print(f"Algorytm: {recommender.name}")

    for j, group in enumerate(groups):
        print(f"\tGrupa nr {j+1}: {group}")
        recommendation = recommender.recommend(movies, ratings, group, recommendation_size)

        # wartości funkcji celu
        ogs = overall_group_satisfaction(recommendation, group, movies, ratings)
        gd = group_disagreement(recommendation, group, movies, ratings)

        print(f"\t - overall group satisfaction -> {ogs:.4f}")
        print(f"\t - group disagreement -> {gd:.4f}\n")

    print()

Algorytm: random
	Grupa nr 1: [111, 307, 474, 599, 414]
	 - overall group satisfaction -> 0.6200
	 - group disagreement -> 0.2200

	Grupa nr 2: [469, 182, 232, 448, 600]
	 - overall group satisfaction -> 0.6680
	 - group disagreement -> 0.1100

	Grupa nr 3: [508, 581, 497, 402, 566]
	 - overall group satisfaction -> 0.7589
	 - group disagreement -> 0.2612

	Grupa nr 4: [300, 515, 245, 568, 507]
	 - overall group satisfaction -> 0.8698
	 - group disagreement -> 0.2424

	Grupa nr 5: [2, 371, 252, 518, 37]
	 - overall group satisfaction -> 0.8288
	 - group disagreement -> 0.1222

	Grupa nr 6: [269, 360, 469, 287, 308]
	 - overall group satisfaction -> 0.6384
	 - group disagreement -> 0.2778

	Grupa nr 7: [243, 527, 418, 118, 370]
	 - overall group satisfaction -> 0.7957
	 - group disagreement -> 0.2091

	Grupa nr 8: [186, 559, 327, 553, 314]
	 - overall group satisfaction -> 0.7650
	 - group disagreement -> 0.2750


Algorytm: average
	Grupa nr 1: [111, 307, 474, 599, 414]
	 - overall grou