In [None]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [224]:
def calculate_similarities(num_users,
                           common_ratings_mat,
                           mask,
                           item_data,
                           min_common_items=2,
                           threshold=None,
                           weight_ratio=False,
                           weight_distance = False,
                           weight_variance = False):
    '''
    Calculate similarities for the given ratings matrix
    '''
    sim_mat = np.full((num_users, num_users),np.nan)
    num_items = item_data.shape[1]
    start_time = time.time()
    sim_counter = 0
    for a in range(num_users):
        for b in range (a+1, num_users):
            # Select only given number of common items but atleast 2.
            if(common_ratings_mat[a,b] >= min_common_items):
                row_mask = mask[a] & mask[b]
                ratings_a = item_data[a, row_mask]
                ratings_b = item_data[b, row_mask]
                centered_a = ratings_a - ratings_a.mean()
                centered_b = ratings_b - ratings_b.mean()

                # Optional: Weight items which has higher variance.
                if (weight_variance):
                    item_vars = item_data[:, row_mask].var()
                    centered_a = centered_a * item_vars 
                    centered_b = centered_b * item_vars 

                if (centered_a.all() == 0 or centered_b.all() == 0):
                    pcc = 0.0
                else: 
                    pcc = np.dot(centered_a, centered_b) / (np.sqrt(np.dot(centered_a, centered_a)) * np.sqrt(np.dot(centered_b, centered_b)))
                # Optional: Use threshold value to filter similarities which are small.
                if (threshold != None and pcc < threshold):
                    continue

                # Optional: Weight pcc by the ration of common items and all items.
                if (weight_ratio):
                    pcc = pcc * num_common_mat[a,b] / num_items

                # Optional: Give more weight to a values which are closer to the 1.0.
                if (weight_distance and pcc != 1.0):
                    pcc = pcc / abs(1.0 - pcc)


                sim_mat[a,b] = pcc
                sim_mat[b,a] = pcc
                sim_counter += 1

    end_time = time.time()
    return sim_mat

In [248]:
def predict_user_items(a, user_ratings, data_mat, sims):
    '''
    Predicts all missing items for the given user using collabarative filtering.

    Input:
        a: int - user id
        user_ratings: ndarray - user a's item ratings.
        data_mat: ndarray - movies rating data. Rows are users, columns are movies and values are ratings.
        sims: ndarray  - similarity values for other users.
    '''
    user_idx = a-1
    start_time = time.time()
    sims_filter = ~np.isnan(sims[user_idx])
    mean = np.nanmean(user_ratings)
    result = []
    for i,item in enumerate(user_ratings):
        if (item == np.nan or item != 0.0):
            result.append(0.0)
            continue
        data_filter = data_mat[:,i] != 0
        pred_filter = sims_filter & data_filter
        data_p = data_mat[pred_filter]
        sim_values = sims[pred_filter]
        pred = mean + (np.nansum(sim_values[:,user_idx] * (data_p[:,i] - np.nanmean(data_p, axis=1))))/(np.nansum(sim_values[:,user_idx]))
        pred = max(0.5, min(pred, 5)) 
        result.append(pred))
    end_time = time.time()
    dt = end_time - start_time
    return result

SyntaxError: unmatched ')' (1494482501.py, line 18)

In [None]:
# Data loading
data_raw = pd.read_csv("../data/ml-latest-small/ratings.csv")
num_users = data_raw['userId'].nunique()
num_movies = data_raw['movieId'].nunique()
print(data_raw.head())
print(f"Loaded: {len(data_raw)} ratings")
print(f"Users: {num_users}")
print(f"Movies: {num_movies}")

In [None]:
# Check if the IDs are consecutive.
ids = set(range(1, num_users + 1))
users = set(data_raw['userId'])
diff = ids - users
print(len(diff))
ids = set(range(1, num_movies + 1))
movies = set(data_raw['movieId'])
diff = ids - movies
print(len(diff))

In [None]:
# Set up datastructures.
data_clean = data_raw.drop(columns='timestamp')
data_mat_pd = data_clean.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)
filter_pd = data_mat_pd != 0

data = data_mat_pd.to_numpy()
mask = filter_pd.to_numpy()
bit_mask = mask.astype(np.uint8)
num_common_items_mat = bit_mask @ bit_mask.T
num_users = data.shape[0]

In [None]:
# Set up movies
movies_df = pd.read_csv("../data/ml-latest-small/movies.csv")
# map movieId to index
movies_map = {}
movies_filter = data_clean["movieId"].drop_duplicates().sort_values()
movies_df = movies_df[movies_df["movieId"].isin(movies_filter)]
for i, movie in enumerate(movies_filter):
    movies_map[movie] = i


In [None]:
# Set up genres and embeddings for movies
genre_set = set()
for genres in movies_df['genres']:
    genre_set.update(genres.split('|'))
genres_list = sorted(list(genre_set))
genre_idx = {genre: i for i, genre in enumerate(genres_list)}
movies_df["embedding"] = [np.zeros(len(genres_list), dtype=int) for i in range(num_movies)]

for row in movies_df.itertuples():
    genres = row.genres.split('|')
    for genre in genres:
        movies_df.loc[row.Index, "embedding"][genre_idx[genre]] = 1


In [None]:
# calcuclate similarity matrix
sim_mat = calculate_similarities(num_users, num_common_items_mat, mask, data, min_common_items=10)

In [None]:
movie_counts = data_clean.groupby('movieId').size()
popular_movies = movie_counts[movie_counts >= 100].index.tolist()

In [None]:
def calculate_metrics(bit_mask, ratings, predictions):
    # 1. recognition
    recognition = np.mean(bit_mask, axis=0)
    # 2. rating
    rating = np.mean(ratings, axis=0)
    # 3. influence
    influence = bit_mask.T.dot(predictions)
    # 4. exp power
    # NotImplemented
    return np.array([recognition, rating, influence])

In [None]:
# courtesies for chatgpt for the contents of this cell.
def dominates(a, b):
    return np.all(a >= b) and np.any(a > b)

def pareto_front(matrix):
    n = matrix.shape[0]
    dominated = np.zeros(n, dtype=bool)
    
    for i in range(n):
        for j in range(n):
            if i != j and dominates(matrix[j], matrix[i]):
                dominated[i] = True
                break
    return np.where(~dominated)[0]  # indices of Pareto-optimal rows

In [247]:
# 1. Get input arrays for each user
group_users = [1, 414, 599]
input_mask = mask[group_users]
# 2 do initial recommendation
group_data = data[group_users]
predictions = []
for idx, (user, ratings) in enumerate(zip(group_users, group_data)):
    user_preds = predict_user_items(user, ratings, data, sim_mat)
    predictions.append(user_preds)
predictions = np.array([[t[1] for t in row] for row in predictions], dtype=np.float64)
rec_initial= np.argmax((0.5 * np.mean(predictions, axis=0) + 0.5 * np.min(predictions, axis=0)))
print(f"Initial recommendation: {rec_initial}")
# 3. For each item in the input calculate metrics
rec_scores = predictions[:,rec_idx]
metrics = calculate_metrics(bit_mask[group_users], data[group_users], rec_scores)
exp_data = data[group_users]
while (True):
# 4. sort the input array by metric score
    pf = pareto_front(metrics.T)
    print(f"pareto front {pf}")
# 5. removed parento front values
    exp_data[:,pf] = np.nan
    metrics[:,pf] = 0.0
    predictions2 = []
# 6. Calculate the recommendations again.
    for idx, (user, ratings) in enumerate(zip(group_users, exp_data)):
        user_preds = predict_user_items(user, ratings, data, sim_mat)
        predictions2.append(user_preds)
    predictions2 = np.array([[t[1] for t in row] for row in predictions2], dtype=np.float64)
    rec_idx2= np.argmax((0.5 * np.mean(predictions2, axis=0) + 0.5 * np.min(predictions2, axis=0)))
    print(f"recommendation after explanantion round {rec_idx2}")
    print(np.max((0.5 * np.mean(predictions2, axis=0) + 0.5 * np.min(predictions2, axis=0)))
)
# 7. If the initial recommendation have been dropped from the recommendation list:
    if(rec_idx2 != rec_initial):
        break
print("finished")
# (8. Prune the selection list.)

  pred = mean + (np.nansum(sim_values[:,user_idx] * (data_p[:,i] - np.nanmean(data_p, axis=1))))/(np.nansum(sim_values[:,user_idx]))


Initial recommendation: 7948
pareto front [6693]
recommendation after explanantion round 7948
4.99708702271909
pareto front [277]
recommendation after explanantion round 7948
4.996882079523632
pareto front [4900]
recommendation after explanantion round 7948
4.996882834313322
pareto front [ 257  602  898 2224]
recommendation after explanantion round 7948
4.9968858550256225
pareto front [314 510 659]
recommendation after explanantion round 7948
4.996888122192414
pareto front [ 520 1703 2257 3136]
recommendation after explanantion round 7948
4.996891147260688
pareto front [  43  224  899 1733 2144 3006 4607 6298 7572]
recommendation after explanantion round 7948
4.996073305533519
pareto front [ 116  506  956 1938 2077 2370 7355 8045]
recommendation after explanantion round 7948
4.995562857611577
pareto front [ 418  913 1072 3191]
recommendation after explanantion round 7948
4.995565342115098
pareto front [ 190  398  779 1153 3197 6204]
recommendation after explanantion round 7948
4.995569

KeyboardInterrupt: 

In [245]:
test = np.array([[2,4],
                 [3,5]])
np.mean(test, axis=0)

array([2.5, 4.5])