In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

In [4]:
# Read the movie and rating tables from the ml-latest-small directory.
df_movies = pd.read_csv("ml-latest-small/movies.csv")
df_ratings = pd.read_csv("ml-latest-small/ratings.csv")

In [5]:
# Attach the movie title to every rating row. Any 'title' column left over from a
# previous run of this cell is dropped first, so re-running the cell does not
# produce title_x / title_y columns and break everything that reads `title`.
df_ratings = (
    df_ratings
    .drop(columns='title', errors='ignore')
    .merge(df_movies[['movieId', 'title']], how='left', on='movieId')
)

# Fixed random_state keeps the train/test split reproducible.
train_data, test_data = train_test_split(df_ratings, test_size=0.25, random_state=57)

# One row per test user holding the distinct titles that user rated in the test split.
# A plain callable is passed to agg: the dict form ({'actual': func}) on a single
# grouped column is the "nested renamer" that pandas >= 1.0 rejects.
# drop_duplicates keeps first-seen order, so the lists are deterministic
# (list(set(...)) is not, because of string hash randomisation).
test_res = (
    test_data
    .groupby('userId')['title']
    .agg(lambda titles: titles.drop_duplicates().tolist())
    .to_frame('actual')
)

In [None]:
# Task 1: baseline rating prediction (per-movie mean rating) evaluated with RMSE

In [6]:
# User-by-title rating matrices (rows: userId, columns: title, cells: rating).
# pivot_table uses its default aggregation (mean) if a (userId, title) pair repeats.
train_data_ui = pd.pivot_table(train_data, values='rating', index=['userId'], columns=['title'])
test_data_ui = pd.pivot_table(test_data, values='rating', index=['userId'], columns=['title'])

In [7]:
# Average training rating of each title, as a two-column frame: title, pred_rating.
mean_rating = train_data_ui.mean(axis=0).to_frame(name='pred_rating').reset_index()

In [8]:
# Use the per-title mean training rating as the prediction for every test row.
# A pred_rating column left by an earlier run of this cell is dropped first, so
# re-running does not create pred_rating_x / pred_rating_y and break later cells.
# Left join: titles that never occur in the training split keep NaN as prediction.
test_data = (
    test_data
    .drop(columns='pred_rating', errors='ignore')
    .merge(mean_rating, how='left', on='title')
)

In [9]:
from sklearn.metrics import mean_squared_error

In [10]:
from math import sqrt

In [15]:
def rmse(actual, pred) -> float:
    """
    Computes the root mean square error (RMSE)
    Parameters
    ----------
    actual : original true ratings or interaction values
        (list, tuple, numpy array or pandas Series).
    pred : predicted ratings or interaction values, same length as `actual`.
        pandas objects are aligned on their index by the subtraction.
    Returns:
    -------
        The root mean square error (RMSE) as a builtin float, computed over the
        pairs where both values are present (NaN pairs are skipped, whatever the
        input container). NaN is returned when there is no valid pair at all.
    """
    # Plain Python sequences do not support element-wise arithmetic.
    if isinstance(actual, (list, tuple)):
        actual = np.asarray(actual, dtype=float)
    if isinstance(pred, (list, tuple)):
        pred = np.asarray(pred, dtype=float)

    squared_errors = np.asarray((pred - actual) ** 2, dtype=float)

    # Skip missing predictions explicitly so Series and ndarray inputs agree
    # (np.mean on a Series skips NaN, on an ndarray it propagates it).
    valid = ~np.isnan(squared_errors)
    if not valid.any():
        return float("nan")

    return float(np.sqrt(squared_errors[valid].mean()))

In [16]:
# RMSE of the per-title mean prediction on the test rows, shown with 2 significant digits.
baseline_rmse = rmse(test_data.rating, test_data.pred_rating)
print("RMSE for CF: {:.2}".format(baseline_rmse))

RMSE for CF: 0.98


In [None]:
# Task 2: item-based collaborative filtering recommendations and user coverage

In [17]:
# Collects one list of recommended titles per test user (filled by the loop below).
cf_recs = list()

In [18]:
%%time
# Title-to-title Pearson correlation of user ratings; pairs with fewer than
# 100 common raters are left as NaN.
corr_matrix = train_data_ui.corr(min_periods=100, method='pearson')

Wall time: 36.6 s


In [19]:
%%time
# Start from an empty list so that re-running this cell does not append a second
# batch of recommendations (which would no longer match len(test_res)).
cf_recs = []

# Titles whose correlation row/column is entirely NaN can neither contribute a
# score nor receive one, so drop them once instead of carrying them per user.
item_sims = corr_matrix.dropna(axis=0, how='all').dropna(axis=1, how='all')

for user_id in test_res.index:
    # A test user with no training ratings cannot be scored: no recommendations.
    if user_id not in train_data_ui.index:
        cf_recs.append([])
        continue

    # Titles the user rated in the training split, with their ratings.
    user_ratings = train_data_ui.loc[user_id].dropna()

    # Only rated titles that have at least one defined correlation matter.
    rated_with_sims = user_ratings.index.intersection(item_sims.columns)
    if rated_with_sims.empty:
        cf_recs.append([])
        continue

    # Candidate score = sum over rated titles of (correlation * user's rating).
    # min_count=1 keeps candidates with no defined correlation as NaN so they can
    # be dropped instead of being scored 0.
    weighted = item_sims[rated_with_sims].mul(user_ratings.loc[rated_with_sims], axis=1)
    scores = weighted.sum(axis=1, min_count=1).dropna()

    # Keep only titles the user has not rated yet and take the 10 best.
    scores = scores.drop(index=user_ratings.index, errors='ignore')
    cf_recs.append(scores.nlargest(10).index.tolist())



Wall time: 1min 7s


In [20]:
# Store the per-user recommendation lists next to the actual test titles
# (cf_recs was filled in test_res.index order).
test_res['colab_predictions'] = pd.Series(cf_recs, index=test_res.index)

In [21]:
# Popularity baseline: every test user gets the same list, the 10 titles with
# the most ratings in the training split.
popularity_recs = train_data['title'].value_counts().index[:10].tolist()

pop_recs = [popularity_recs for _ in test_res.index]

test_res['pop_predictions'] = pop_recs
test_res.head()

Unnamed: 0_level_0,actual,colab_predictions,pop_predictions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[Beetlejuice (1988), Wizard of Oz, The (1939),...","[Shawshank Redemption, The (1994), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum..."
2,"[Exit Through the Gift Shop (2010), Shawshank ...",[],"[Shawshank Redemption, The (1994), Forrest Gum..."
3,[Wallace & Gromit: The Best of Aardman Animati...,[],"[Shawshank Redemption, The (1994), Forrest Gum..."
4,"[Lone Star (1996), Beetlejuice (1988), Sixth S...",[Star Wars: Episode VI - Return of the Jedi (1...,"[Shawshank Redemption, The (1994), Forrest Gum..."
5,"[Addams Family Values (1993), In the Name of t...","[Silence of the Lambs, The (1991), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum..."


In [22]:
# Random baseline: 10 titles per test user, drawn from the rows of df_ratings.
# Each row is one rating, so frequently rated titles are drawn more often and a
# list may contain the same title more than once.
# One seeded generator is shared across users: results are reproducible on
# re-run, while different users still receive different samples.
random_recs_rng = np.random.RandomState(57)

ran_recs = [
    df_ratings['title'].sample(10, random_state=random_recs_rng).tolist()
    for _ in test_res.index
]

test_res['rand_predictions'] = ran_recs
test_res.head()

Unnamed: 0_level_0,actual,colab_predictions,pop_predictions,rand_predictions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"[Beetlejuice (1988), Wizard of Oz, The (1939),...","[Shawshank Redemption, The (1994), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum...",[Harry Potter and the Deathly Hallows: Part 1 ...
2,"[Exit Through the Gift Shop (2010), Shawshank ...",[],"[Shawshank Redemption, The (1994), Forrest Gum...","[Seven (a.k.a. Se7en) (1995), Sword in the Sto..."
3,[Wallace & Gromit: The Best of Aardman Animati...,[],"[Shawshank Redemption, The (1994), Forrest Gum...","[Runaway Bride (1999), Santa Clause, The (1994..."
4,"[Lone Star (1996), Beetlejuice (1988), Sixth S...",[Star Wars: Episode VI - Return of the Jedi (1...,"[Shawshank Redemption, The (1994), Forrest Gum...","[Forrest Gump (1994), Voices from the List (20..."
5,"[Addams Family Values (1993), In the Name of t...","[Silence of the Lambs, The (1991), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum...","[Die Hard (1988), Airplane II: The Sequel (198..."


In [25]:
def user_coverage(predicted) -> float:
    """
    Computes the share of test users to whom we were able to provide recommendation.
    Parameters
    ----------
    predicted : a list of lists
        Ordered predictions, one inner list per user
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
        An empty inner list (or None) means no recommendation for that user.
    Returns
    ----------
    user_coverage:
        Share of test users in predicted list to whom we provided recommendation
        rounded to 2 decimal places; 0.0 when `predicted` is empty.
    """
    total_users = len(predicted)
    # No users at all: report zero coverage instead of dividing by zero.
    if total_users == 0:
        return 0.0

    covered_users = sum(
        1 for recs in predicted if recs is not None and len(recs) > 0
    )
    return round(covered_users / total_users, 2)

In [26]:
# Share of test users that received at least one collaborative-filtering recommendation.
cf_user_coverage = user_coverage(cf_recs)
print("User coverage (cf): {}".format(cf_user_coverage))

User coverage (cf): 0.82


In [None]:
# Task 3: hit rate at k for the collaborative filtering recommendations

In [35]:
def hitrate_k(predicted, actual, k: int) -> float:
    """
    Computes the average number of correct recommendations for a user
    Parameters
    ----------
    predicted : recommendations per user, either a pandas Series of lists
        (indexed by user) or a plain list of lists
    actual: actual movie watches per user, in the same container type as
        `predicted`; a Series must share the index labels of `predicted`,
        a list must be in the same user order
    k: integer
        The maximum number of recommendations in a list
    Returns
    ----------
    hitrate_k:
        The average number of correct recommendations for a user among the
        first k recommendations (a float); 0.0 when there are no users
    """
    n_users = len(predicted)
    # No users: avoid dividing by zero.
    if n_users == 0:
        return 0.0

    if hasattr(predicted, "loc"):
        # pandas input: pair both sides by index label.
        pairs = ((predicted.loc[user], actual.loc[user]) for user in predicted.index)
    else:
        # plain sequences: pair both sides by position.
        pairs = zip(predicted, actual)

    total_hits = 0
    for user_pred, user_actual in pairs:
        # Sets make repeated titles count once.
        total_hits += len(set(user_pred[:k]) & set(user_actual))

    return total_hits / n_users

In [37]:
# Average number of hits among the top-10 collaborative-filtering recommendations.
cf_hitrate_k = hitrate_k(
    predicted=test_res['colab_predictions'],
    actual=test_res['actual'],
    k=10,
)
print("Hitrate_k (cf): {}".format(cf_hitrate_k))

Hitrate_k (cf): 0.7717569786535303


In [None]:
# Show again the hit rate stored in cf_hitrate_k.
hitrate_message = "Hitrate_k (cf): {}".format(cf_hitrate_k)
print(hitrate_message)