In [63]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval

In [71]:
credits = pd.read_csv('../dataset/credits.csv')
keywords = pd.read_csv('../dataset/keywords.csv')
links = pd.read_csv('../dataset/links_small.csv')
md = pd.read_csv('../dataset/movies_metadata.csv')
ratings = pd.read_csv('../dataset/ratings.csv')

  md = pd.read_csv('../dataset/movies_metadata.csv')


In [115]:
eval_columns = ['belongs_to_collection', 'production_companies', 'production_countries', 'spoken_languages', 'genres']

for eval_column in eval_columns:
    md[eval_column] = md[eval_column].fillna('[]').apply(literal_eval)
    md[eval_column] = md[eval_column].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

bad_data = md[md['imdb_id'] == '0'].index
md.drop(bad_data, inplace=True)

cols_to_float = ['revenue', 'vote_count', 'vote_average', 'budget', 'popularity']
md[cols_to_float] = md[cols_to_float].astype(float)

md['id'] = md['id'].astype(int)

## User-User

In [95]:
from scipy.sparse import coo_matrix

user_ids = ratings['userId'].astype('category').cat.codes
movie_ids = ratings['movieId'].astype('category').cat.codes
values = ratings['rating'].values
user_movie_sparse = coo_matrix((values, (user_ids, movie_ids)))

# Convert to CSR for efficient row slicing
user_movie_sparse = user_movie_sparse.tocsr()

user_id_to_index = dict(zip(ratings['userId'].astype('category').cat.categories, 
                           range(len(ratings['userId'].astype('category').cat.categories))))

movie_id_to_index = dict(zip(ratings['movieId'].astype('category').cat.categories, 
                            range(len(ratings['movieId'].astype('category').cat.categories))))

index_to_movie_id = dict(zip(range(len(ratings['movieId'].astype('category').cat.categories)),
                            ratings['movieId'].astype('category').cat.categories))


# user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

## How we check similarity ? 
we would use co-rated movies similarity but for some problem we use shrinkage hyperparamter and other things.

i use dice coefficient.

In [93]:
def similarity_between_two_user(base_user, other_user):
    # Get co-rated movies similarities
    movie_rated_by_base_user = np.where(base_user > 0)[0]
    movie_rated_by_other_user = np.where(other_user > 0)[0]
    mask = np.intersect1d(movie_rated_by_base_user, movie_rated_by_other_user)
    
    n_rated_by_base_user = movie_rated_by_base_user.size
    n_rated_by_other_user = movie_rated_by_other_user.size
    n_co_rated = mask.size
    
    if mask.size > 0:
        base_user_mean_rates = base_user[base_user > 0].mean()
        other_user_mean_rates = other_user[other_user > 0].mean()
        
        base_user_common = np.expand_dims(base_user[mask], axis=0) - base_user_mean_rates
        other_user_common = np.expand_dims(other_user[mask], axis=0) - other_user_mean_rates
        
        sim = cosine_similarity(base_user_common, other_user_common)[0, 0]
    else:
        # No co-rated movies
        sim = 0.0
    
    # Overlap aware similarity
    dice_coefficient = (2 * n_co_rated) / (n_rated_by_base_user + n_rated_by_other_user)
    final_sim = sim * dice_coefficient
    return final_sim

In [97]:
def get_similar_user(user_movie_rates):
    similarity_to_other_users = np.array([
        similarity_between_two_user(user_movie_rates, user_movie_sparse.getrow(uid).toarray().flatten()) 
        for uid in range(user_movie_sparse.shape[0])
    ])
    return similarity_to_other_users

In [99]:
def recommend(user_movie_rates, top_n_users, top_n_recom):
    similarities = get_similar_user(user_movie_rates)
    top_indices = np.argsort(similarities)[::-1][1:top_n_users+1]
    top_scores = similarities[top_indices]
    
    weighted_sum = np.zeros(user_movie_sparse.shape[1])
    for idx, sim in zip(top_indices, top_scores):
        user_ratings = user_movie_sparse.getrow(idx).toarray().flatten()
        weighted_sum += user_ratings * sim
    
    if top_scores.sum() > 0:
        weighted_sum /= top_scores.sum()
    
    # for the new users
    mask = user_movie_rates > 0
    weighted_sum[mask] = -np.inf
    
    recommended_movie_indices = np.argsort(weighted_sum)[::-1][:top_n_recom]
    return [index_to_movie_id[idx] for idx in recommended_movie_indices]

In [148]:
test_user = pd.DataFrame({'userId': [1,1,1,1,1,1,1], 'movieId': [414, 268, 364, 15805, 17074, 16234, 272], 'rating': [5, 4.8, 5, 4.5, 5, 3.8 , 4]})
user_movie_rates = test_user.pivot(index='userId', columns='movieId', values='rating')
user_movie_rates

movieId,268,272,364,414,15805,16234,17074
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4.8,4.0,5.0,5.0,4.5,3.8,5.0


In [150]:
user_movie_rates = np.zeros(user_movie_sparse.shape[1])
for _, row in test_user.iterrows():
    if row['movieId'] in movie_id_to_index:
        movie_idx = movie_id_to_index[row['movieId']]
        user_movie_rates[movie_idx] = row['rating']

recommended_movies = recommend(user_movie_rates, 10, 50)
recommended_movies

[1721,
 1537,
 520,
 2359,
 2571,
 7153,
 4993,
 588,
 912,
 1961,
 1097,
 261,
 48,
 1,
 1271,
 2398,
 593,
 714,
 112852,
 480,
 2268,
 596,
 1291,
 1665,
 2424,
 2524,
 736,
 2102,
 1334,
 1094,
 2383,
 70159,
 76056,
 75938,
 75977,
 75962,
 76082,
 75950,
 75947,
 75940,
 75929,
 75981,
 76089,
 75927,
 75831,
 76077,
 75825,
 75823,
 75979,
 75983]

In [152]:
m = md['vote_count'].quantile(0.80)
C = md['vote_average'].mean()

def weighted_rating(row):
    v = row['vote_count']
    R = row['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

    
md.apply(weighted_rating, axis=1)
md['wr'] = md.apply(weighted_rating, axis=1)

In [166]:
md[md['id'].isin(recommended_movies)][['id', 'title', 'wr']].sort_values('wr', ascending=False)

Unnamed: 0,id,title,wr
3382,593,Solaris,7.448576
2978,596,The Grapes of Wrath,7.382032
944,261,Cat on a Hot Tin Roof,7.14546
11911,2359,Sicko,7.011032
11662,1271,300,6.986564
2649,912,The Thomas Crown Affair,6.458002
3293,2383,The Bear,6.425262
10942,588,Silent Hill,6.270175
5004,480,Monsoon Wedding,6.257893
11979,1721,All the Way Boys,6.135623
