In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
drive.mount('/content/drive')
import random
import pickle

Mounted at /content/drive


In [None]:
data_path = '/content/drive/MyDrive/Project_AIL303_GROUP2/Final_dataset_in_here!!!!!/100k_final_dataset.csv'
matrix_path = '/content/drive/MyDrive/Project_AIL303_GROUP2/Final_dataset_in_here!!!!!/userId_movieId_matrix.pkl'

rating_df = pd.read_csv(data_path)
with open(matrix_path, 'rb') as f:
    users_movies_matrix = pickle.load(f)

In [29]:
def get_popular_top_rated_movies(rating_df, n_recommend=10):
    """
    Lấy ra N phim vừa phổ biến (nhiều lượt rating) vừa có rating trung bình cao.

    Args:
        rating_df (DataFrame): DataFrame chứa thông tin ratings.
        n_recommend (int): Số lượng phim cần trả về.

    Returns:
        pd.DataFrame: DataFrame chứa thông tin các phim được gợi ý.
    """
    # 1. Tính toán rating trung bình và số lượt rating cho mỗi phim
    movie_stats = rating_df.groupby('movieId').agg(
        avg_rating=('rating', 'mean'),
        rating_count=('rating', 'count')
    ).reset_index()

    # 2. Lọc nhiễu: Chỉ xem xét những phim có số lượt rating cao hơn 90% các phim khác
    # Điều này để tránh trường hợp một phim chỉ có 1 rating 5.0 lại đứng top 1.
    min_ratings_threshold = movie_stats['rating_count'].quantile(0.90)
    qualified_movies = movie_stats[movie_stats['rating_count'] >= min_ratings_threshold]

    # 3. Sắp xếp các phim đủ điều kiện theo rating trung bình giảm dần
    qualified_movies = qualified_movies.sort_values('avg_rating', ascending=False)

    # 4. Lấy thông tin chi tiết của top N phim
    top_movie_ids = qualified_movies.head(n_recommend)['movieId']

    # Lấy thông tin đầy đủ (title, poster,...) của các phim này
    top_movies_info = rating_df[rating_df['movieId'].isin(top_movie_ids)][['movieId', 'title', 'genres', 'poster_link']].drop_duplicates(subset=['movieId'])

    # Sắp xếp lại theo đúng thứ tự của qualified_movies
    top_movies_info = top_movies_info.set_index('movieId').loc[top_movie_ids].reset_index()

    return top_movies_info

def create_new_user_vector(user_id, movie_ratings, users_movies_matrix):
    """
    Tạo vector rating cho người dùng mới dựa trên userId và rating 5 bộ phim.

    Args:
        user_id (int): ID của người dùng mới.
        movie_ratings (dict): Dictionary chứa movieId và rating.
        users_movies_matrix (DataFrame): Ma trận user-movie.
    Returns:
        pd.Series: Vector rating đầy đủ của người dùng mới.
    """
    all_movie_ids = users_movies_matrix.columns
    new_user_vector = pd.Series(0, index=all_movie_ids, name=user_id)
    for movie_id, rating in movie_ratings.items():
        if movie_id in all_movie_ids:
            new_user_vector[movie_id] = rating
    # print(new_user_vector)
    return new_user_vector

# Hàm tìm top 5 user tương đồng
def find_top_similar_users(new_user_vector, users_movies_matrix, k=5):
    """
    Tìm top k user tương đồng nhất với người dùng mới.

    Args:
        new_user_vector (pd.Series): Vector rating của người dùng mới.
        users_movies_matrix (DataFrame): Ma trận user-movie.
        k (int): Số user tương đồng cần tìm. Mặc định là 5.

    Returns:
        list: Danh sách userId của top k user tương đồng.
    """
    sim_scores = cosine_similarity([new_user_vector], users_movies_matrix)[0]
    sim_df = pd.Series(sim_scores, index=users_movies_matrix.index)
    top_users = sim_df.nlargest(k + 1).index[1:]
    return top_users.tolist()

def recommend_movies_for_new_user(user_id, movie_ratings, users_movies_matrix, rating_df, k=5, n_recommend=8):
    """
    Gợi ý 8 phim dựa trên rating của 5 bộ phim và top 5 user tương đồng.
    Args:
        user_id (int): ID của người dùng mới.
        movie_ratings (dict): Dictionary chứa movieId và rating của 5 bộ phim.
        users_movies_matrix (DataFrame): Ma trận user-movie.
        rating_df (DataFrame): DataFrame chứa thông tin phim.
        k (int): Số user tương đồng cần tìm. Mặc định là 5.
        n_recommend (int): Số phim gợi ý. Mặc định là 8.

    Returns:
        list: Danh sách dictionary chứa thông tin 8 phim gợi ý.
    """
    new_user_vector = create_new_user_vector(user_id, movie_ratings, users_movies_matrix)
    top_k_users = find_top_similar_users(new_user_vector, users_movies_matrix, k)

    top_users_ratings = users_movies_matrix.loc[top_k_users]
    rated_movies_by_neighbors = top_users_ratings.columns[top_users_ratings.ne(0).any()]
    movies_rated_by_new_user = movie_ratings.keys()
    potential_recommendations = rated_movies_by_neighbors.drop(movies_rated_by_new_user, errors='ignore')
    if potential_recommendations.empty:
        return rating_df.head(n_recommend).to_dict(orient='records')
    # Tính tổng rating cho mỗi phim từ top 5 user
    total_ratings = top_users_ratings[potential_recommendations].sum()

    # Lấy top phim từ tổng rating (tối đa số phim có sẵn)
    top_movie_ids = total_ratings.nlargest(min(n_recommend, len(potential_recommendations))).index

    if len(top_movie_ids) < n_recommend:
          popular_movies_df = get_popular_top_rated_movies(rating_df, n_recommend=20)
          seen_movie_ids = list(top_movie_ids) + list(movies_rated_by_new_user)
          additional_movies = popular_movies_df[~popular_movies_df['movieId'].isin(seen_movie_ids)]

          num_needed = n_recommend - len(top_movie_ids)
          print=(f"Số lượng phim cần thiết để lấp đầy: {num_needed}")
          additional_movie_ids = additional_movies.head(num_needed)['movieId'].values

          top_movie_ids = list(top_movie_ids) + list(additional_movie_ids)
    recommended_movies_info  = rating_df[rating_df['movieId'].isin(top_movie_ids[:n_recommend])][['movieId', 'title', 'genres','imdb', 'poster_link']].drop_duplicates(subset=['movieId'])

    return recommended_movies_info .to_dict(orient='records')



In [30]:
# Giả lập dữ liệu từ web: userId và rating cho 5 bộ phim
user_id = 10000000000
movie_ratings = {
    26479: 4,    # Pirates of Penzance
    247150: 3,  # Stowaway
    7451: 4,  # Mean Girls
    61132: 4,  # Tropic Thunder
    96079: 3   # Skyfall
}

recommendations = recommend_movies_for_new_user(user_id, movie_ratings, users_movies_matrix, rating_df)
df_re = pd.DataFrame(recommendations)
df_re

Unnamed: 0,movieId,title,genres,imdb,poster_link
0,49272,Casino Royale (2006),"['Action', 'Adventure', 'Thriller']",http://www.imdb.com/title/tt0381061,https://m.media-amazon.com/images/M/MV5BMWQ1ZD...
1,63113,Quantum of Solace (2008),"['Action', 'Adventure', 'Thriller']",http://www.imdb.com/title/tt0830515,https://m.media-amazon.com/images/M/MV5BYmMwZT...
2,136020,Spectre (2015),"['Action', 'Adventure', 'Crime']",http://www.imdb.com/title/tt2379713,https://m.media-amazon.com/images/M/MV5BMzA5Mz...
3,224280,No Time to Die (2020),"['Action', 'Adventure', 'Mystery', 'Thriller']",http://www.imdb.com/title/tt2382320,https://m.media-amazon.com/images/M/MV5BZGZiOG...
4,260,Star Wars: Episode IV - A New Hope (1977),"['Action', 'Adventure', 'Sci-Fi']",http://www.imdb.com/title/tt0076759,https://m.media-amazon.com/images/M/MV5BOGUwMD...
5,1965,Repo Man (1984),"['Comedy', 'Sci-Fi']",http://www.imdb.com/title/tt0087995,https://m.media-amazon.com/images/M/MV5BMTc3NT...
6,1265,Groundhog Day (1993),"['Comedy', 'Fantasy', 'Romance']",http://www.imdb.com/title/tt0107048,https://m.media-amazon.com/images/M/MV5BOWE3Mj...
7,89302,Page Eight (2011),"['Drama', 'Thriller']",http://www.imdb.com/title/tt1797469,https://m.media-amazon.com/images/M/MV5BY2ZhYW...


In [None]:
rating_df.head(10)

Unnamed: 0,userId,movieId,rating,title,genres,imdbId,movielens_url,imdb,tag,poster_link
0,22,26479,3.5,"Pirates of Penzance, The (1983)","['Adventure', 'Comedy', 'Musical', 'Romance']",86112,https://movielens.org/movies/26479,http://www.imdb.com/title/tt0086112,Kevin Kline,https://m.media-amazon.com/images/M/MV5BOTBkMj...
1,22,247150,3.0,Stowaway (2021),"['Drama', 'Sci-Fi']",9203694,https://movielens.org/movies/247150,http://www.imdb.com/title/tt9203694,acrophobia,https://m.media-amazon.com/images/M/MV5BNGQ3Yj...
2,34,2174,4.0,Beetlejuice (1988),"['Comedy', 'Fantasy']",94721,https://movielens.org/movies/2174,http://www.imdb.com/title/tt0094721,weird,https://m.media-amazon.com/images/M/MV5BYjkwNz...
3,34,8623,4.0,Roxanne (1987),"['Comedy', 'Romance']",93886,https://movielens.org/movies/8623,http://www.imdb.com/title/tt0093886,Steve Martin,https://m.media-amazon.com/images/M/MV5BZDkyMz...
4,55,5766,4.0,Madman (1981),['Horror'],82696,https://movielens.org/movies/5766,http://www.imdb.com/title/tt0082696,the killls and the score,https://m.media-amazon.com/images/M/MV5BZDQ4Yz...
5,58,7451,0.5,Mean Girls (2004),['Comedy'],377092,https://movielens.org/movies/7451,http://www.imdb.com/title/tt0377092,teen movie,https://m.media-amazon.com/images/M/MV5BMjE1MD...
6,58,49272,4.5,Casino Royale (2006),"['Action', 'Adventure', 'Thriller']",381061,https://movielens.org/movies/49272,http://www.imdb.com/title/tt0381061,Thriller,https://m.media-amazon.com/images/M/MV5BMWQ1ZD...
7,58,61132,4.5,Tropic Thunder (2008),"['Action', 'Adventure', 'Comedy', 'War']",942385,https://movielens.org/movies/61132,http://www.imdb.com/title/tt0942385,war filmmaking,https://m.media-amazon.com/images/M/MV5BNDE5Nj...
8,58,63113,3.5,Quantum of Solace (2008),"['Action', 'Adventure', 'Thriller']",830515,https://movielens.org/movies/63113,http://www.imdb.com/title/tt0830515,stylish,https://m.media-amazon.com/images/M/MV5BYmMwZT...
9,58,96079,5.0,Skyfall (2012),"['Action', 'Adventure', 'Thriller', 'IMAX']",1074638,https://movielens.org/movies/96079,http://www.imdb.com/title/tt1074638,villain,https://m.media-amazon.com/images/M/MV5BNjAzMW...


In [None]:


[
    {'movieId': 110, 'title': 'Braveheart (1995)', 'genres': 'Action|Drama|War', 'poster_link': 'https://image.tmdb.org/t/p/w500/xyz789.jpg'},
    {'movieId': 527, 'title': "Schindler's List (1993)", 'genres': 'Drama|History', 'poster_link': 'https://image.tmdb.org/t/p/w500/abc456.jpg'},
    {'movieId': 1196, 'title': 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'genres': 'Action|Adventure|Sci-Fi', 'poster_link': 'https://image.tmdb.org/t/p/w500/def789.jpg'},
    {'movieId': 858, 'title': 'The Godfather (1972)', 'genres': 'Crime|Drama', 'poster_link': 'https://image.tmdb.org/t/p/w500/ghi012.jpg'},
    {'movieId': 123, 'title': 'Random Movie 1', 'genres': 'Action', 'poster_link': 'https://via.placeholder.com/150'},  # Phim random
    {'movieId': 456, 'title': 'Random Movie 2', 'genres': 'Comedy', 'poster_link': 'https://via.placeholder.com/150'},  # Phim random
    {'movieId': 789, 'title': 'Random Movie 3', 'genres': 'Drama', 'poster_link': 'https://via.placeholder.com/150'},   # Phim random
    {'movieId': 101, 'title': 'Random Movie 4', 'genres': 'Sci-Fi', 'poster_link': 'https://via.placeholder.com/150'}   # Phim random
]