## Dataset

In [1]:
!pip install -q opendatasets
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip /content/ml-latest-small.zip -d /content/data

--2024-11-14 15:22:44--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-11-14 15:22:45 (2.84 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  /content/ml-latest-small.zip
   creating: /content/data/ml-latest-small/
  inflating: /content/data/ml-latest-small/links.csv  
  inflating: /content/data/ml-latest-small/tags.csv  
  inflating: /content/data/ml-latest-small/ratings.csv  
  inflating: /content/data/ml-latest-small/README.txt  
  inflating: /content/data/ml-latest-small/movies.csv  


In [2]:
import opendatasets as od
od.download('https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: tranghnguyn
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset
Downloading the-movies-dataset.zip to ./the-movies-dataset


100%|██████████| 228M/228M [00:02<00:00, 102MB/s]





## Yêu cầu 1:

In [3]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [4]:
movies = pd.read_csv('/content/data/ml-latest-small/movies.csv')
ratings = pd.read_csv('/content/data/ml-latest-small/ratings.csv')

In [5]:
X_train, X_test = train_test_split(ratings, test_size=200, random_state=42)
pivot_df = pd.pivot(index='userId', columns='movieId', values='rating', data=X_train)
pivot_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [6]:
from scipy.stats import pearsonr, spearmanr
from scipy.spatial.distance import cosine

def get_user_ratings(movieId):
    return pivot_df[movieId].dropna()

def get_movie_ratings(userId):
    return pivot_df.loc[userId].dropna()

def calculate_user_based_similarity(user1, user2, type_='cosine'):
    # Get user's movie ratings
    user1_ratings = get_movie_ratings(user1)
    user2_ratings = get_movie_ratings(user2)

    # Get common movie id
    common_movies = []
    for movieId in user1_ratings.index:
        if movieId in user2_ratings.index:
            common_movies.append(movieId)

    # Common movie's ratings
    user1_common_ratings = user1_ratings[common_movies]
    user2_common_ratings = user2_ratings[common_movies]

    if len(user1_common_ratings) < 2 or len(user2_common_ratings) < 2:
        return 0

    if type_=='pearson':
        result = pearsonr(user1_common_ratings.values, user2_common_ratings.values).statistic
    elif type_=='spearman':
        result = spearmanr(user1_common_ratings.values, user2_common_ratings.values).statistic
    elif type_=='cosine':
        result = cosine(user1_common_ratings.values, user2_common_ratings.values)
    return result


def calculate_item_based_similarity(item1, item2, type_='cosine'):
    # Get user's movie ratings
    movie1_ratings = get_user_ratings(item1)
    movie2_ratings = get_user_ratings(item2)

    # Get common movie id
    common_users = []
    for userId in movie1_ratings.index:
        if userId in movie2_ratings.index:
            common_users.append(userId)

    # Common movie's ratings
    movie1_common_ratings = movie1_ratings[common_users]
    movie2_common_ratings = movie2_ratings[common_users]

    if len(movie1_common_ratings) < 2 or len(movie2_common_ratings) < 2:
        return 0

    if type_=='pearson':
        mask = ~np.isnan(movie1_common_ratings.to_numpy()) & ~np.isnan(movie2_common_ratings.to_numpy())
        result = pearsonr(movie1_common_ratings.to_numpy()[mask], movie2_common_ratings.to_numpy()[mask]).statistic
    elif type_=='spearman':
        mask = ~np.isnan(movie1_common_ratings.to_numpy()) & ~np.isnan(movie2_common_ratings.to_numpy())
        result = spearmanr(movie1_common_ratings.to_numpy()[mask], movie2_common_ratings.to_numpy()[mask]).statistic
    elif type_=='cosine':
        result = cosine(movie1_common_ratings.to_numpy(), movie2_common_ratings.to_numpy())
    return result

In [7]:
def get_top_k_similarities(Id, data, top_k=5, type_='cosine', based='user'):
    sim_rates = []
    if based=='user':
        user_ids = data.index.to_list()

        for id in user_ids:
            if id != Id:
                sim_rates.append({
                    'Id': id,
                    'similarity': calculate_user_based_similarity(Id, id, type_)
                })

    if based=='item':
        movie_ids = data.columns.to_list()

        for id in movie_ids:
            if id != Id:
                sim_rates.append({
                    'Id': id,
                    'similarity': calculate_item_based_similarity(Id, id, type_)
                })

    sorted_list = sorted(sim_rates, key=lambda d: d['similarity'], reverse=True)
    return sorted_list[:top_k]


In [8]:
def update_rating(data, Id, top_k=5, type_='cosine', based='user'):
    top_k_sim = get_top_k_similarities(Id, data, top_k, type_, based)
    if based == 'user':
        none_nan_columns = data.loc[Id].isna()
        unrated= pivot_df.columns[none_nan_columns].to_list()
    elif based == 'item':
        none_nan_rows = data[Id].isna()
        unrated= pivot_df.index[none_nan_rows].to_list()
    unrated_dict = {}

    for id in unrated:
        total = 0
        sum_sim = 0

        for i in range(len(top_k_sim)):
            if based == 'user':
                if pd.isna(data.loc[top_k_sim[i]['Id'], id]) == False:
                    total += top_k_sim[i]['similarity'] * data.loc[top_k_sim[i]['Id'], id]

            elif based == 'item':
                if pd.isna(data.loc[id, top_k_sim[i]['Id']]) == False:
                    total += top_k_sim[i]['similarity'] * data.loc[id, top_k_sim[i]['Id']]

            sum_sim = sum_sim + top_k_sim[i]['similarity']


        new_rating = total / sum_sim
        unrated_dict[id] = new_rating

    return unrated_dict


In [9]:
def predict_rating(data, movie_id, user_id, top_k=5, type_='cosine', based='user'):
    preds = []
    for idx in tqdm(range(len(user_id))):
        re = update_rating(data, user_id[idx], top_k, type_, based)

        if movie_id[idx] in re:
            preds.append(re[movie_id[idx]])  # Use the predicted rating
        else:
            preds.append(0)
    return preds

In [10]:
result = predict_rating(pivot_df, X_test.movieId.to_list(), X_test.userId.to_list(), top_k=5, type_='cosine', based='user')

100%|██████████| 200/200 [07:02<00:00,  2.11s/it]


In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
mse = mean_squared_error(X_test['rating'].to_list(), result, squared=True)
mae = mean_absolute_error(X_test['rating'].to_list(), result)

rmse = np.sqrt(mse)
nmae = mae / (X_test['rating'].max() - X_test['rating'].min())

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'NMAE: {nmae}')

MSE: 11.692708592502502
RMSE: 3.4194602779535987
MAE: 3.2536694853647328
NMAE: 0.723037663414385




## Yêu cầu 2:

In [12]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

nltk.download('stopwords')
stopswords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
# Viết hàm tiền xử lý dữ liệu trên cột overview
def processing_data(data):
    # Chuyển về từ thường
    data = data.lower()
    # Xóa dấu câu, ký tự đặc biệt
    data = re.sub('\W+',' ', data)
    # Xóa khoảng trắng đầu và cuối câu
    data = data.strip()
    # Xóa stopword
    data = ' '.join([word for word in data.split() if word not in stopswords])
    # Tách từ
    # word_tokens = word_tokenize(data)
    # ...
    return data

In [14]:
df = pd.read_csv('/content/the-movies-dataset/movies_metadata.csv')
df['overview'].fillna('None', inplace=True)
df['overview'] = df['overview'].apply(processing_data)
df.head()

  df = pd.read_csv('/content/the-movies-dataset/movies_metadata.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['overview'].fillna('None', inplace=True)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,led woody andy toys live happily room andy bir...,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,siblings judy peter discover enchanted board g...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,family wedding reignites ancient feud next doo...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,cheated mistreated stepped women holding breat...,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,george banks recovered daughter wedding receiv...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [15]:
df_ratings = pd.read_csv('/content/the-movies-dataset/ratings.csv')
df_ratings['movieId'] = df_ratings['movieId'].astype(str)
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [16]:
movies = df_ratings.loc[df_ratings['userId']==201].sort_values('rating', ascending=False).movieId.to_list()
X_test = df.loc[df['id'].isin(movies)]
X_test, y_test = train_test_split(X_test, test_size=0.8, random_state=42)
X_train = df.drop(X_test.index)

In [17]:
y_test.shape

(132, 24)

In [18]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_matrix = vectorizer.fit_transform(X_train['overview'])
X_test_matrix = vectorizer.transform(X_test['overview'])

In [19]:
cosine_sim = linear_kernel(X_test_matrix, X_train_matrix)
cosine_sim

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02504919,
        0.        ],
       [0.        , 0.03854099, 0.        , ..., 0.        , 0.01680099,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.0262302 , 0.01022229,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00921692,
        0.        ],
       [0.        , 0.01475996, 0.01857413, ..., 0.        , 0.00987308,
        0.02533086]])

In [20]:
mean_correlation_scores = cosine_sim.mean(axis=0)
top_100_indices = np.argsort(mean_correlation_scores)[-100:][::-1]
top_100_indices

array([  302,  4377,  5880, 34862, 19458, 12307, 25673,  6055, 40237,
        8089, 15199, 24257, 15105,  7387, 16455, 17668, 10944, 33014,
       43860,  1427, 12055, 15747, 29408, 21595, 21817, 14947,  2591,
       19432,  4122, 11457, 27033, 16891, 31932, 10482, 41456, 42421,
       25382, 31673, 35681, 33996, 16281, 12425, 19212, 33527, 11498,
       16604, 16613, 44528, 41914,   406, 23742, 43439, 22210, 27297,
       27729, 26218,  6431, 39641, 13324, 28556,  3456, 25611, 39877,
       31326, 36671, 33897, 29554, 17226,  6657, 27669,  5024, 32148,
       30036, 39987, 35903,  7978, 25118, 16309, 15522, 13005, 33366,
       17688,  1189, 24881, 43209, 12987, 26741,  6415, 34455,   700,
       44533, 44115, 21279, 16987,  2362,  3174, 11165, 31396, 44541,
       40083])

In [21]:
recommended_items = X_train.iloc[top_100_indices]['title'].to_list()
len(recommended_items)

100

In [22]:
def precision_at_k(recommended_items, relevant_items, K):
    # Get the top-K recommended items
    top_k_items = recommended_items[:K]

    # Count the number of relevant items in the top-K recommendations
    relevant_in_top_k = sum([1 for item in top_k_items if item in relevant_items])
    # Calculate P@K
    precision_k = float(relevant_in_top_k / K)
    return precision_k


def recall_at_k(recommended_items, relevant_items, K):
    # Get the top-K recommended items
    top_k_items = recommended_items[:K]

    # Count the number of relevant items in the top-K recommendations
    relevant_in_top_k = sum([1 for item in top_k_items if item in relevant_items])

    # Calculate R@K
    recall_k = relevant_in_top_k / len(relevant_items)
    return recall_k


def f1_at_k(recommended_items, relevant_items, K):
    # Calculate F1@K
    precision = precision_at_k(recommended_items, y_test['title'].to_list(), K)
    recall = recall_at_k(recommended_items, y_test['title'].to_list(), K)
    f1_k = 2 * (precision * recall) / (precision + recall)
    return f1_k

In [23]:
def calculate_mrr(recommendations, relevant_items):
    """
    Calculate the Mean Reciprocal Rank (MRR)

    :param recommendations: List of lists, where each inner list contains recommended items for a user, in ranked order.
    :param relevant_items: List of sets, where each set contains relevant items for a user.
    :return: MRR score
    """
    reciprocal_ranks = []

    for recs, relevant in zip(recommendations, relevant_items):
        # Find the rank of the first relevant item
        for rank, item in enumerate(recs, start=1):
            if item in relevant:
                reciprocal_ranks.append(1 / rank)
                break
        else:
            # If no relevant item is found, append 0 (no reciprocal rank for this user)
            reciprocal_ranks.append(0)

    # Calculate the mean of the reciprocal ranks
    mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
    return mrr

In [24]:
import numpy as np

# Discounted Cumulative Gain (DCG)
def dcg(relevance_scores):
    return np.sum(relevance_scores / np.log2(np.arange(2, len(relevance_scores) + 2)))

# Ideal Discounted Cumulative Gain (IDCG)
def idcg(relevance_scores):
    sorted_relevance = np.sort(relevance_scores)[::-1]  # Sort in descending order
    return dcg(sorted_relevance)

# Normalized Discounted Cumulative Gain (NDCG)
def ndcg(relevance_scores, k=None):
    """
    relevance_scores: list or np.array of relevance scores for a ranked list
    k: optional parameter to select top k items for evaluation
    """
    if k is not None:
        relevance_scores = relevance_scores[:p]
    dcg_value = dcg(relevance_scores)
    idcg_value = idcg(relevance_scores)
    return dcg_value / idcg_value if idcg_value > 0 else 0


In [25]:
prec_topk = precision_at_k(recommended_items, y_test['title'].to_list(), 100)
print(f"Precision at rank 100: {prec_topk:.4f}")

Precision at rank 100: 0.0100


In [26]:
recall_topk = recall_at_k(recommended_items, y_test['title'].to_list(), 100)
print(f"Recall at rank 100: {recall_topk:.4f}")

Recall at rank 100: 0.0076


In [27]:
f1_topk = f1_at_k(recommended_items, y_test['title'].to_list(), 100)
print(f"F1 at rank 100: {f1_topk:.4f}")

F1 at rank 100: 0.0086


In [28]:
mrr_score = calculate_mrr(recommended_items, y_test['title'].to_list())

print(f"MRR score at rank 100: {mrr_score:.4f}")

MRR score at rank 100: 0.4986


In [29]:
p = 100
ndcg_score = ndcg(mean_correlation_scores[top_100_indices], p)

print(f"NDCG score at rank {p}: {ndcg_score:.4f}")

NDCG score at rank 100: 1.0000
