In [1]:
conda activate lenci_enviroment



Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import surprise
from surprise import SVD, Reader
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, recall_at_k
)
from collections import defaultdict

# =================== 1. Preparar dataset ===================
file_path = "C:\\Users\\vlenc\\OneDrive\\Documentos\\ml-latest-small\\ratings.csv"
movies_file = "C:\\Users\\vlenc\\OneDrive\\Documentos\\ml-latest-small\\movies.csv"

df = pd.read_csv(file_path)
df = df[["userId", "movieId", "rating"]]
# Padronizar nomes de colunas para Recommenders
df = df.rename(columns={"userId": "userID", "movieId": "itemID", "rating": "rating"})

movies = pd.read_csv(movies_file)
movies = movies.rename(columns={"movieId": "itemID", "title": "title"})
movies = movies.merge(df.groupby("itemID").size().reset_index(name="num_ratings"), on="itemID")
movies = movies.sort_values("num_ratings", ascending=False).reset_index(drop=True)

# =================== 2. Definir usuário-alvo ===================
user_id = 2
user_ratings = df[df["userID"] == user_id].copy()
print(f"Usuário {user_id} avaliou {len(user_ratings)} filmes.")

# =================== 3. Criar treino/teste ===================
other_ratings = df[df["userID"] != user_id].copy()
train_df, test_df = python_random_split(other_ratings, 0.75)

# Criar dataset do Surprise
reader = Reader(rating_scale=(1, 5))
trainset = surprise.Dataset.load_from_df(train_df, reader).build_full_trainset()

# =================== 4. Treinar SVD ===================
svd = SVD(n_factors=1000, n_epochs=30, random_state=42, verbose=True)
svd.fit(trainset)

# =================== 5. Previsões ===================
"""user_movies = set(user_ratings["itemID"])
user_movies.head()

all_movies = set(df["itemID"])
movies_to_predict = list(all_movies - user_movies)

anti_testset_user = [(user_id, iid, 0) for iid in movies_to_predict]
predictions = svd.test(anti_test_dfset_user)"""


def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid in top_n:
        top_n[uid].sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = top_n[uid][:n]
    return top_n

top_recommendations = get_top_n(predictions, n=10)

print("\n🎬 Filmes recomendados para o usuário:")
for item_id, pred_rating in top_recommendations[user_id]:
    title = movies[movies["itemID"] == item_id]["title"].values[0]
    print(f"{title} (nota estimada: {pred_rating:.2f})")

# =================== 6. Mostrar filmes ocultos ===================
print("\n🎬 Filmes já avaliados (ocultos para o modelo):")
for _, row in user_ratings.sort_values("rating", ascending=False).iterrows():
    title = movies[movies["itemID"] == row["itemID"]]["title"].values[0]
    print(f"{row['itemID']} - {title}: {row['rating']}")

# =================== 7. Avaliação do modelo ===================
train_plus_user = pd.concat([train_df, user_ratings])
all_predictions = compute_ranking_predictions(svd, train_plus_user, usercol="userID", itemcol="itemID", remove_seen=True)

predictions_test = predict(svd, test_df, usercol="userID", itemcol="itemID")

eval_rmse = rmse(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")
eval_mae = mae(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")
eval_rsquared = rsquared(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")
eval_exp_var = exp_var(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")

eval_map = map_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_ndcg = ndcg_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_precision = precision_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_recall = recall_at_k(test_df, all_predictions, col_prediction="prediction", k=10)

print("\n📊 Avaliação do modelo:")
print(f"RMSE: {eval_rmse:.4f}, MAE: {eval_mae:.4f}, R²: {eval_rsquared:.4f}, Explained Variance: {eval_exp_var:.4f}")
print(f"MAP@10: {eval_map:.4f}, NDCG@10: {eval_ndcg:.4f}, Precision@10: {eval_precision:.4f}, Recall@10: {eval_recall:.4f}")


Usuário 2 avaliou 29 filmes.
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29

🎬 Filmes recomendados para o usuário:
Fight Club (1999) (nota estimada: 4.18)
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) (nota estimada: 4.17)
Godfather, The (1972) (nota estimada: 4.17)
Princess Bride, The (1987) (nota estimada: 4.16)
Star Wars: Episode V - The Empire Strikes Back (1980) (nota estimada: 4.15)
Star Wars: Episode IV - A New Hope 

In [12]:
import pandas as pd
import surprise
from surprise import SVD, Reader
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, recall_at_k
)
from collections import defaultdict

In [13]:
# =================== 1. Preparar dataset ===================
file_path = "C:\\Users\\vlenc\\OneDrive\\Documentos\\ml-latest-small\\ratings.csv"
movies_file = "C:\\Users\\vlenc\\OneDrive\\Documentos\\ml-latest-small\\movies.csv"

df = pd.read_csv(file_path)
df = df[["userId", "movieId", "rating"]]
# Padronizar nomes de colunas para Recommenders
df = df.rename(columns={"userId": "userID", "movieId": "itemID", "rating": "rating"})

movies = pd.read_csv(movies_file)
movies = movies.rename(columns={"movieId": "itemID", "title": "title"})
# movies = movies.merge(df.groupby("itemID").size().reset_index(name="num_ratings"), on="itemID")
# movies = movies.sort_values("num_ratings", ascending=False).reset_index(drop=True)

In [None]:
# =================== 2. Criar treino/teste ===================
train_df, test_df = python_random_split(df, 0.8)

# Criar dataset do Surprise
reader = Reader(rating_scale=(1, 5))

# Adaptar DataFrame 'train_df' para ser utilizado na função 'SVD'
trainset = surprise.Dataset.load_from_df(train_df, reader).build_full_trainset()

In [None]:
# =================== 3. Treinar SVD ===================
svd = SVD(n_factors=200, n_epochs=30, random_state=42, verbose=True)
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f937787460>

In [None]:
# =================== 3. Treinar SVD ===================
predictions = predict(svd, test_df, usercol="userID", itemcol="itemID")
predictions.head(5)

Unnamed: 0,userID,itemID,prediction
0,432,77866,3.370935
1,288,474,3.562762
2,599,4351,2.519616
3,42,2987,3.916927
4,75,1610,3.570998


In [8]:
all_predictions = compute_ranking_predictions(
    svd, train_df, usercol="userID", itemcol="itemID", remove_seen=True
)
all_predictions.head()

Unnamed: 0,userID,itemID,prediction
1,1,2,4.15988
3,1,4,3.767587
4,1,5,3.646058
6,1,7,3.574784
7,1,8,3.624624


In [11]:
# Resultados de 'all_predictions' para o usuário 1
df_user1 = pd.DataFrame(all_predictions)
res = df_user1[df_user1["userID"] == 1].sort_values(by="prediction", ascending=False)

res.head(50)

Unnamed: 0,userID,itemID,prediction
1237,1,1704,5.0
6447,1,66934,5.0
689,1,933,5.0
901,1,1234,5.0
893,1,1225,5.0
891,1,1223,5.0
141,1,171,5.0
870,1,1201,5.0
6773,1,78499,5.0
867,1,1198,5.0


In [None]:
# “O modelo acerta a nota que o usuário daria para um filme?”
eval_rmse = rmse(test_df, predictions)
eval_mae = mae(test_df, predictions)
eval_rsquared = rsquared(test_df, predictions)
eval_exp_var = exp_var(test_df, predictions)

# O modelo está colocando os filmes certos (que o usuário realmente gosta) nas primeiras posições das recomendações?
eval_map = map_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_ndcg = ndcg_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_precision = precision_at_k(
    test_df, all_predictions, col_prediction="prediction", k=10
)
eval_recall = recall_at_k(test_df, all_predictions, col_prediction="prediction", k=10)


print(
    "RMSE:\t\t%f" % eval_rmse,
    "MAE:\t\t%f" % eval_mae,
    "rsquared:\t%f" % eval_rsquared,
    "exp var:\t%f" % eval_exp_var,
    sep="\n",
)

print("----")

print(
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)

RMSE:		0.885993
MAE:		0.678143
rsquared:	0.283092
exp var:	0.283143
----
MAP:		0.034073
NDCG:		0.076641
Precision@K:	0.067049
Recall@K:	0.021045


In [12]:
# =================== 5. Previsões ===================
"""
user_movies = set(user_ratings["itemID"])

all_movies = set(df["itemID"])
movies_to_predict = list(all_movies - user_movies)

anti_testset_user = [(user_id, iid, 0) for iid in movies_to_predict]
predictions = svd.test(anti_testset_user)
"""

predictions = predict(svd, test_df, usercol="userID", itemcol="itemID")
predictions.head()

"""
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid in top_n:
        top_n[uid].sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = top_n[uid][:n]
    return top_n

top_recommendations = get_top_n(predictions, n=10)

print("\n🎬 Filmes recomendados para o usuário:")
for item_id, pred_rating in top_recommendations[user_id]:
    title = movies[movies["itemID"] == item_id]["title"].values[0]
    print(f"{title} (nota estimada: {pred_rating:.2f})")    
"""

'\ndef get_top_n(predictions, n=10):\n    top_n = defaultdict(list)\n    for uid, iid, true_r, est, _ in predictions:\n        top_n[uid].append((iid, est))\n    for uid in top_n:\n        top_n[uid].sort(key=lambda x: x[1], reverse=True)\n        top_n[uid] = top_n[uid][:n]\n    return top_n\n\ntop_recommendations = get_top_n(predictions, n=10)\n\nprint("\n🎬 Filmes recomendados para o usuário:")\nfor item_id, pred_rating in top_recommendations[user_id]:\n    title = movies[movies["itemID"] == item_id]["title"].values[0]\n    print(f"{title} (nota estimada: {pred_rating:.2f})")    \n'

In [11]:
# =================== 6. Mostrar filmes ocultos ===================
print("\n🎬 Filmes já avaliados (ocultos para o modelo):")
for _, row in user_ratings.sort_values("rating", ascending=False).iterrows():
    title = movies[movies["itemID"] == row["itemID"]]["title"].values[0]
    print(f"{row['itemID']} - {title}: {row['rating']}")


🎬 Filmes já avaliados (ocultos para o modelo):
131724.0 - The Jinx: The Life and Deaths of Robert Durst (2015): 5.0
122882.0 - Mad Max: Fury Road (2015): 5.0
106782.0 - Wolf of Wall Street, The (2013): 5.0
89774.0 - Warrior (2011): 5.0
60756.0 - Step Brothers (2008): 5.0
80906.0 - Inside Job (2010): 5.0
1704.0 - Good Will Hunting (1997): 4.5
58559.0 - Dark Knight, The (2008): 4.5
68157.0 - Inglourious Basterds (2009): 4.5
80489.0 - Town, The (2010): 4.5
333.0 - Tommy Boy (1995): 4.0
112552.0 - Whiplash (2014): 4.0
86345.0 - Louis C.K.: Hilarious (2010): 4.0
79132.0 - Inception (2010): 4.0
74458.0 - Shutter Island (2010): 4.0
48516.0 - Departed, The (2006): 4.0
46970.0 - Talladega Nights: The Ballad of Ricky Bobby (2006): 4.0
6874.0 - Kill Bill: Vol. 1 (2003): 4.0
3578.0 - Gladiator (2000): 4.0
91529.0 - Dark Knight Rises, The (2012): 3.5
99114.0 - Django Unchained (2012): 3.5
8798.0 - Collateral (2004): 3.5
115713.0 - Ex Machina (2015): 3.5
77455.0 - Exit Through the Gift Shop (2010):

In [13]:
# =================== 7. Avaliação do modelo ===================
train_plus_user = pd.concat([train_df, user_ratings])
all_predictions = compute_ranking_predictions(svd, train_plus_user, usercol="userID", itemcol="itemID", remove_seen=True)

predictions_test = predict(svd, test_df, usercol="userID", itemcol="itemID")

eval_rmse = rmse(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")
eval_mae = mae(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")
eval_rsquared = rsquared(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")
eval_exp_var = exp_var(test_df, predictions_test, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction")

eval_map = map_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_ndcg = ndcg_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_precision = precision_at_k(test_df, all_predictions, col_prediction="prediction", k=10)
eval_recall = recall_at_k(test_df, all_predictions, col_prediction="prediction", k=10)

print("\n📊 Avaliação do modelo:")
print(f"RMSE: {eval_rmse:.4f}, MAE: {eval_mae:.4f}, R²: {eval_rsquared:.4f}, Explained Variance: {eval_exp_var:.4f}")
print(f"MAP@10: {eval_map:.4f}, NDCG@10: {eval_ndcg:.4f}, Precision@10: {eval_precision:.4f}, Recall@10: {eval_recall:.4f}")


📊 Avaliação do modelo:
RMSE: 0.8873, MAE: 0.6793, R²: 0.2805, Explained Variance: 0.2808
MAP@10: 0.0332, NDCG@10: 0.0730, Precision@10: 0.0612, Recall@10: 0.0203
