### **1. 加载数据**

In [1]:
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import pandas as pd

In [2]:
data = loadmat("../data/ex10_movies.mat")
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [3]:
Y, R = data['Y'], data['R']
print(f"Y_shape: {Y.shape}, R_shape: {R.shape}")

Y_shape: (1682, 943), R_shape: (1682, 943)


In [4]:
param_mat = loadmat("../data/ex10_movieParams.mat")
param_mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [5]:
X, Theta = param_mat['X'], param_mat['Theta']

In [6]:
num_users, num_movies, num_features = param_mat['num_users'], param_mat['num_movies'], param_mat['num_features']
num_users, num_movies, num_features = num_users[0][0], num_movies[0][0], num_features[0][0]

In [7]:
print(f"X_shape: {X.shape}, Theta_shape: {Theta.shape}")
print(f"num_movies: {num_movies}, num_users: {num_users}, num_features: {num_features}")

X_shape: (1682, 10), Theta_shape: (943, 10)
num_movies: 1682, num_users: 943, num_features: 10


### **2. 扁平化参数** 

In [8]:
def flatten_params(X, Theta):
    """ Flatten X and Theta into a single array """
    return np.concatenate([X.ravel(), Theta.ravel()])

def unflatten_params(flattened_XTheta, num_movies, num_users, num_features):
    """ Extract X and Theta from a flattened array """
    first_X = num_movies * num_features
    X = flattened_XTheta[:first_X].reshape(num_movies, num_features)
    Theta = flattened_XTheta[first_X:].reshape(num_users, num_features)
    return X, Theta

### **3. 协同过滤算法**

In [9]:
def compute_cost(params, num_movies, num_users, num_features, Y, R, lambda_reg=0):
    X, Theta = unflatten_params(params, num_movies, num_users, num_features)
    prediction_errors = (X @ Theta.T - Y) * R
    cost = 0.5 * np.sum(prediction_errors**2) + (lambda_reg/2.) * (np.sum(Theta**2) + np.sum(X**2))
    return cost

In [10]:
params = flatten_params(X, Theta)
print(compute_cost(params, num_movies, num_users, num_features, Y, R, lambda_reg=0))

27918.64012454421


In [11]:
def compute_gradient(params, num_movies, num_users, num_features, Y, R, lambda_reg=0):
    X, Theta = unflatten_params(params, num_movies, num_users, num_features)
    prediction_errors = (X @ Theta.T - Y) * R
    grad_X = prediction_errors @ Theta + lambda_reg * X
    grad_Theta = prediction_errors.T @ X + lambda_reg * Theta
    return flatten_params(grad_X, grad_Theta)
print(len(compute_gradient(params, num_movies, num_users, num_features, Y, R)))

26250


In [12]:
def collaborative_filtering(Y, R, num_features, initial_X, initial_Theta, lambda_reg=0):
    num_movies, num_users = Y.shape
    initial_params = flatten_params(initial_X, initial_Theta)

    def callback_func(params):
        cost = compute_cost(params, num_movies, num_users, num_features, Y, R, lambda_reg)
        print(f"Current cost: {cost:.4f}")

    result = minimize(
        fun=lambda params: compute_cost(params, num_movies, num_users, num_features, Y, R, lambda_reg),
        jac=lambda params: compute_gradient(params, num_movies, num_users, num_features, Y, R, lambda_reg),
        x0=initial_params,
        method='TNC',
        callback=callback_func
    )

    X_final, Theta_final = unflatten_params(result.x, num_movies, num_users, num_features)
    return X_final, Theta_final

In [13]:
Y_norm = Y - Y.mean()
X_optimized, Theta_optimized = collaborative_filtering(Y_norm, R, num_features, X, Theta, lambda_reg=1)

print("Optimized X:", X_optimized)
print("Optimized Theta:", Theta_optimized)

Current cost: 32034.0495
Current cost: 31197.4967
Current cost: 30411.7811
Current cost: 30043.1585
Current cost: 29905.1804
Current cost: 29828.9493
Current cost: 29793.1699
Current cost: 29752.0114
Current cost: 29742.0871
Current cost: 29716.3399
Current cost: 29714.4318
Current cost: 29692.6849
Current cost: 29673.6206
Current cost: 29657.8697
Current cost: 29645.7841
Current cost: 29635.9516
Current cost: 29630.8295
Current cost: 29628.5156
Current cost: 29621.5419
Current cost: 29619.7662
Current cost: 29615.1756
Current cost: 29612.0605
Current cost: 29608.3777
Current cost: 29607.1438
Current cost: 29606.0043
Current cost: 29605.5690
Current cost: 29605.0898
Current cost: 29604.2683
Current cost: 29603.5302
Current cost: 29603.2532
Current cost: 29602.9015
Current cost: 29602.7603
Current cost: 29602.6438
Current cost: 29602.4940
Current cost: 29602.4590
Current cost: 29602.3824
Current cost: 29602.2882
Current cost: 29602.2322
Current cost: 29602.1341
Current cost: 29602.1197


### **4. 推荐电影**

In [14]:
def predict_ratings(X, Theta):
    """Predict ratings by computing the dot product of X and Theta.T"""
    return X @ Theta.T

In [15]:
def load_movie_list(filepath):
    """Load movie list from a text file."""
    movie_list = []
    with open(filepath, 'r', encoding='Latin-1') as file:
        for line in file:
            parts = line.strip().split(' ')
            movie_id = int(parts[0])
            title = ' '.join(parts[1:])  # Join the rest which is the movie title
            movie_list.append((movie_id, title))
    movie_df = pd.DataFrame(movie_list, columns=['movie_id', 'title'])
    return movie_df

In [16]:
def recommend_movies(predicted_ratings, movie_list, num_recommendations=5):
    """Recommend movies for each user based on predicted ratings.
    
    Args:
        predicted_ratings (np.array): The matrix of predicted ratings.
        movie_list (pd.DataFrame): The DataFrame of movies with 'movie_id' and 'title'.
        num_recommendations (int): The number of top recommendations to return for each user.

    Returns:
        dict: A dictionary containing recommended movie ids, titles, and predicted ratings.
    """
    movie_recommendations = {}
    for user_id in range(predicted_ratings.shape[1]):
        user_ratings = predicted_ratings[:, user_id]
        top_movie_indices = user_ratings.argsort()[::-1][:num_recommendations]
        top_movie_ids = movie_list.iloc[top_movie_indices]['movie_id'].values
        top_movie_titles = movie_list.iloc[top_movie_indices]['title'].values
        top_movie_ratings = user_ratings[top_movie_indices]
        recommendations = pd.DataFrame({
            'movie_id': top_movie_ids,
            'title': top_movie_titles,
            'predicted_rating': top_movie_ratings
        })
        movie_recommendations[user_id] = recommendations
    return movie_recommendations

In [17]:
movie_file_path = "../data/movie_ids.txt"
movie_list = load_movie_list(movie_file_path)

# 预测评分
predicted_ratings = predict_ratings(X_optimized, Theta_optimized)

# 为每个用户推荐评分最高的电影
user_recommendations = recommend_movies(predicted_ratings, movie_list)

# 输出第一个用户的推荐电影
print("Recommended movies for User 1:")
print(user_recommendations[10])

Recommended movies for User 1:
   movie_id                       title  predicted_rating
0       902    Big Lebowski, The (1998)          4.787682
1       613       My Man Godfrey (1936)          4.765411
2       347          Wag the Dog (1997)          4.761938
3       709    Strictly Ballroom (1992)          4.584831
4       169  Wrong Trousers, The (1993)          4.507411
