In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import requests
from io import BytesIO
from zipfile import ZipFile

# Load the Movielens 100k dataset
def load_movielens_data():
    """Load the Movielens 100k dataset"""
    url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
    
    # Check if the data has been downloaded
    if not os.path.exists('ml-100k'):
        print("Download dataset...")
        response = requests.get(url)
        zip_file = ZipFile(BytesIO(response.content))
        zip_file.extractall()
        print("Dataset download and decompression completed")
    
    # Load rating data
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)
    
    # Loading movie data for display
    movie_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 
                     'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    movies_df = pd.read_csv('ml-100k/u.item', sep='|', names=movie_columns, encoding='latin-1')
    
    return ratings_df, movies_df

# Build a user item rating matrix
def create_utility_matrix(ratings_df):
    """Create a user item rating matrix and centralize it"""
    # Create the original rating matrix
    utility_matrix = ratings_df.pivot(index='user_id', columns='item_id', values='rating')
    

    user_means = utility_matrix.mean(axis=1)
    
    # Centralized processing: Subtract the user's average rating from each rating
    utility_matrix_centered = utility_matrix.sub(user_means, axis=0)
    
 
    utility_matrix_centered = utility_matrix_centered.fillna(0)
    
    return utility_matrix, utility_matrix_centered, user_means

# Calculate similarity and find the most similar user
def find_similar_users(target_user_id, utility_matrix_centered, n=10):


    target_user_vector = utility_matrix_centered.loc[target_user_id].values.reshape(1, -1)
    

    similarities = cosine_similarity(target_user_vector, utility_matrix_centered)[0]
    

    similarity_df = pd.DataFrame({
        'user_id': utility_matrix_centered.index,
        'similarity': similarities
    })
    

    similarity_df = similarity_df[similarity_df['user_id'] != target_user_id]
    

    top_similar_users = similarity_df.sort_values('similarity', ascending=False).head(n)
    
    return top_similar_users

# predict
def predict_rating(target_user_id, item_id, utility_matrix, utility_matrix_centered, user_means, top_similar_users):

    similar_users_ratings = utility_matrix.loc[top_similar_users['user_id'], item_id]
    

    rated_users = similar_users_ratings.dropna()
    
    if len(rated_users) == 0:

        return user_means.loc[target_user_id]
    

    similarities = top_similar_users.set_index('user_id').loc[rated_users.index, 'similarity']
    
    # Calculate weighted average score (simple average, as similarity has already been taken into account)
    predicted_rating = rated_users.mean()
    
    # Convert centralized scoring back to the original scoring scale
    predicted_rating_original = predicted_rating + user_means.loc[target_user_id]
    
    return predicted_rating_original


def main():
    #Load data
    ratings_df, movies_df = load_movielens_data()
    
    #Create rating matrix
    utility_matrix, utility_matrix_centered, user_means = create_utility_matrix(ratings_df)
    
    # Target users and items
    target_user_id = 1
    target_item_id = 508
    
    # Find the 10 most similar users
    top_similar_users = find_similar_users(target_user_id, utility_matrix_centered, n=10)
    
    # prediction score
    predicted_rating = predict_rating(
        target_user_id, 
        target_item_id, 
        utility_matrix, 
        utility_matrix_centered, 
        user_means, 
        top_similar_users
    )
    

    print(f"\nTarget users: {target_user_id}")
    print(f" Target item(ID: {target_item_id}): {movies_df.loc[movies_df['item_id'] == target_item_id, 'title'].values[0]}")
    
    print("\nThe 10 most similar users:")
    for _, user in top_similar_users.iterrows():
        print(f"user's ID: {user['user_id']}, similarity: {user['similarity']:.4f}")
    
    print(f"\nPredicting Users {target_user_id}  rating for item {target_item_id} to be: {predicted_rating:.2f}")

if __name__ == "__main__":
    main()    


Target users: 1
 Target item(ID: 508): People vs. Larry Flynt, The (1996)

The 10 most similar users:
user's ID: 773.0, similarity: 0.2048
user's ID: 868.0, similarity: 0.2023
user's ID: 592.0, similarity: 0.1966
user's ID: 880.0, similarity: 0.1958
user's ID: 429.0, similarity: 0.1907
user's ID: 276.0, similarity: 0.1875
user's ID: 916.0, similarity: 0.1864
user's ID: 222.0, similarity: 0.1824
user's ID: 457.0, similarity: 0.1823
user's ID: 8.0, similarity: 0.1809

Predicting Users 1  rating for item 508 to be: 7.81


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import requests
from io import BytesIO
from zipfile import ZipFile

# Data loading and preprocessing
def load_movielens_data():
    """Download and load the MovieLens 100k dataset"""
    url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
    
    # Check if data already exists
    if not os.path.exists('ml-100k'):
        print("Downloading dataset...")
        response = requests.get(url)
        zip_file = ZipFile(BytesIO(response.content))
        zip_file.extractall()
        print("Dataset downloaded and extracted")
    
    # Load ratings data
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)
    
    # Load movie data for display purposes
    movie_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 
                     'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    movies_df = pd.read_csv('ml-100k/u.item', sep='|', names=movie_columns, encoding='latin-1')
    
    return ratings_df, movies_df

# Create user-item rating matrix and center the data by user
def create_utility_matrix(ratings_df):
    """Create user-item rating matrix and center by user ratings"""
    # Create the raw rating matrix
    utility_matrix = ratings_df.pivot(index='user_id', columns='item_id', values='rating')
    
    # Calculate mean rating for each user
    user_means = utility_matrix.mean(axis=1)
    
    # Center the data: subtract each user's mean rating from their ratings
    utility_matrix_centered = utility_matrix.sub(user_means, axis=0)
    
    # Fill NaN with 0 (indicating no rating)
    utility_matrix_centered = utility_matrix_centered.fillna(0)
    
    return utility_matrix, utility_matrix_centered, user_means

def analyze_user_item_similarity(user_ids, item_id, utility_matrix_centered, movies_df):
    """Build user profiles and calculate cosine similarity to a specific item using movie metadata"""
    results = {}
    
    # 提取电影元数据特征（类型信息）
    genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    
    # 创建物品特征矩阵（物品ID -> 类型向量）
    item_features = movies_df.set_index('item_id')[genre_columns]
    
    # 获取目标物品的特征向量
    item_vector = item_features.loc[item_id].values.reshape(1, -1)
    print(f"Item {item_id} feature vector shape: {item_vector.shape}")
    
    # 构建用户偏好向量（用户对各类型的偏好度）
    user_profiles = {}
    for user_id in user_ids:
        # 用户对各类型的偏好度 = 用户评分矩阵 × 类型矩阵
        user_ratings = utility_matrix_centered.loc[user_id]
        user_profile = np.dot(user_ratings, item_features)
        
        # 归一化用户偏好向量
        if np.linalg.norm(user_profile) > 0:
            user_profile = user_profile / np.linalg.norm(user_profile)
        
        user_profiles[user_id] = user_profile.reshape(1, -1)
        print(f"User {user_id} profile shape: {user_profiles[user_id].shape}")
        
        # 计算用户与物品的余弦相似度
        similarity = cosine_similarity(user_profiles[user_id], item_vector)[0][0]
        distance = 1 - similarity
        
        results[user_id] = {
            'similarity': similarity,
            'distance': distance
        }
    
    # 获取电影标题
    movie_title = movies_df.loc[movies_df['item_id'] == item_id, 'title'].values[0]
    
    return results, movie_title
# Determine which user to recommend the movie to
def recommend_to_user(results):
    """Determine which user the movie should be recommended to based on similarity"""
    # Recommend to the user with higher similarity (lower distance)
    recommended_user = max(results, key=lambda x: results[x]['similarity'])
    return recommended_user

# Main function
def main():
    # Load data
    ratings_df, movies_df = load_movielens_data()
    
    # Create utility matrix
    utility_matrix, utility_matrix_centered, user_means = create_utility_matrix(ratings_df)
    
    # Users and item to analyze
    user_ids = [200, 15]
    item_id = 95
    
    # Analyze user-item similarity
    results, movie_title = analyze_user_item_similarity(
        user_ids, 
        item_id, 
        utility_matrix_centered, 
        movies_df
    )
    
    # Determine recommended user
    recommended_user = recommend_to_user(results)
    
    # Output results
    print(f"\nAnalyzing Movie (ID: {item_id}): {movie_title}")
    
    for user_id in user_ids:
        print(f"\nUser {user_id}:")
        print(f"  Cosine Similarity to Movie {item_id}: {results[user_id]['similarity']:.4f}")
        print(f"  Cosine Distance to Movie {item_id}: {results[user_id]['distance']:.4f}")
    
    print(f"\nRecommendation: Movie {item_id} should be recommended to User {recommended_user}")
    print(f"Reason: User {recommended_user} has a higher similarity score to the movie's profile.")

if __name__ == "__main__":
    main()    

Item 95 feature vector shape: (1, 19)
User 200 profile shape: (1, 19)
User 15 profile shape: (1, 19)

Analyzing Movie (ID: 95): Aladdin (1992)

User 200:
  Cosine Similarity to Movie 95: -0.2652
  Cosine Distance to Movie 95: 1.2652

User 15:
  Cosine Similarity to Movie 95: -0.3259
  Cosine Distance to Movie 95: 1.3259

Recommendation: Movie 95 should be recommended to User 200
Reason: User 200 has a higher similarity score to the movie's profile.
