In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import requests
from io import BytesIO
from zipfile import ZipFile

def load_movielens_data():
    """Load the MovieLens 100k ratings and movie metadata as DataFrames.

    If there is no ``ml-100k`` directory in the current working directory,
    the archive is downloaded from GroupLens and extracted there first.

    Returns:
        tuple: ``(ratings_df, movies_df)``.
            ``ratings_df`` is read from ``ml-100k/u.data`` with columns
            ``user_id``, ``item_id``, ``rating``, ``timestamp``.
            ``movies_df`` is read from ``ml-100k/u.item`` with ``item_id``,
            ``title``, three date/URL columns and 19 genre columns.

    Raises:
        requests.HTTPError: if the download answers with a 4xx/5xx status.
        requests.Timeout: if the server stops responding during the download.
    """
    url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

    # Check if the data has been downloaded
    if not os.path.exists('ml-100k'):
        print("Download dataset...")
        # A timeout makes a stalled connection raise instead of hanging the kernel.
        response = requests.get(url, timeout=60)
        # Fail on an HTTP error here rather than feeding an error page to ZipFile.
        response.raise_for_status()
        # Context manager guarantees the archive handle is closed after extraction.
        with ZipFile(BytesIO(response.content)) as zip_file:
            zip_file.extractall()
        print("Dataset download and decompression completed")

    # Load rating data (tab-separated, no header row)
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

    # Load movie data for display (pipe-separated, no header row, Latin-1 text)
    movie_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
                     'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    movies_df = pd.read_csv('ml-100k/u.item', sep='|', names=movie_columns, encoding='latin-1')

    return ratings_df, movies_df

# Parameters of the prediction (kept together so they are easy to change)
TARGET_USER = 1    # user whose rating is being predicted
TARGET_ITEM = 508  # item whose rating is being predicted
TOP_K = 10         # number of most-similar users to consult

# Load the data
ratings_df, movies_df = load_movielens_data()

# Create utility matrix (user-item matrix); unrated cells are NaN
utility_matrix = ratings_df.pivot_table(index='user_id', columns='item_id', values='rating')

# Center the ratings by subtracting each user's mean rating
user_means = utility_matrix.mean(axis=1)
centered_matrix = utility_matrix.sub(user_means, axis=0)

# Fill NaN values with 0 (neutral rating after centering)
centered_matrix_filled = centered_matrix.fillna(0)

# Calculate cosine similarity between users
user_similarities = cosine_similarity(centered_matrix_filled)

# Convert to DataFrame for easier handling (rows and columns labelled by user_id)
similarity_df = pd.DataFrame(user_similarities,
                             index=centered_matrix_filled.index,
                             columns=centered_matrix_filled.index)

# Top-K most similar users to the target user.
# The target user is removed by label, not by position: slicing off the first
# sorted entry only works if the user's self-similarity sorts strictly first,
# which a tie or a rounding difference can break.
similar_users = (
    similarity_df[TARGET_USER]
    .drop(index=TARGET_USER)
    .sort_values(ascending=False)
    .head(TOP_K)
    .index.tolist()
)

# Ratings for the target item from the similar users (NaN where not rated).
# The variable keeps its original name so later cells that use it still work.
item_508_ratings = utility_matrix.loc[similar_users, TARGET_ITEM]

# Average the target item's rating over the similar users,
# considering only users who actually rated the item
valid_ratings = item_508_ratings.dropna()
if len(valid_ratings) > 0:
    predicted_rating = valid_ratings.mean()
else:
    # None of the similar users rated the item: fall back to the global
    # average over all ratings (not the mean of per-user means, which weights
    # every user equally regardless of how many ratings they gave).
    predicted_rating = ratings_df['rating'].mean()

print(f"Top {TOP_K} similar users to user {TARGET_USER}: {similar_users}")
print(f"Ratings for item {TARGET_ITEM} from similar users: {valid_ratings.to_dict()}")
print(f"Predicted rating for user {TARGET_USER} on item {TARGET_ITEM}: {predicted_rating:.2f}")

Top 10 similar users to user 1: [773, 868, 592, 880, 429, 276, 916, 222, 457, 8]
Ratings for item 508 from similar users: {592: 5.0, 880: 4.0, 429: 4.0, 276: 5.0, 222: 3.0}
Predicted rating for user 1 on item 508: 4.20


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import requests
from io import BytesIO
from zipfile import ZipFile

# Data loading and preprocessing
def load_movielens_data():
    """Download (if needed) and load the MovieLens 100k dataset.

    If there is no ``ml-100k`` directory in the current working directory,
    the archive is downloaded from GroupLens and extracted there first.

    Returns:
        tuple: ``(ratings_df, movies_df)``.
            ``ratings_df`` is read from ``ml-100k/u.data`` with columns
            ``user_id``, ``item_id``, ``rating``, ``timestamp``.
            ``movies_df`` is read from ``ml-100k/u.item`` with ``item_id``,
            ``title``, three date/URL columns and 19 genre columns.

    Raises:
        requests.HTTPError: if the download answers with a 4xx/5xx status.
        requests.Timeout: if the server stops responding during the download.
    """
    url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

    # Check if data already exists
    if not os.path.exists('ml-100k'):
        print("Downloading dataset...")
        # A timeout makes a stalled connection raise instead of hanging the kernel.
        response = requests.get(url, timeout=60)
        # Fail on an HTTP error here rather than feeding an error page to ZipFile.
        response.raise_for_status()
        # Context manager guarantees the archive handle is closed after extraction.
        with ZipFile(BytesIO(response.content)) as zip_file:
            zip_file.extractall()
        print("Dataset downloaded and extracted")

    # Load ratings data (tab-separated, no header row)
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings_df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

    # Load movie data for display purposes (pipe-separated, no header row, Latin-1 text)
    movie_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
                     'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    movies_df = pd.read_csv('ml-100k/u.item', sep='|', names=movie_columns, encoding='latin-1')

    return ratings_df, movies_df

# Create user-item rating matrix and center the data by user
def create_utility_matrix(ratings_df):
    """Build the user-by-item rating matrix and a per-user mean-centered copy.

    Args:
        ratings_df: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns; each (user_id, item_id) pair must occur at most once,
            since ``DataFrame.pivot`` rejects duplicates.

    Returns:
        tuple: ``(utility_matrix, utility_matrix_centered, user_means)``.
            ``utility_matrix`` holds the raw ratings with NaN where a user
            has not rated an item.
            ``utility_matrix_centered`` holds each rating minus that user's
            mean rating, with 0 in the unrated cells.
            ``user_means`` is the Series of per-user mean ratings.
    """
    # Rows are users, columns are items, unrated cells stay NaN.
    raw_ratings = ratings_df.pivot(index='user_id', columns='item_id', values='rating')

    # NaN cells are skipped, so this is the mean over rated items only.
    per_user_mean = raw_ratings.mean(axis=1)

    # Subtract row-wise, then mark "no rating" as 0 in the centered matrix.
    centered_ratings = raw_ratings.sub(per_user_mean, axis=0).fillna(0)

    return raw_ratings, centered_ratings, per_user_mean

def analyze_user_item_similarity(user_ids, item_id, utility_matrix_centered, movies_df):
    """Score how well one movie's genre vector matches each user's genre profile.

    A user's profile is the sum of the movies' genre vectors weighted by that
    user's row of ``utility_matrix_centered``. The profile is compared with
    the target movie's genre vector using cosine similarity.

    Args:
        user_ids: iterable of row labels of ``utility_matrix_centered``.
        item_id: ``item_id`` of the movie to compare the users against.
        utility_matrix_centered: DataFrame with users as rows and item ids as
            columns (no NaN values).
        movies_df: DataFrame with ``item_id``, ``title`` and the 19 genre
            columns listed in ``genre_columns``.

    Returns:
        tuple: ``(results, movie_title)``. ``results`` maps each user id to
        ``{'similarity': s, 'distance': 1 - s}``; ``movie_title`` is the
        title of ``item_id`` in ``movies_df``.

    Raises:
        KeyError: if ``item_id`` is not in ``movies_df`` or a user id is not
            a row of ``utility_matrix_centered``.
    """
    results = {}

    # Movie metadata features (genre information)
    genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

    # Item feature matrix (item_id -> genre vector)
    item_features = movies_df.set_index('item_id')[genre_columns]

    # Feature vector of the target item
    item_vector = item_features.loc[item_id].values.reshape(1, -1)
    print(f"Item {item_id} feature vector shape: {item_vector.shape}")

    # Align the genre rows with the rating-matrix columns by item id.
    # The dot product below is positional, so without this step it is only
    # correct when both frames list the same items in the same order.
    # Items that have ratings but no metadata get an all-zero genre vector and
    # therefore contribute nothing to a profile.
    aligned_features = item_features.reindex(
        utility_matrix_centered.columns, fill_value=0
    ).to_numpy()

    for user_id in user_ids:
        # Genre preference = centered ratings (1 x items) @ genres (items x 19)
        user_ratings = utility_matrix_centered.loc[user_id].to_numpy()
        user_profile = np.dot(user_ratings, aligned_features)

        # Normalize the profile to unit length (an all-zero profile is left as is)
        profile_norm = np.linalg.norm(user_profile)
        if profile_norm > 0:
            user_profile = user_profile / profile_norm

        user_profile = user_profile.reshape(1, -1)
        print(f"User {user_id} profile shape: {user_profile.shape}")

        # Cosine similarity between the user profile and the item vector
        similarity = cosine_similarity(user_profile, item_vector)[0][0]
        distance = 1 - similarity

        results[user_id] = {
            'similarity': similarity,
            'distance': distance
        }

    # Title of the target movie
    movie_title = movies_df.loc[movies_df['item_id'] == item_id, 'title'].values[0]

    return results, movie_title
# Determine which user to recommend the movie to
def recommend_to_user(results):
    """Pick the user whose profile is most similar to the movie.

    Args:
        results: non-empty mapping of user id -> dict with a 'similarity' entry.

    Returns:
        The key of ``results`` with the largest 'similarity' value; on a tie
        the key that comes first in the mapping's iteration order.
    """
    best_user, _ = max(results.items(), key=lambda entry: entry[1]['similarity'])
    return best_user

# Main function
def main():
    """Compare two users against one movie and print which of them should get it.

    Loads the dataset, builds the centered rating matrix, scores users 200 and
    15 against movie 95 by genre-profile cosine similarity, and prints the
    per-user scores followed by the recommendation.
    """
    # Users and item to analyze
    user_ids = [200, 15]
    item_id = 95

    # Only the centered matrix is needed here; the raw matrix and the
    # per-user means returned alongside it are not used.
    ratings_df, movies_df = load_movielens_data()
    _, utility_matrix_centered, _ = create_utility_matrix(ratings_df)

    # Score every user against the movie, then pick the best match
    results, movie_title = analyze_user_item_similarity(
        user_ids, item_id, utility_matrix_centered, movies_df
    )
    recommended_user = recommend_to_user(results)

    # Report
    print(f"\nAnalyzing Movie (ID: {item_id}): {movie_title}")

    for user_id in user_ids:
        print(f"\nUser {user_id}:")
        print(f"  Cosine Similarity to Movie {item_id}: {results[user_id]['similarity']:.4f}")
        print(f"  Cosine Distance to Movie {item_id}: {results[user_id]['distance']:.4f}")

    print(f"\nRecommendation: Movie {item_id} should be recommended to User {recommended_user}")
    print(f"Reason: User {recommended_user} has a higher similarity score to the movie's profile.")


# Run the analysis when the cell/script is executed directly.
if __name__ == "__main__":
    main()

Item 95 feature vector shape: (1, 19)
User 200 profile shape: (1, 19)
User 15 profile shape: (1, 19)

Analyzing Movie (ID: 95): Aladdin (1992)

User 200:
  Cosine Similarity to Movie 95: -0.2652
  Cosine Distance to Movie 95: 1.2652

User 15:
  Cosine Similarity to Movie 95: -0.3259
  Cosine Distance to Movie 95: 1.3259

Recommendation: Movie 95 should be recommended to User 200
Reason: User 200 has a higher similarity score to the movie's profile.
