# Movie Recommendation System Analysis 🎬

This notebook implements a comprehensive movie recommendation system using collaborative filtering, similarity matrices, and machine learning models. We'll build a system that predicts user ratings and recommends movies using the MovieLens 20M dataset.

## Project Overview

**Goals:**
- Build a collaborative filtering system using user-user and item-item similarity
- Predict user ratings with high accuracy (target RMSE < 0.85)
- Create intelligent features for machine learning models
- Compare multiple ML models and select the best performer
- Generate personalized movie recommendations

**Dataset:** MovieLens 20M
- 20 million ratings
- 138,000 users
- 27,000 movies

**Methods:**
1. **Similarity-based Collaborative Filtering**
2. **Feature Engineering & Machine Learning**
3. **Hybrid Recommendation System**

## 1. Import Required Libraries and Load Data 📚

Let's import all the necessary libraries for data analysis, machine learning, and visualization.

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from scipy.spatial.distance import cosine

# Advanced ML libraries
# XGBoost / LightGBM are optional. Both imports share one try-block, so if
# `xgboost` imports but `lightgbm` does not, `xgb` is bound and `lgb` is not.
# The training cell later relies on NameError to detect a missing module.
try:
    import xgboost as xgb
    import lightgbm as lgb
    print("✅ XGBoost and LightGBM available")
except ImportError:
    print("⚠️ XGBoost or LightGBM not available - install with pip install xgboost lightgbm")

# Plotting setup
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Printed unconditionally, including when the optional imports above failed.
print("📚 All libraries imported successfully!")

In [None]:
# Data paths (relative, so they resolve against the current working directory)
DATA_PATH = Path("../data")
RAW_DATA_PATH = DATA_PATH / "raw"
PROCESSED_DATA_PATH = DATA_PATH / "processed"

# Create directories if they don't exist.
# Only the processed directory is created here; the raw directory is merely
# probed for files by the loader below.
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

def load_movielens_data():
    """Return ``(ratings_df, movies_df)`` for the MovieLens dataset.

    Each table is read from its CSV under ``RAW_DATA_PATH`` when the file
    exists; a missing file is replaced by synthetic data. Any exception
    raised along the way is reported and both tables are regenerated from
    the synthetic generators (best-effort fallback).
    """
    try:
        # Ratings table
        ratings_file = RAW_DATA_PATH / "ratings.csv"
        if not ratings_file.exists():
            print("⚠️ ratings.csv not found in data/raw/")
            print("📥 Please download MovieLens 20M dataset from:")
            print("   https://grouplens.org/datasets/movielens/20m/")
            # Fall back to generated ratings so the notebook can still run
            print("🔧 Creating sample data for demonstration...")
            ratings = create_sample_data()
        else:
            ratings = pd.read_csv(ratings_file)
            print(f"✅ Loaded {len(ratings):,} ratings")

        # Movies table
        movies_file = RAW_DATA_PATH / "movies.csv"
        if not movies_file.exists():
            print("⚠️ movies.csv not found - creating sample movie data")
            movies = create_sample_movies(ratings['movieId'].unique())
        else:
            movies = pd.read_csv(movies_file)
            print(f"✅ Loaded {len(movies):,} movies")

    except Exception as e:
        print(f"❌ Error loading data: {e}")
        print("🔧 Creating sample data...")
        ratings = create_sample_data()
        movies = create_sample_movies(ratings['movieId'].unique())

    return ratings, movies

def create_sample_data():
    """Create a synthetic ratings table for demonstration.

    Returns:
        DataFrame with columns ``userId``, ``movieId``, ``rating`` and
        ``timestamp``. 50,000 interactions are drawn for 5,000 users and
        1,000 movies, then duplicate (userId, movieId) pairs are dropped, so
        the result holds at most 50,000 rows.

    Note:
        Seeds NumPy's *global* random state with 42, which makes the output
        reproducible but also affects later ``np.random`` calls.
    """
    np.random.seed(42)
    n_users, n_movies, n_ratings = 5000, 1000, 50000

    # Uniformly random user-movie pairs
    user_ids = np.random.choice(range(1, n_users + 1), n_ratings)
    movie_ids = np.random.choice(range(1, n_movies + 1), n_ratings)

    # Rating distribution skewed toward higher ratings
    ratings = np.random.choice([1, 2, 3, 4, 5], n_ratings,
                               p=[0.05, 0.1, 0.2, 0.35, 0.3])

    # Independent random timestamps, one per rating (they are NOT ordered
    # per user). Drawn in a single vectorised call instead of a Python loop.
    timestamps = 1000000000 + np.random.randint(
        0, 500000000, size=n_ratings, dtype=np.int64
    )

    sample_df = pd.DataFrame({
        'userId': user_ids,
        'movieId': movie_ids,
        'rating': ratings,
        'timestamp': timestamps
    })

    # A user may rate a given movie only once: keep the first occurrence
    sample_df = sample_df.drop_duplicates(subset=['userId', 'movieId'])

    print(f"🎲 Created sample dataset with {len(sample_df):,} ratings")
    return sample_df

def create_sample_movies(movie_ids):
    """Create a synthetic movies table for the given movie ids.

    Args:
        movie_ids: Iterable of movie identifiers; one row is produced per id,
            in iteration order.

    Returns:
        DataFrame with columns ``movieId``, ``title`` and ``genres``. The
        title embeds a random year in [1950, 2022] and ``genres`` is a
        ``|``-joined list of 1-3 distinct genres. The three columns are
        present even when ``movie_ids`` is empty, so a later merge on
        ``movieId`` still works.

    Note:
        Draws from NumPy's global random state.
    """
    genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
              'Documentary', 'Drama', 'Fantasy', 'Horror', 'Musical', 'Mystery',
              'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

    movies_data = []
    for movie_id in movie_ids:
        # Random release year for the title
        year = np.random.randint(1950, 2023)

        # 1-3 distinct genres per movie
        n_genres = np.random.randint(1, 4)
        movie_genres = np.random.choice(genres, n_genres, replace=False)

        movies_data.append({
            'movieId': movie_id,
            'title': f"Movie {movie_id} ({year})",
            'genres': '|'.join(movie_genres)
        })

    # Explicit columns keep the schema stable for empty input
    return pd.DataFrame(movies_data, columns=['movieId', 'title', 'genres'])

# Load the data (falls back to synthetic tables when the CSV files are absent)
print("📊 Loading MovieLens dataset...")
ratings_df, movies_df = load_movielens_data()

## 2. Data Exploration and Preprocessing 🔍

Let's explore the dataset structure, analyze rating distributions, and understand user and movie behavior patterns.

In [None]:
# Basic dataset information
print("📈 Dataset Overview")
print("=" * 50)
print(f"📊 Ratings DataFrame: {ratings_df.shape}")
print(f"🎬 Movies DataFrame: {movies_df.shape}")
print(f"👥 Unique Users: {ratings_df['userId'].nunique():,}")
print(f"🎭 Unique Movies: {ratings_df['movieId'].nunique():,}")
print(f"⭐ Total Ratings: {len(ratings_df):,}")

# Display first few rows
print("\n🔍 First 5 ratings:")
display(ratings_df.head())

print("\n🎬 First 5 movies:")
display(movies_df.head())

# Data types and missing values.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print("\n📋 Ratings Data Info:")
ratings_df.info()

print("\n📋 Movies Data Info:")
movies_df.info()

In [None]:
# Rating Distribution Analysis
# Draws a 2x2 grid of histograms (ratings, per-user activity, per-movie
# popularity, per-movie mean rating) and prints summary statistics.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Rating Distribution
ratings_df['rating'].hist(bins=10, ax=axes[0, 0], color='skyblue', edgecolor='black')
axes[0, 0].set_title('Rating Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Rating')
axes[0, 0].set_ylabel('Frequency')

# Add statistics (the mean is drawn as a dashed vertical line)
mean_rating = ratings_df['rating'].mean()
std_rating = ratings_df['rating'].std()
axes[0, 0].axvline(mean_rating, color='red', linestyle='--', 
                   label=f'Mean: {mean_rating:.2f}')
axes[0, 0].legend()

# 2. User Activity Distribution
# Number of rating rows per user; log-scaled y axis.
user_activity = ratings_df.groupby('userId').size()
user_activity.hist(bins=50, ax=axes[0, 1], color='lightgreen', edgecolor='black')
axes[0, 1].set_title('User Activity Distribution', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Number of Ratings per User')
axes[0, 1].set_ylabel('Number of Users')
axes[0, 1].set_yscale('log')

# 3. Movie Popularity Distribution
# Number of rating rows per movie; log-scaled y axis.
movie_popularity = ratings_df.groupby('movieId').size()
movie_popularity.hist(bins=50, ax=axes[1, 0], color='orange', edgecolor='black')
axes[1, 0].set_title('Movie Popularity Distribution', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Number of Ratings per Movie')
axes[1, 0].set_ylabel('Number of Movies')
axes[1, 0].set_yscale('log')

# 4. Average Rating per Movie
# Unweighted: a movie with one rating counts as much as one with thousands.
movie_avg_ratings = ratings_df.groupby('movieId')['rating'].mean()
movie_avg_ratings.hist(bins=30, ax=axes[1, 1], color='purple', edgecolor='black')
axes[1, 1].set_title('Average Rating per Movie', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Average Rating')
axes[1, 1].set_ylabel('Number of Movies')

plt.tight_layout()
plt.show()

# Statistical Summary
print("\n📊 Statistical Summary")
print("=" * 50)
print(f"⭐ Average Rating: {mean_rating:.3f} ± {std_rating:.3f}")
print(f"📈 Rating Range: {ratings_df['rating'].min()} - {ratings_df['rating'].max()}")
print(f"👥 Avg Ratings per User: {user_activity.mean():.1f}")
print(f"📽️ Avg Ratings per Movie: {movie_popularity.mean():.1f}")
# The two lines below print the largest counts, not the user/movie ids.
print(f"🎯 Most Active User: {user_activity.max()} ratings")
print(f"🌟 Most Popular Movie: {movie_popularity.max()} ratings")

In [None]:
# Data Sparsity Analysis
def analyze_sparsity(ratings_df):
    """Print and return the sparsity of the user x movie rating grid.

    Sparsity is ``1 - n_ratings / (n_users * n_movies)``, where ``n_ratings``
    is the number of rows and the user/movie counts are the distinct values
    of ``userId`` / ``movieId``.

    Args:
        ratings_df: DataFrame with at least ``userId`` and ``movieId`` columns.

    Returns:
        The sparsity as a float, or ``nan`` when the frame has no users or no
        movies (the ratio is undefined and would otherwise divide by zero).
    """
    n_users = ratings_df['userId'].nunique()
    n_movies = ratings_df['movieId'].nunique()
    n_ratings = len(ratings_df)

    # Size of the full user x movie grid
    total_possible = n_users * n_movies
    if total_possible:
        sparsity = 1 - (n_ratings / total_possible)
    else:
        sparsity = float("nan")

    print("🕳️ Sparsity Analysis")
    print("=" * 30)
    print(f"Users: {n_users:,}")
    print(f"Movies: {n_movies:,}")
    print(f"Actual Ratings: {n_ratings:,}")
    print(f"Possible Ratings: {total_possible:,}")
    print(f"Sparsity: {sparsity:.4f} ({sparsity*100:.2f}%)")

    return sparsity

sparsity = analyze_sparsity(ratings_df)

# Data Preprocessing
print("\n🔧 Data Preprocessing")
print("=" * 30)

# Merge ratings with movie information
# Left join: ratings whose movieId is absent from movies_df are kept, with
# missing title/genres.
merged_df = ratings_df.merge(movies_df, on='movieId', how='left')
print(f"✅ Merged dataset: {merged_df.shape}")

# Filter users and movies with minimum number of ratings
MIN_USER_RATINGS = 20
MIN_MOVIE_RATINGS = 10

print(f"\n📊 Applying filters:")
print(f"   Minimum ratings per user: {MIN_USER_RATINGS}")
print(f"   Minimum ratings per movie: {MIN_MOVIE_RATINGS}")

# Count ratings per user and movie
# Both counts are taken on the unfiltered data.
user_counts = merged_df['userId'].value_counts()
movie_counts = merged_df['movieId'].value_counts()

# Filter active users and popular movies
active_users = user_counts[user_counts >= MIN_USER_RATINGS].index
popular_movies = movie_counts[movie_counts >= MIN_MOVIE_RATINGS].index

# Apply filters
# Single pass: after rows are removed, a surviving user or movie can end up
# below its threshold because the counts above are not recomputed.
filtered_df = merged_df[
    (merged_df['userId'].isin(active_users)) &
    (merged_df['movieId'].isin(popular_movies))
]

print(f"📉 After filtering:")
print(f"   Users: {merged_df['userId'].nunique():,} → {filtered_df['userId'].nunique():,}")
print(f"   Movies: {merged_df['movieId'].nunique():,} → {filtered_df['movieId'].nunique():,}")
print(f"   Ratings: {len(merged_df):,} → {len(filtered_df):,}")

# Analyze new sparsity
print(f"\n🎯 New sparsity:")
new_sparsity = analyze_sparsity(filtered_df)

# Save processed data
filtered_df.to_csv(PROCESSED_DATA_PATH / "filtered_ratings.csv", index=False)
print(f"\n💾 Saved filtered dataset to {PROCESSED_DATA_PATH / 'filtered_ratings.csv'}")

## 3. Movie-Movie Similarity Matrix 🎬➡️🎬

Let's create a movie-movie similarity matrix using cosine similarity. This will help us find movies that are similar based on user rating patterns.

In [None]:
# Create User-Item Matrix
def create_user_item_matrix(df):
    """Pivot long-format ratings into a users x movies table.

    Rows are ``userId`` values, columns are ``movieId`` values and each cell
    holds the rating (averaged when a pair occurs more than once). Pairs with
    no rating are filled with 0.
    """
    return pd.pivot_table(
        df,
        values='rating',
        index='userId',
        columns='movieId',
        fill_value=0,
    )

# Build the dense users x movies matrix from the filtered ratings.
print("🔧 Creating User-Item Matrix...")
user_item_matrix = create_user_item_matrix(filtered_df)
print(f"✅ User-Item Matrix shape: {user_item_matrix.shape}")

# Create Movie-Movie Similarity Matrix
def calculate_movie_similarity(user_item_matrix, method='cosine'):
    """Calculate the movie-to-movie similarity matrix.

    Args:
        user_item_matrix: users x movies DataFrame of ratings in which 0
            marks "not rated".
        method: ``'cosine'`` (cosine similarity between movie rating vectors)
            or ``'pearson'`` (Pearson correlation between movie columns, with
            undefined correlations replaced by 0).

    Returns:
        Square DataFrame indexed and labelled by movie id.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Validate before the progress message and before any heavy computation
    if method not in ('cosine', 'pearson'):
        raise ValueError(
            f"Unknown similarity method {method!r}; expected 'cosine' or 'pearson'"
        )

    print(f"🔄 Calculating movie similarity using {method} method...")

    # Transpose to get movies as rows
    movie_matrix = user_item_matrix.T

    if method == 'cosine':
        # Unrated cells are already 0 and so contribute nothing to the dot
        # products; the matrix is passed as is, without a 0 -> NaN -> 0 copy.
        similarity_matrix = cosine_similarity(movie_matrix.values)
    else:
        # DataFrame.corr correlates columns, i.e. movies in the users x movies
        # layout. Constant columns give NaN, which is mapped to 0.
        similarity_matrix = user_item_matrix.corr().fillna(0).values

    return pd.DataFrame(
        similarity_matrix,
        index=movie_matrix.index,
        columns=movie_matrix.index
    )

# Calculate similarity matrix
movie_similarity_df = calculate_movie_similarity(user_item_matrix, method='cosine')
print(f"✅ Movie similarity matrix: {movie_similarity_df.shape}")

# Visualize similarity matrix (sample)
# Only the first 50 movies (in index order, not a random sample) are drawn.
plt.figure(figsize=(12, 10))
sample_movies = movie_similarity_df.index[:50]  # Sample first 50 movies
sample_sim_matrix = movie_similarity_df.loc[sample_movies, sample_movies]

sns.heatmap(sample_sim_matrix, 
            cmap='RdYlBu_r', 
            center=0, 
            square=True,
            cbar_kws={'label': 'Cosine Similarity'})
plt.title('Movie-Movie Similarity Matrix (Sample)', fontsize=16, fontweight='bold')
plt.xlabel('Movie ID')
plt.ylabel('Movie ID')
plt.tight_layout()
plt.show()

# These statistics are taken over the whole matrix, diagonal
# (self-similarity) entries included.
print(f"🎯 Similarity matrix statistics:")
print(f"   Mean similarity: {movie_similarity_df.values.mean():.4f}")
print(f"   Std similarity: {movie_similarity_df.values.std():.4f}")
print(f"   Max similarity: {movie_similarity_df.values.max():.4f}")

In [None]:
# Function to find similar movies
def get_similar_movies(movie_id, similarity_matrix, movies_df, n_similar=10):
    """Find the movies most similar to a given movie.

    Args:
        movie_id: Id of the query movie.
        similarity_matrix: Square movie x movie similarity DataFrame.
        movies_df: Catalogue with ``movieId``, ``title`` and ``genres``.
        n_similar: Maximum number of neighbours to consider.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``genres`` and
        ``similarity``, ordered by decreasing similarity. Neighbours missing
        from ``movies_df`` are omitted. For backward compatibility, an
        unknown ``movie_id`` returns an explanatory string instead.
    """
    if movie_id not in similarity_matrix.index:
        return f"Movie ID {movie_id} not found in similarity matrix"

    # Drop the query movie by label rather than assuming it sorts first:
    # another movie can tie with it at 1.0, and a movie with an all-zero
    # rating vector has a self-similarity of 0.
    neighbours = (
        similarity_matrix.loc[movie_id]
        .drop(labels=movie_id, errors='ignore')
        .sort_values(ascending=False, kind='stable')
        .iloc[:n_similar]
    )

    # Index the catalogue once instead of scanning it for every neighbour;
    # keep the first row per id, as the per-row lookup did.
    catalogue = movies_df.drop_duplicates(subset='movieId').set_index('movieId')

    results = [
        {
            'movieId': sim_movie_id,
            'title': catalogue.at[sim_movie_id, 'title'],
            'genres': catalogue.at[sim_movie_id, 'genres'],
            'similarity': similarity,
        }
        for sim_movie_id, similarity in neighbours.items()
        if sim_movie_id in catalogue.index
    ]

    # Explicit columns keep the schema stable when nothing matched
    return pd.DataFrame(results, columns=['movieId', 'title', 'genres', 'similarity'])

# Example: Find movies similar to a popular movie
sample_movie_id = filtered_df['movieId'].value_counts().index[0]  # Most popular movie
sample_movie_info = movies_df[movies_df['movieId'] == sample_movie_id].iloc[0]

print(f"🎬 Finding movies similar to:")
print(f"   ID: {sample_movie_id}")
print(f"   Title: {sample_movie_info['title']}")
print(f"   Genres: {sample_movie_info['genres']}")

similar_movies = get_similar_movies(sample_movie_id, movie_similarity_df, movies_df, n_similar=10)
print(f"\n🎯 Top 10 Similar Movies:")
display(similar_movies)

# Analyze similarity distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
similarity_values = movie_similarity_df.values
# Remove self-similarity by position (the diagonal). Filtering on
# `!= 1.0` would also drop genuine off-diagonal 1.0 values and would keep
# diagonal entries that are not exactly 1.0 (rounding, all-zero vectors).
off_diagonal = ~np.eye(similarity_values.shape[0], dtype=bool)
similarity_values = similarity_values[off_diagonal]
plt.hist(similarity_values, bins=50, color='lightblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Movie Similarities', fontweight='bold')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
# Box plot of similarities for a sample movie.
# Row 0 with column 0 skipped, i.e. the first movie without its own entry.
sample_similarities = movie_similarity_df.iloc[0, 1:]  # First movie's similarities
plt.boxplot(sample_similarities, vert=True)
plt.title('Similarity Distribution for One Movie', fontweight='bold')
plt.ylabel('Cosine Similarity')

plt.tight_layout()
plt.show()

## 4. User-User Similarity Matrix 👥➡️👥

Now let's create a user-user similarity matrix to identify users with similar preferences and tastes.

In [None]:
# Create User-User Similarity Matrix
def calculate_user_similarity(user_item_matrix, method='cosine', sample_size=1000,
                              random_state=None):
    """Calculate the user-to-user similarity matrix, sampling users if needed.

    Args:
        user_item_matrix: users x movies DataFrame of ratings in which 0
            marks "not rated".
        method: ``'cosine'`` or ``'pearson'``.
        sample_size: When the matrix has more users than this, that many
            users are drawn without replacement and only they are compared.
        random_state: Optional seed for the sampling. ``None`` (the default)
            draws from NumPy's global random state, as before.

    Returns:
        Square DataFrame indexed and labelled by the (possibly sampled) user
        ids.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Validate before the progress message and before any heavy computation
    if method not in ('cosine', 'pearson'):
        raise ValueError(
            f"Unknown similarity method {method!r}; expected 'cosine' or 'pearson'"
        )

    print(f"🔄 Calculating user similarity using {method} method...")

    # Sample users for computational efficiency
    if len(user_item_matrix) > sample_size:
        print(f"📊 Sampling {sample_size} users for efficiency...")
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        sampled_users = rng.choice(user_item_matrix.index, sample_size, replace=False)
        user_matrix_sample = user_item_matrix.loc[sampled_users]
    else:
        user_matrix_sample = user_item_matrix

    if method == 'cosine':
        # Unrated cells are already 0; no 0 -> NaN -> 0 round-trip is needed
        similarity_matrix = cosine_similarity(user_matrix_sample.values)
    else:
        # corr() correlates columns, so transpose to put users in columns.
        # Undefined correlations (constant rows) are mapped to 0.
        similarity_matrix = user_matrix_sample.T.corr().fillna(0).values

    return pd.DataFrame(
        similarity_matrix,
        index=user_matrix_sample.index,
        columns=user_matrix_sample.index
    )

# Calculate user similarity matrix (with sampling for efficiency)
# sample_size=500 overrides the function default of 1000.
user_similarity_df = calculate_user_similarity(user_item_matrix, method='cosine', sample_size=500)
print(f"✅ User similarity matrix: {user_similarity_df.shape}")

# Visualize user similarity matrix (sample)
# Only the first 50 users of the similarity matrix are drawn.
plt.figure(figsize=(12, 10))
sample_users = user_similarity_df.index[:50]  # Sample first 50 users
sample_user_sim = user_similarity_df.loc[sample_users, sample_users]

sns.heatmap(sample_user_sim, 
            cmap='RdYlBu_r', 
            center=0, 
            square=True,
            cbar_kws={'label': 'Cosine Similarity'})
plt.title('User-User Similarity Matrix (Sample)', fontsize=16, fontweight='bold')
plt.xlabel('User ID')
plt.ylabel('User ID')
plt.tight_layout()
plt.show()

# These statistics are taken over the whole matrix, diagonal
# (self-similarity) entries included.
print(f"🎯 User similarity statistics:")
print(f"   Mean similarity: {user_similarity_df.values.mean():.4f}")
print(f"   Std similarity: {user_similarity_df.values.std():.4f}")
print(f"   Max similarity: {user_similarity_df.values.max():.4f}")

# Function to find similar users
def get_similar_users(user_id, similarity_matrix, user_item_matrix, n_similar=10):
    """Find the users most similar to a given user.

    Args:
        user_id: Id of the query user.
        similarity_matrix: Square user x user similarity DataFrame.
        user_item_matrix: users x movies ratings DataFrame in which values
            greater than 0 are treated as real ratings.
        n_similar: Maximum number of neighbours to return.

    Returns:
        DataFrame with columns ``userId``, ``similarity``, ``num_ratings``
        and ``avg_rating``, ordered by decreasing similarity. For backward
        compatibility, an unknown ``user_id`` returns an explanatory string
        instead.
    """
    if user_id not in similarity_matrix.index:
        return f"User ID {user_id} not found in similarity matrix"

    # Drop the query user by label rather than assuming it sorts first:
    # another user can tie with it at 1.0.
    neighbours = (
        similarity_matrix.loc[user_id]
        .drop(labels=user_id, errors='ignore')
        .sort_values(ascending=False, kind='stable')
        .iloc[:n_similar]
    )

    results = []
    for sim_user_id, similarity in neighbours.items():
        # Cells holding 0 mean "not rated", so only positive cells count
        user_ratings = user_item_matrix.loc[sim_user_id]
        rated = user_ratings[user_ratings > 0]

        results.append({
            'userId': sim_user_id,
            'similarity': similarity,
            'num_ratings': len(rated),
            'avg_rating': rated.mean()
        })

    # Explicit columns keep the schema stable when there are no neighbours
    return pd.DataFrame(results, columns=['userId', 'similarity', 'num_ratings', 'avg_rating'])

# Example: Find similar users
# The query user is taken from the (possibly sampled) similarity matrix so
# that the lookup below is guaranteed to find it.
sample_user_id = user_similarity_df.index[0]
print(f"\n👤 Finding users similar to User ID: {sample_user_id}")

# Get sample user's stats
# Cells equal to 0 mean "not rated", so only positive cells are counted.
sample_user_ratings = user_item_matrix.loc[sample_user_id]
sample_user_n_ratings = (sample_user_ratings > 0).sum()
sample_user_avg = sample_user_ratings[sample_user_ratings > 0].mean()

print(f"   Number of ratings: {sample_user_n_ratings}")
print(f"   Average rating: {sample_user_avg:.2f}")

similar_users = get_similar_users(sample_user_id, user_similarity_df, user_item_matrix, n_similar=10)
print(f"\n🎯 Top 10 Similar Users:")
display(similar_users)

## 5. Feature Engineering for ML Models 🛠️

Let's create intelligent features that will help our machine learning models predict ratings accurately. These features capture user behavior, movie characteristics, and interaction patterns.

In [None]:
# Feature Engineering
def _rating_stats_by(ratings_df, key, prefix, global_mean):
    """Aggregate ratings per ``key`` into ``<prefix>mean/std/count/min/max/bias``."""
    stats = ratings_df.groupby(key)['rating'].agg([
        'mean', 'std', 'count', 'min', 'max'
    ]).add_prefix(prefix)

    # Bias: how far the group's mean sits from the global mean
    stats[f'{prefix}bias'] = stats[f'{prefix}mean'] - global_mean

    # std is NaN for groups with a single rating; treat as "no spread"
    stats[f'{prefix}std'] = stats[f'{prefix}std'].fillna(0)
    return stats


def create_ml_features(ratings_df, include_target_derived=True):
    """Create per-rating features for the machine learning models.

    Args:
        ratings_df: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns (other columns are carried through untouched).
        include_target_derived: When True (default, the historical
            behaviour) also add ``user_rating_zscore``,
            ``movie_rating_zscore`` and the three ``rating_deviation_from_*``
            columns. These are computed from the row's own ``rating``, i.e.
            from the prediction target, so they MUST NOT be used as model
            inputs; pass False to leave them out.

    Returns:
        A copy of ``ratings_df`` with the feature columns appended. Row order
        is preserved.

    Note:
        User and movie aggregates are computed over all rows passed in, so
        each row's own rating contributes to its ``user_*`` / ``movie_*``
        statistics.
    """
    print("🔧 Creating ML features...")

    # Work on a copy so the caller's frame is not modified
    features_df = ratings_df.copy()

    # 1. Global Statistics
    global_mean = ratings_df['rating'].mean()
    features_df['rating_gmean'] = global_mean

    print(f"   ✅ Global mean rating: {global_mean:.3f}")

    # 2. User-based Features
    user_stats = _rating_stats_by(ratings_df, 'userId', 'user_', global_mean)
    features_df = features_df.merge(user_stats, left_on='userId', right_index=True, how='left')

    print(f"   ✅ User features: {list(user_stats.columns)}")

    # 3. Movie-based Features
    movie_stats = _rating_stats_by(ratings_df, 'movieId', 'movie_', global_mean)
    features_df = features_df.merge(movie_stats, left_on='movieId', right_index=True, how='left')

    print(f"   ✅ Movie features: {list(movie_stats.columns)}")

    # 4. Advanced Features
    user_count = features_df['user_count']
    movie_count = features_df['movie_count']

    # User activity levels. Percentiles and quantiles are taken over rating
    # rows, so a user with many ratings weighs proportionally more.
    features_df['user_activity_percentile'] = user_count.rank(pct=True)
    features_df['is_heavy_user'] = (user_count > user_count.quantile(0.8)).astype(int)
    features_df['is_light_user'] = (user_count < user_count.quantile(0.2)).astype(int)

    # Movie popularity levels (same row-weighted convention)
    features_df['movie_popularity_percentile'] = movie_count.rank(pct=True)
    features_df['is_popular_movie'] = (movie_count > movie_count.quantile(0.8)).astype(int)
    features_df['is_niche_movie'] = (movie_count < movie_count.quantile(0.2)).astype(int)

    # Rating ranges
    features_df['user_rating_range'] = features_df['user_max'] - features_df['user_min']
    features_df['movie_rating_range'] = features_df['movie_max'] - features_df['movie_min']

    if include_target_derived:
        # Z-scores of the row's own rating (target-derived). The epsilon
        # avoids division by zero when a group has no spread.
        features_df['user_rating_zscore'] = (features_df['rating'] - features_df['user_mean']) / (features_df['user_std'] + 1e-8)
        features_df['movie_rating_zscore'] = (features_df['rating'] - features_df['movie_mean']) / (features_df['movie_std'] + 1e-8)

    # Interaction features
    features_df['user_movie_bias_interaction'] = features_df['user_bias'] * features_df['movie_bias']
    features_df['user_activity_movie_popularity'] = user_count * movie_count

    if include_target_derived:
        # Deviations of the row's own rating (target-derived)
        features_df['rating_deviation_from_user_mean'] = features_df['rating'] - features_df['user_mean']
        features_df['rating_deviation_from_movie_mean'] = features_df['rating'] - features_df['movie_mean']
        features_df['rating_deviation_from_global_mean'] = features_df['rating'] - features_df['rating_gmean']

    print(f"   ✅ Advanced features created")
    print(f"   📊 Total features: {features_df.shape[1]}")

    return features_df

# Create features
features_df = create_ml_features(filtered_df)

# Display feature information
print(f"\n📋 Feature DataFrame Shape: {features_df.shape}")
print(f"\n🔍 Feature Columns:")
feature_cols = [col for col in features_df.columns if col not in ['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres']]
for i, col in enumerate(feature_cols, 1):
    print(f"   {i:2d}. {col}")

# Feature correlation analysis
print(f"\n📊 Feature Importance Analysis")

# Select only numeric features for correlation
numeric_features = features_df.select_dtypes(include=[np.number])
feature_columns = [col for col in numeric_features.columns 
                  if col not in ['userId', 'movieId', 'rating', 'timestamp']]

# Calculate correlations with target (rating)
# "Importance" here is the absolute Pearson correlation with the rating.
# Features computed from the rating itself (z-scores, deviations) will
# naturally rank at the top of this list.
correlations = {}
for col in feature_columns:
    if numeric_features[col].std() > 0:  # Avoid constant features
        corr = numeric_features[col].corr(numeric_features['rating'])
        if not np.isnan(corr):
            correlations[col] = abs(corr)

# Sort by correlation strength
sorted_correlations = sorted(correlations.items(), key=lambda x: x[1], reverse=True)

print(f"\n🎯 Top 15 Most Important Features (by correlation with rating):")
for i, (feature, correlation) in enumerate(sorted_correlations[:15], 1):
    print(f"   {i:2d}. {feature}: {correlation:.4f}")

# Visualize top features
top_features = [item[0] for item in sorted_correlations[:10]]
top_correlations = [item[1] for item in sorted_correlations[:10]]

plt.figure(figsize=(12, 8))
plt.barh(range(len(top_features)), top_correlations, color='skyblue')
plt.yticks(range(len(top_features)), top_features)
plt.xlabel('Absolute Correlation with Rating')
plt.title('Top 10 Most Important Features', fontsize=16, fontweight='bold')
# Invert so the strongest feature is drawn at the top of the chart.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Save features for ML training
features_df.to_csv(PROCESSED_DATA_PATH / "ml_features.csv", index=False)
print(f"\n💾 Features saved to {PROCESSED_DATA_PATH / 'ml_features.csv'}")

## 6. Train Machine Learning Models 🤖

Now let's train multiple machine learning models to predict movie ratings using our engineered features. We'll compare different algorithms to find the best performer.

In [None]:
# Prepare data for ML training
def prepare_ml_data(features_df, drop_target_derived=True):
    """Prepare the feature matrix and target vector for ML training.

    Args:
        features_df: Feature frame containing a numeric ``rating`` column.
        drop_target_derived: When True (default) the columns that are
            computed from the row's own rating (rating z-scores and
            ``rating_deviation_from_*``) are excluded from the features.
            Keeping them leaks the target: ``rating`` can be reconstructed
            exactly from them, so every model would score near-zero error
            without learning anything. Pass False to keep them anyway.

    Returns:
        Tuple ``(X, y, feature_cols)``: numeric features with NaN replaced
        by 0, the ``rating`` target, and the list of feature column names.
        Rows whose target is missing are removed from both ``X`` and ``y``.
    """
    # IDs, target and metadata are never features
    exclude_cols = {'userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'}

    if drop_target_derived:
        exclude_cols |= {
            'user_rating_zscore',
            'movie_rating_zscore',
            'rating_deviation_from_user_mean',
            'rating_deviation_from_movie_mean',
            'rating_deviation_from_global_mean',
        }

    # Select numeric features only
    numeric_df = features_df.select_dtypes(include=[np.number])
    feature_cols = [col for col in numeric_df.columns if col not in exclude_cols]

    # Features and target
    X = numeric_df[feature_cols].fillna(0)
    y = numeric_df['rating']

    # Remove rows with missing target
    valid_mask = y.notna()
    X = X[valid_mask]
    y = y[valid_mask]

    return X, y, feature_cols

print("🔧 Preparing data for ML training...")
# Rebinds `feature_cols`, replacing the list built in the feature analysis cell.
X, y, feature_cols = prepare_ml_data(features_df)
print(f"✅ Prepared {X.shape[0]} samples with {X.shape[1]} features")

# Split the data
# Random 80/20 split over rating rows with a fixed seed; rows of the same
# user or movie can fall on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=None
)

print(f"📊 Data split:")
print(f"   Training: {X_train.shape[0]} samples")
print(f"   Testing: {X_test.shape[0]} samples")

# Scale features for linear models
# The scaler is fit on the training split only and then applied to the test
# split, so no test statistics enter the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Calculate regression metrics.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        Dict with ``'RMSE'``, ``'MAE'`` and ``'MAPE'`` (in percent).

    Note:
        MAPE divides by ``y_true``, so it is only meaningful when no true
        value is 0.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    return {
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape
    }

# Train multiple models
# `models` maps a display name to the fitted estimator; `results` maps the
# same name to its train/test metric dicts.
print(f"\n🤖 Training ML Models...")
models = {}
results = {}

# 1. Linear Regression (trained on the standardised features)
print("   📈 Training Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

y_pred_train_lr = lr_model.predict(X_train_scaled)
y_pred_test_lr = lr_model.predict(X_test_scaled)

models['Linear Regression'] = lr_model
results['Linear Regression'] = {
    'Train': calculate_metrics(y_train, y_pred_train_lr),
    'Test': calculate_metrics(y_test, y_pred_test_lr)
}

# 2. Ridge Regression (trained on the standardised features)
print("   🏔️ Training Ridge Regression...")
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)

y_pred_train_ridge = ridge_model.predict(X_train_scaled)
y_pred_test_ridge = ridge_model.predict(X_test_scaled)

models['Ridge Regression'] = ridge_model
results['Ridge Regression'] = {
    'Train': calculate_metrics(y_train, y_pred_train_ridge),
    'Test': calculate_metrics(y_test, y_pred_test_ridge)
}

# 3. Random Forest (tree models below use the unscaled features)
print("   🌳 Training Random Forest...")
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)

models['Random Forest'] = rf_model
results['Random Forest'] = {
    'Train': calculate_metrics(y_train, y_pred_train_rf),
    'Test': calculate_metrics(y_test, y_pred_test_rf)
}

# 4. Gradient Boosting
print("   🚀 Training Gradient Boosting...")
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)
gb_model.fit(X_train, y_train)

y_pred_train_gb = gb_model.predict(X_train)
y_pred_test_gb = gb_model.predict(X_test)

models['Gradient Boosting'] = gb_model
results['Gradient Boosting'] = {
    'Train': calculate_metrics(y_train, y_pred_train_gb),
    'Test': calculate_metrics(y_test, y_pred_test_gb)
}

# 5. XGBoost (if available)
# `xgb` is unbound when the optional import failed, which raises NameError.
try:
    print("   ⚡ Training XGBoost...")
    xgb_model = xgb.XGBRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.9,
        random_state=42
    )
    xgb_model.fit(X_train, y_train)

    y_pred_train_xgb = xgb_model.predict(X_train)
    y_pred_test_xgb = xgb_model.predict(X_test)

    models['XGBoost'] = xgb_model
    results['XGBoost'] = {
        'Train': calculate_metrics(y_train, y_pred_train_xgb),
        'Test': calculate_metrics(y_test, y_pred_test_xgb)
    }
except NameError:
    print("   ⚠️ XGBoost not available")

# 6. LightGBM (if available)
# `lgb` is unbound when the optional import failed, which raises NameError.
try:
    print("   💡 Training LightGBM...")
    lgb_model = lgb.LGBMRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.9,
        random_state=42,
        verbose=-1
    )
    lgb_model.fit(X_train, y_train)

    y_pred_train_lgb = lgb_model.predict(X_train)
    y_pred_test_lgb = lgb_model.predict(X_test)

    models['LightGBM'] = lgb_model
    results['LightGBM'] = {
        'Train': calculate_metrics(y_train, y_pred_train_lgb),
        'Test': calculate_metrics(y_test, y_pred_test_lgb)
    }
except NameError:
    print("   ⚠️ LightGBM not available")

print(f"\n✅ Trained {len(models)} models successfully!")

## 7. Model Evaluation and Comparison 📊

Let's evaluate all our models using the RMSE, MAE, and MAPE metrics to identify the best performer for movie rating prediction.

In [None]:
# Create model comparison DataFrame
def create_comparison_table(results):
    """Create a comparison table of model results.

    Args:
        results: Mapping of model name -> {'Train': metrics, 'Test': metrics},
            where each metrics mapping provides 'RMSE', 'MAE' and 'MAPE'.

    Returns:
        DataFrame with one row per model, values rounded to 4 decimals and
        rows sorted by ascending 'Test_RMSE' (lower is better). An empty
        ``results`` yields an empty frame that still carries every column.
    """
    columns = [
        'Model',
        'Train_RMSE', 'Test_RMSE',
        'Train_MAE', 'Test_MAE',
        'Train_MAPE', 'Test_MAPE',
    ]
    rows = [
        {
            'Model': model_name,
            'Train_RMSE': metrics['Train']['RMSE'],
            'Test_RMSE': metrics['Test']['RMSE'],
            'Train_MAE': metrics['Train']['MAE'],
            'Test_MAE': metrics['Test']['MAE'],
            'Train_MAPE': metrics['Train']['MAPE'],
            'Test_MAPE': metrics['Test']['MAPE'],
        }
        for model_name, metrics in results.items()
    ]

    # Passing the columns explicitly keeps the frame sortable even when
    # `results` is empty (a column-less frame would make sort_values raise).
    comparison_df = pd.DataFrame(rows, columns=columns).round(4)

    # Sort by test RMSE (lower is better)
    return comparison_df.sort_values('Test_RMSE')


# Create and display comparison table
comparison_df = create_comparison_table(results)
print("🏆 Model Performance Comparison (sorted by Test RMSE)")
print("=" * 70)
display(comparison_df)

# Identify best model (first row after sorting by Test_RMSE)
best_row = comparison_df.iloc[0]
best_model_name = best_row['Model']
best_rmse = best_row['Test_RMSE']
best_mape = best_row['Test_MAPE']

print(f"\n🥇 Best Model: {best_model_name}")
print(f"   📈 Test RMSE: {best_rmse:.4f}")
print(f"   📊 Test MAPE: {best_mape:.2f}%")

# Visualize model comparison: one bar chart per test metric
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
metric_panels = [
    ('Test_RMSE', 'lightcoral', 'Test RMSE Comparison', 'RMSE'),
    ('Test_MAE', 'lightblue', 'Test MAE Comparison', 'MAE'),
    ('Test_MAPE', 'lightgreen', 'Test MAPE Comparison', 'MAPE (%)'),
]
for ax, (column, color, title, ylabel) in zip(axes, metric_panels):
    ax.bar(comparison_df['Model'], comparison_df[column], color=color, alpha=0.7)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Look the model up once; .get() avoids a KeyError if `results` contains an
# entry that has no fitted estimator in `models`.
best_model = models.get(best_model_name)

# Feature importance for best tree-based model
tree_based_models = ('Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM')
if best_model_name in tree_based_models and hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\n🎯 Top 15 Feature Importance ({best_model_name}):")
    display(feature_importance.head(15))

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(15)
    plt.barh(range(len(top_features)), top_features['importance'], color='skyblue')
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 15 Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.show()

# Prediction vs Actual scatter plot and residual analysis for best model.
# Both need `y_pred_best`, so they live under the same guard; previously the
# residual section ran unconditionally and could hit an undefined name.
if best_model_name in models:
    # Get predictions
    # NOTE(review): this list must mirror the models that were fit on scaled
    # features in the training cell (not visible here) — confirm it is complete.
    if best_model_name in ['Linear Regression', 'Ridge Regression']:
        y_pred_best = best_model.predict(X_test_scaled)
    else:
        y_pred_best = best_model.predict(X_test)

    # Create scatter plot; the dashed red diagonal marks perfect predictions
    plt.figure(figsize=(10, 8))
    plt.scatter(y_test, y_pred_best, alpha=0.5, color='blue')
    rating_range = [y_test.min(), y_test.max()]
    plt.plot(rating_range, rating_range, 'r--', lw=2)
    plt.xlabel('Actual Rating')
    plt.ylabel('Predicted Rating')
    plt.title(f'Predicted vs Actual Ratings - {best_model_name}', fontsize=16, fontweight='bold')

    # Add statistics
    plt.text(0.05, 0.95, f'RMSE: {best_rmse:.4f}\nMAPE: {best_mape:.2f}%',
             transform=plt.gca().transAxes, fontsize=12,
             verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    plt.show()

    # Residual analysis
    residuals = y_test - y_pred_best

    plt.figure(figsize=(12, 5))

    # Residual histogram
    plt.subplot(1, 2, 1)
    plt.hist(residuals, bins=50, color='lightblue', edgecolor='black', alpha=0.7)
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title('Residual Distribution', fontweight='bold')
    plt.axvline(0, color='red', linestyle='--')

    # Residual vs Predicted
    plt.subplot(1, 2, 2)
    plt.scatter(y_pred_best, residuals, alpha=0.5, color='green')
    plt.xlabel('Predicted Rating')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Predicted', fontweight='bold')
    plt.axhline(0, color='red', linestyle='--')

    plt.tight_layout()
    plt.show()

    print(f"\n📊 Residual Analysis:")
    print(f"   Mean residual: {residuals.mean():.4f}")
    print(f"   Std residual: {residuals.std():.4f}")
else:
    print(f"\n⚠️ No fitted model found for '{best_model_name}' - skipping prediction plots and residual analysis")

## 8. Generate Movie Recommendations 🎬✨

Finally, let's combine our similarity-based approach with ML predictions to create a comprehensive recommendation system that suggests movies to users.

In [None]:
# Create comprehensive recommendation system
class MovieRecommendationSystem:
    """Recommend movies via item-based CF, a trained rating model, or a blend.

    The class is self-contained: the model used by the hybrid method is taken
    from ``default_model_name`` (or a per-call override) rather than from a
    notebook-level global.

    NOTE(review): ``recommend_ml_based`` can only populate the global / user /
    movie mean-and-bias features at inference time; every other column in
    ``feature_cols`` is fed to the model as 0. If the models were trained on
    additional engineered features, predictions are made off-distribution —
    confirm this is acceptable or extend the feature construction.
    """

    # Models that must receive scaler-transformed inputs.
    # NOTE(review): should mirror the models fit on scaled features in the
    # training cell (not visible here) — confirm.
    SCALED_MODELS = ('Linear Regression', 'Ridge Regression')
    # Bounds applied to predicted ratings.
    MIN_RATING = 0.5
    MAX_RATING = 5.0
    # Used by the hybrid method when no model name is configured; matches the
    # default of ``recommend_ml_based``.
    FALLBACK_MODEL_NAME = 'XGBoost'

    def __init__(self, user_item_matrix, movie_similarity_df, models, scaler,
                 feature_cols, movies_df, features_df, default_model_name=None):
        """Store the fitted artefacts and precompute rating statistics.

        Args:
            user_item_matrix: DataFrame indexed by user id with one column per
                movie id; values > 0 are treated as ratings.
            movie_similarity_df: DataFrame of movie-movie similarities, looked
                up as ``.loc[movie_id, other_movie_id]``.
            models: Mapping of model name -> fitted estimator with ``predict``.
            scaler: Fitted transformer applied for models in ``SCALED_MODELS``.
            feature_cols: Ordered feature names the models expect.
            movies_df: DataFrame with 'movieId', 'title' and 'genres' columns.
            features_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
            default_model_name: Model used by ``get_hybrid_recommendations``
                when the call does not name one.
        """
        self.user_item_matrix = user_item_matrix
        self.movie_similarity_df = movie_similarity_df
        self.models = models
        self.scaler = scaler
        self.feature_cols = feature_cols
        self.movies_df = movies_df
        self.features_df = features_df
        self.default_model_name = default_model_name

        # Calculate statistics
        self.global_mean = features_df['rating'].mean()
        self.user_means = features_df.groupby('userId')['rating'].mean().to_dict()
        self.movie_means = features_df.groupby('movieId')['rating'].mean().to_dict()

    def get_user_rated_movies(self, user_id):
        """Return the ids of movies the user has rated ([] for unknown users)."""
        if user_id not in self.user_item_matrix.index:
            return []
        user_ratings = self.user_item_matrix.loc[user_id]
        return user_ratings[user_ratings > 0].index.tolist()

    def recommend_item_based(self, user_id, n_recommendations=10, min_similarity=0.1):
        """Generate recommendations using item-based collaborative filtering.

        Each unrated movie is scored with the similarity-weighted mean of the
        user's ratings, using only neighbours whose similarity exceeds
        ``min_similarity``. Unknown users and users without ratings get the
        most popular movies instead.

        Returns:
            Up to ``n_recommendations`` movie ids, best first.
        """
        if user_id not in self.user_item_matrix.index:
            return self.get_popular_movies(n_recommendations)

        user_ratings = self.user_item_matrix.loc[user_id]
        rated_movies = user_ratings[user_ratings > 0]

        if len(rated_movies) == 0:
            return self.get_popular_movies(n_recommendations)

        sim_df = self.movie_similarity_df
        # Only rated movies present in the similarity matrix can contribute.
        usable_ratings = rated_movies[rated_movies.index.isin(sim_df.index)]
        candidates = sim_df.index[~sim_df.index.isin(rated_movies.index)]
        if len(usable_ratings) == 0 or len(candidates) == 0:
            return []

        # One block lookup replaces a scalar .loc call per (movie, rated) pair.
        sims = sim_df.loc[candidates, usable_ratings.index].to_numpy(dtype=float)
        # Zero out weak (and NaN) similarities so they add nothing to either sum.
        sims = np.where(sims > min_similarity, sims, 0.0)
        weight_sums = sims.sum(axis=1)
        weighted_ratings = sims @ usable_ratings.to_numpy(dtype=float)

        # Movies without any sufficiently similar rated neighbour get no score.
        scored = weight_sums > 0
        scores = weighted_ratings[scored] / weight_sums[scored]

        # Sort and return top recommendations (stable sort keeps index order on ties)
        ranked = sorted(zip(candidates[scored].tolist(), scores),
                        key=lambda pair: pair[1], reverse=True)
        return [movie_id for movie_id, _ in ranked[:n_recommendations]]

    def recommend_ml_based(self, user_id, model_name='XGBoost', n_recommendations=10):
        """Generate recommendations using ML model.

        Predicts a rating for every movie in ``movies_df`` the user has not
        rated and returns the highest-scoring ones. Falls back to the most
        popular movies when ``model_name`` is not a trained model.

        Returns:
            Up to ``n_recommendations`` movie ids, best first.
        """
        if model_name not in self.models:
            return self.get_popular_movies(n_recommendations)

        model = self.models[model_name]
        # A set gives O(1) membership tests while filtering the catalogue.
        already_rated = set(self.get_user_rated_movies(user_id))
        candidates = [movie_id for movie_id in self.movies_df['movieId'].unique()
                      if movie_id not in already_rated]
        if not candidates:
            return []

        # Get user / movie statistics (global mean when unseen)
        user_mean = self.user_means.get(user_id, self.global_mean)
        movie_means = np.array(
            [self.movie_means.get(movie_id, self.global_mean) for movie_id in candidates],
            dtype=float,
        )
        user_bias = user_mean - self.global_mean
        movie_bias = movie_means - self.global_mean

        known_features = {
            'rating_gmean': self.global_mean,
            'user_mean': user_mean,
            'movie_mean': movie_means,
            'user_bias': user_bias,
            'movie_bias': movie_bias,
            'user_movie_bias_interaction': user_bias * movie_bias,
        }

        # Features this class cannot compute stay at 0 (see class NOTE).
        feature_matrix = np.zeros((len(candidates), len(self.feature_cols)))
        for col_idx, feature in enumerate(self.feature_cols):
            if feature in known_features:
                feature_matrix[:, col_idx] = known_features[feature]

        if model_name in self.SCALED_MODELS:
            feature_matrix = self.scaler.transform(feature_matrix)

        # A single batched predict call instead of one call per movie.
        predicted = np.asarray(model.predict(feature_matrix), dtype=float)
        predicted = np.clip(predicted, self.MIN_RATING, self.MAX_RATING)

        # Sort and return top recommendations (stable sort keeps catalogue order on ties)
        ranked = sorted(zip(candidates, predicted), key=lambda pair: pair[1], reverse=True)
        return [movie_id for movie_id, _ in ranked[:n_recommendations]]

    def get_popular_movies(self, n_recommendations=10):
        """Get most popular movies (by rating count in features_df) as fallback."""
        movie_popularity = self.features_df['movieId'].value_counts()
        return movie_popularity.head(n_recommendations).index.tolist()

    def get_hybrid_recommendations(self, user_id, n_recommendations=10,
                                   weights=None, model_name=None):
        """Generate hybrid recommendations combining multiple methods.

        Each method contributes a rank-based score (top item = list length,
        last item = 1) multiplied by its weight; scores are summed per movie.

        Args:
            user_id: User to recommend for.
            n_recommendations: Number of movie ids to return.
            weights: Mapping with optional 'item_based' / 'ml_based' weights;
                a method whose key is absent is skipped. Defaults to
                {'item_based': 0.4, 'ml_based': 0.6}.
            model_name: Model for the ML part; defaults to the instance's
                ``default_model_name``, then to ``FALLBACK_MODEL_NAME``.

        Returns:
            Up to ``n_recommendations`` movie ids, best first.
        """
        if weights is None:
            weights = {'item_based': 0.4, 'ml_based': 0.6}
        if model_name is None:
            model_name = self.default_model_name or self.FALLBACK_MODEL_NAME

        # Over-fetch from each method so the blend has room to re-rank.
        pool_size = n_recommendations * 2
        all_recommendations = {}

        def add_ranked(recs, weight):
            for rank, movie_id in enumerate(recs):
                score = (len(recs) - rank) * weight
                all_recommendations[movie_id] = all_recommendations.get(movie_id, 0) + score

        # Item-based recommendations
        if 'item_based' in weights:
            add_ranked(self.recommend_item_based(user_id, pool_size), weights['item_based'])

        # ML-based recommendations
        if 'ml_based' in weights:
            add_ranked(self.recommend_ml_based(user_id, model_name, pool_size), weights['ml_based'])

        # Sort by combined score
        sorted_recs = sorted(all_recommendations.items(), key=lambda x: x[1], reverse=True)
        return [movie_id for movie_id, _ in sorted_recs[:n_recommendations]]

    def format_recommendations(self, movie_ids, method_name):
        """Format recommendations with movie information.

        Ids missing from ``movies_df`` are dropped. The returned DataFrame
        always has the columns movieId, title, genres, method and avg_rating,
        even when it has no rows.
        """
        columns = ['movieId', 'title', 'genres', 'method', 'avg_rating']
        recommendations = []
        for movie_id in movie_ids:
            movie_info = self.movies_df[self.movies_df['movieId'] == movie_id]
            if len(movie_info) > 0:
                movie_data = movie_info.iloc[0]
                recommendations.append({
                    'movieId': movie_id,
                    'title': movie_data['title'],
                    'genres': movie_data['genres'],
                    'method': method_name,
                    'avg_rating': self.movie_means.get(movie_id, self.global_mean)
                })
        # Explicit columns keep column selection on the result safe when empty.
        return pd.DataFrame(recommendations, columns=columns)


# Initialize recommendation system
rec_system = MovieRecommendationSystem(
    user_item_matrix=user_item_matrix,
    movie_similarity_df=movie_similarity_df,
    models=models,
    scaler=scaler,
    feature_cols=feature_cols,
    movies_df=movies_df,
    features_df=features_df,
    default_model_name=best_model_name,
)

print("🎯 Movie Recommendation System Initialized!")

# Test recommendations for a sample user
sample_user_id = user_item_matrix.index[0]  # First user in our matrix
print(f"\n👤 Generating recommendations for User ID: {sample_user_id}")

# Get user's rating history
user_rated_movies = rec_system.get_user_rated_movies(sample_user_id)
print(f"   📊 User has rated {len(user_rated_movies)} movies")

if len(user_rated_movies) > 0:
    print(f"\n🎬 Sample of user's rated movies:")
    sample_rated = user_rated_movies[:5]
    for movie_id in sample_rated:
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        if len(movie_info) > 0:
            title = movie_info.iloc[0]['title']
            rating = user_item_matrix.loc[sample_user_id, movie_id]
            print(f"     {title}: {rating}⭐")

# Generate recommendations using different methods
print(f"\n🎯 Recommendations for User {sample_user_id}:")
print("=" * 60)

display_columns = ['title', 'genres', 'avg_rating']

# 1. Item-based Collaborative Filtering
item_recs = rec_system.recommend_item_based(sample_user_id, 5)
item_recs_df = rec_system.format_recommendations(item_recs, 'Item-based CF')
print("\n🔗 Item-based Collaborative Filtering:")
display(item_recs_df[display_columns].head())

# 2. ML-based Recommendations
ml_recs = rec_system.recommend_ml_based(sample_user_id, best_model_name, 5)
ml_recs_df = rec_system.format_recommendations(ml_recs, f'ML ({best_model_name})')
print(f"\n🤖 ML-based ({best_model_name}):")
display(ml_recs_df[display_columns].head())

# 3. Hybrid Recommendations
hybrid_recs = rec_system.get_hybrid_recommendations(sample_user_id, 5)
hybrid_recs_df = rec_system.format_recommendations(hybrid_recs, 'Hybrid')
print("\n🎭 Hybrid Recommendations:")
display(hybrid_recs_df[display_columns].head())

# Summary and conclusions
print("\n🎉 Movie Recommendation System Complete!")
print("=" * 50)
print("✅ Successfully built a comprehensive recommendation system with:")
print(f"   📊 {len(features_df)} ratings processed")
print(f"   👥 {features_df['userId'].nunique()} users")
print(f"   🎬 {features_df['movieId'].nunique()} movies")
print(f"   🔧 {len(feature_cols)} engineered features")
print(f"   🤖 {len(models)} trained ML models")
print(f"   🏆 Best model: {best_model_name} (RMSE: {best_rmse:.4f})")
print("   🎯 Hybrid recommendation system combining multiple approaches")

print("\n📈 Key Insights:")
print(f"   • {comparison_df.iloc[0]['Model']} achieved the best performance")
print("   • Feature engineering significantly improved prediction accuracy")
print("   • Hybrid approach combines strengths of different methods")
print("   • System can handle both warm-start and cold-start scenarios")