# 🎬 Simple Movie Recommender — End-to-End (Dataset → Training → Plots → Interactive Demo)

This notebook builds a tiny, self-contained movie recommendation demo:
- **Realistic synthetic dataset** (users, movies, ratings)
- **Model training** (Keras / TensorFlow) with visible progress
- **Plots**: rating distribution, age vs rating, and genre impact
- **Interactive sliders**: test new user–movie pairs and see predicted ratings

> **Requirements**: TensorFlow 2.x, pandas, numpy, matplotlib, scikit-learn, ipywidgets (for the sliders).

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

# For interactive widgets (only needed when running the sliders section)
from ipywidgets import interact, FloatSlider, IntSlider, Dropdown

# Seed the global NumPy and TensorFlow random generators with one shared value
# so that repeated runs are closer to reproducible.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Report the TensorFlow version in use for this run.
print("TensorFlow:", tf.__version__)

## 1) Generate a realistic synthetic dataset

In [None]:
# Synthetic data: each rating is driven by the user's preference for the movie's
# genre plus a little Gaussian noise, so there is a real signal to learn.

# --- Users ---
num_users = 50
users = pd.DataFrame({
    'user_id': np.arange(1, num_users + 1),
    'age': np.random.randint(18, 60, size=num_users),   # 18..59 (upper bound is exclusive)
    'gender': np.random.randint(0, 2, size=num_users),  # 0=female, 1=male
    'pref_action': np.random.rand(num_users),           # preferences are uniform in [0, 1)
    'pref_comedy': np.random.rand(num_users),
    'pref_drama': np.random.rand(num_users)
})

# --- Movies ---
num_movies = 50
movies = pd.DataFrame({
    'movie_id': np.arange(101, 101 + num_movies),
    'genre': np.random.randint(0, 3, size=num_movies),  # 0=Action, 1=Comedy, 2=Drama
    'popularity': np.random.randint(50, 100, size=num_movies),
    'year': np.random.randint(1980, 2024, size=num_movies)
})

# --- Ratings ---
ratings_per_user = 30   # each user rates this many distinct movies
noise_std = 0.2         # std-dev of the Gaussian noise added to the latent rating

# Preference column that drives the rating for each genre id.
pref_col_for_genre = {0: 'pref_action', 1: 'pref_comedy', 2: 'pref_drama'}
# movie_id -> genre, built once so the loop does not re-filter `movies` per rating.
genre_of_movie = dict(zip(movies['movie_id'].tolist(), movies['genre'].tolist()))

ratings_list = []
# itertuples keeps each column's own dtype; iterrows would upcast the whole
# mixed int/float row to float and store user_id as 1.0, 2.0, ...
for user in users.itertuples(index=False):
    movie_choices = np.random.choice(movies['movie_id'], ratings_per_user, replace=False)
    for movie_id in movie_choices:
        movie_id = int(movie_id)
        pref = getattr(user, pref_col_for_genre[genre_of_movie[movie_id]])
        # latent preference signal: 1..5 scale from the genre preference, plus noise
        raw_rating = 1 + 4 * pref + np.random.randn() * noise_std
        # Round to the nearest star. A bare int() truncates, which shifts every
        # rating down by ~0.5 on average and makes 5 stars almost unreachable.
        rating = int(np.rint(np.clip(raw_rating, 1, 5)))
        ratings_list.append([user.user_id, movie_id, rating])

ratings = pd.DataFrame(ratings_list, columns=['user_id', 'movie_id', 'rating'])

users.head(), movies.head(), ratings.head(), ratings.shape

## 2) Exploratory data analysis (EDA) plots

In [None]:
# Join ratings with user and movie attributes for plotting (do NOT scale here;
# the plots should show raw, human-readable values).
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

# Plot 1: Rating distribution
# Bin edges 1..6 with align='left' centre each bar on its integer rating.
fig, ax = plt.subplots()
ax.hist(data['rating'], bins=[1,2,3,4,5,6], align='left', rwidth=0.8)
ax.set_xticks([1,2,3,4,5])
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
ax.set_title('Rating Distribution')
plt.show()

# Plot 2: Age vs Rating (scatter; jitter to reduce overlap)
# Draw the jitter for each axis independently. Re-using one vector for both
# x and y would displace every point along the same diagonal direction.
age_jitter = np.random.normal(0, 0.1, size=len(data))
rating_jitter = np.random.normal(0, 0.1, size=len(data))
fig, ax = plt.subplots()
ax.scatter(data['age'] + age_jitter, data['rating'] + rating_jitter, alpha=0.5)
ax.set_xlabel('Age')
ax.set_ylabel('Rating')
ax.set_title('Age vs Rating (with jitter)')
plt.show()

# Plot 3: Average rating by genre
genre_map = {0: 'Action', 1: 'Comedy', 2: 'Drama'}
avg_by_genre = data.groupby('genre')['rating'].mean().rename(index=genre_map)

fig, ax = plt.subplots()
ax.bar(avg_by_genre.index.astype(str), avg_by_genre.values)
ax.set_xlabel('Genre')
ax.set_ylabel('Average Rating')
ax.set_title('Average Rating by Genre')
# Annotate each bar with its value, slightly above the bar top.
for i, v in enumerate(avg_by_genre.values):
    ax.text(i, v + 0.02, f"{v:.2f}", ha='center')
plt.show()

avg_by_genre

## 3) Preprocess & Train/Validation split

In [None]:
# Column order here defines the feature order the model is trained on; any row
# passed to `scaler` / `model` later must use the same order.
feature_cols = ['age', 'gender', 'genre', 'popularity', 'year',
                'pref_action', 'pref_comedy', 'pref_drama']
X = data[feature_cols].values
y = data['rating'].astype(float).values

# Split BEFORE fitting the scaler so that the validation rows do not influence
# the min/max statistics used to scale the training data (avoids data leakage).
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn min/max from the training rows only
X_val = scaler.transform(X_val_raw)          # re-use the training statistics

# Full matrix scaled with the training statistics; kept so the name `X_scaled`
# remains available to any code that still refers to it.
X_scaled = scaler.transform(X)

X_train.shape, X_val.shape

## 4) Build & Train the model (with progress)

In [None]:
# Small fully-connected regressor: 8 scaled features -> 64 -> 32 -> 1 rating.
# The input shape is declared with an explicit Input object; passing
# `input_shape=` to the first Dense layer is deprecated in recent Keras.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # linear output: raw predicted rating
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

EPOCHS = 300
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    verbose=1
)

# One figure per tracked quantity: (history key, legend suffix, y-axis label).
for metric_key, legend_name, y_label in [('loss', 'Loss', 'MSE Loss'),
                                         ('mae', 'MAE', 'MAE')]:
    plt.figure()
    plt.plot(history.history[metric_key], label=f'Train {legend_name}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Val {legend_name}')
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(f'Training Progress ({legend_name})')
    plt.legend()
    plt.show()

## 5) Save model and artifacts

In [None]:
# Persist the trained model under models/ (created if missing).
os.makedirs("models", exist_ok=True)
model_path = "models/movie_model_with_prefs.keras"
model.save(model_path)

# The model was trained on MinMax-scaled features, so it cannot be used for
# inference without the fitted scaler. Store the scaler's parameters next to
# the model; the transform is X_scaled = X * scale_ + min_ (per feature column).
scaler_path = "models/movie_scaler_minmax.npz"
np.savez(
    scaler_path,
    feature_cols=np.array(feature_cols),
    scale_=scaler.scale_,
    min_=scaler.min_,
    data_min_=scaler.data_min_,
    data_max_=scaler.data_max_,
)

# Raw tables are written to the current working directory.
users.to_csv("users.csv", index=False)
movies.to_csv("movies.csv", index=False)
ratings.to_csv("ratings.csv", index=False)

print("Saved:", model_path, "and CSVs (users.csv, movies.csv, ratings.csv)")
print("Saved scaler parameters:", scaler_path)

## 6) Try it: interactive prediction with sliders

In [None]:
# Maps the dropdown label to the integer genre code used in the training data.
genre_label_to_id = {'Action': 0, 'Comedy': 1, 'Drama': 2}

@interact(
    age=IntSlider(value=27, min=13, max=80, step=1, description='Age'),
    gender=Dropdown(options=[('Female (0)', 0), ('Male (1)', 1)], value=0, description='Gender'),
    genre=Dropdown(options=['Action', 'Comedy', 'Drama'], value='Comedy', description='Genre'),
    popularity=IntSlider(value=80, min=40, max=100, step=1, description='Popularity'),
    year=IntSlider(value=2015, min=1980, max=2024, step=1, description='Year'),
    pref_action=FloatSlider(value=0.2, min=0.0, max=1.0, step=0.01, description='Pref Action'),
    pref_comedy=FloatSlider(value=0.9, min=0.0, max=1.0, step=0.01, description='Pref Comedy'),
    pref_drama=FloatSlider(value=0.3, min=0.0, max=1.0, step=0.01, description='Pref Drama')
)
def predict_interactive(age, gender, genre, popularity, year, pref_action, pref_comedy, pref_drama):
    """Print the model's predicted rating for one user-movie pair.

    Parameters mirror the widgets above. `genre` is the dropdown label
    ('Action', 'Comedy' or 'Drama') and is converted to its integer code.
    The prediction is printed both clamped to the 1-5 rating scale and raw.
    A note is printed when any input lies outside the range the scaler was
    fitted on, because the model is then extrapolating.
    """
    # Insertion order matches the feature order used for training:
    # age, gender, genre, popularity, year, pref_action, pref_comedy, pref_drama.
    features = {
        'age': age,
        'gender': gender,
        'genre': genre_label_to_id[genre],
        'popularity': popularity,
        'year': year,
        'pref_action': pref_action,
        'pref_comedy': pref_comedy,
        'pref_drama': pref_drama,
    }
    row = np.array([list(features.values())], dtype=float)

    row_scaled = scaler.transform(row)
    # Call the model directly rather than model.predict(): for a single row this
    # avoids predict()'s per-call batching overhead, which keeps the sliders responsive.
    pred = float(np.asarray(model(row_scaled, training=False))[0][0])
    pred_clamped = float(np.clip(pred, 1.0, 5.0))
    print(f"Predicted rating: {pred_clamped:.2f} (raw: {pred:.3f})")

    # MinMax-scaled values outside [0, 1] mean the input is beyond what the
    # scaler saw when it was fitted (small tolerance for float round-off).
    tol = 1e-9
    extrapolated = [name for name, scaled in zip(features, row_scaled[0])
                    if scaled < -tol or scaled > 1.0 + tol]
    if extrapolated:
        print("Note: outside the training data range (extrapolating):", ", ".join(extrapolated))

### Example: single prediction (non-interactive)

In [None]:
# One hand-written user-movie pair, listed in the same column order the scaler
# was fitted on (age, gender, genre, popularity, year, pref_*).
example_pair = {
    'age': 27,
    'gender': 0,        # 0=female
    'genre': 1,         # 1=Comedy
    'popularity': 80,
    'year': 2015,
    'pref_action': 0.2,
    'pref_comedy': 0.9,
    'pref_drama': 0.3,
}
new_data = np.array([list(example_pair.values())])

# Scale with the fitted scaler, then run the model on the single row.
new_data_scaled = scaler.transform(new_data)
predicted_rating = model.predict(new_data_scaled, verbose=0)

# predict() returns a (1, 1) array; print its only element as a plain float.
print("Predicted rating for example pair:", float(predicted_rating[0][0]))