In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout

# Paths

In [2]:

RAW_DATA_DIR = "../data/raw/"
PROCESSED_DATA_DIR = "../data/processed/"
MODEL_DIR = "../models/"
MODEL_PATH = os.path.join(MODEL_DIR, "recommender_model.h5")


# Load raw datasets

In [3]:

users_file = os.path.join(RAW_DATA_DIR, "users.dat")
movies_file = os.path.join(RAW_DATA_DIR, "movies.dat")
ratings_file = os.path.join(RAW_DATA_DIR, "ratings.dat")

users = pd.read_csv(users_file, sep="::", engine="python", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
movies = pd.read_csv(movies_file, sep="::", engine="python", names=["MovieID", "Title", "Genres"], encoding="latin1")
ratings = pd.read_csv(ratings_file, sep="::", engine="python", names=["UserID", "MovieID", "Rating", "Timestamp"])

# Preprocessing

In [4]:

ratings["Timestamp"] = pd.to_datetime(ratings["Timestamp"], unit="s")
users["Gender"] = users["Gender"].map({"F": 0, "M": 1})

genres = movies["Genres"].str.get_dummies("|")
movies = pd.concat([movies.drop(columns=["Genres"]), genres], axis=1)

merged_df = ratings.merge(users, on="UserID").merge(movies, on="MovieID")
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
merged_df.to_csv(os.path.join(PROCESSED_DATA_DIR, "movielens_1m_preprocessed.csv"), index=False)
print("✅ Data preprocessed and saved!")

✅ Data preprocessed and saved!


# Load preprocessed data

In [5]:

df = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, "movielens_1m_preprocessed.csv"), low_memory=False)

In [6]:
df.columns


Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Gender', 'Age',
       'Occupation', 'Zip-code', 'Title', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western'],
      dtype='object')

# Convert IDs to indexes

In [7]:

user_ids = df["UserID"].unique().tolist()
movie_ids = df["MovieID"].unique().tolist()
user_id_to_index = {user_id: i for i, user_id in enumerate(user_ids)}
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(movie_ids)}
df["UserID"] = df["UserID"].map(user_id_to_index)
df["MovieID"] = df["MovieID"].map(movie_id_to_index)

# Train-test split

In [8]:

train, test = train_test_split(df, test_size=0.2, random_state=42)
num_users = len(user_ids)
num_movies = len(movie_ids)
embedding_size = 50

# Build Model

In [9]:

# User and movie ID inputs
input_user = Input(shape=(1,))
input_movie = Input(shape=(1,))

# Additional user features
input_gender = Input(shape=(1,))      # Binary (M/F)
input_age = Input(shape=(1,))         # Categorized age
input_occupation = Input(shape=(1,))  # Needs embedding

# Additional movie features
input_genres = Input(shape=(num_genres,))  # One-hot encoded genres

# Time feature
input_timestamp = Input(shape=(1,))


user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size)(input_user)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size)(input_movie)
occupation_embedding = Embedding(input_dim=num_occupations, output_dim=embedding_size)(input_occupation)

# Flatten embeddings
user_vec = Flatten()(user_embedding)
movie_vec = Flatten()(movie_embedding)
occupation_vec = Flatten()(occupation_embedding)
user_vec = Flatten()(user_embedding)
movie_vec = Flatten()(movie_embedding)


concat = Concatenate()([user_vec, movie_vec, input_gender, input_age, occupation_vec, input_genres, input_timestamp])

dense1 = Dense(128, activation="relu", kernel_regularizer=l2(0.001))(concat)
drop1 = Dropout(0.2)(dense1)
dense2 = Dense(64, activation="relu", kernel_regularizer=l2(0.001))(drop1)
drop2 = Dropout(0.2)(dense2)
output = Dense(1, activation="linear")(drop2)

model = keras.Model(inputs=[input_user, input_movie, input_gender, input_age, input_occupation, input_genres, input_timestamp], 
                    outputs=output)

opt = keras.optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=opt, loss="mse", metrics=["mae"])

NameError: name 'num_genres' is not defined

# Train Model

In [None]:
train_X = [train["UserID"].values, train["MovieID"].values, train["Gender"].values, 
           train["Age"].values, train["Occupation"].values, train[genre_columns].values, 
           train["Timestamp"].values]

test_X = [test["UserID"].values, test["MovieID"].values, test["Gender"].values, 
          test["Age"].values, test["Occupation"].values, test[genre_columns].values, 
          test["Timestamp"].values]

early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

model.fit(train_X, train_y, 
          epochs=20, batch_size=128, 
          validation_data=(test_X, test_y), 
          callbacks=[early_stopping])

os.makedirs(MODEL_DIR, exist_ok=True)
model.save(MODEL_PATH)
print("✅ Model trained and saved!")

# Load model

In [None]:

df = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, "movielens_1m_preprocessed.csv"), low_memory=False)
model = tf.keras.models.load_model(MODEL_PATH)

user_id_to_index = {user_id: i for i, user_id in enumerate(user_ids)}
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(movie_ids)}
index_to_movie_id = {i: movie_id for movie_id, i in movie_id_to_index.items()}
movie_id_to_title = df.set_index("MovieID")["Title"].to_dict()

# Recommendation Function

In [None]:

def get_top_n_recommendations(user_id, n=10):
    if user_id not in user_id_to_index:
        print(f"⚠️ User id {user_id} not found.")
        return []
    
    user_index = user_id_to_index[user_id]
    rated_movies = df[df["UserID"] == user_id]["MovieID"].values
    unrated_movie_ids = [m for m in movie_ids if m not in rated_movies]
    
    if not unrated_movie_ids:
        print(f"🎬 User {user_id} has rated all movies. No new recommendations.")
        return []
    
    unrated_movie_indices = np.array([movie_id_to_index[m] for m in unrated_movie_ids])
    user_indices = np.full_like(unrated_movie_indices, user_index)
    predicted_ratings = model.predict([user_indices, unrated_movie_indices]).flatten()
    
    top_n_indices = np.argsort(predicted_ratings)[-n:][::-1]
    top_n_movies = [(index_to_movie_id[unrated_movie_indices[i]], predicted_ratings[i]) for i in top_n_indices]
    
    return top_n_movies


# Get User ID

In [None]:

try:
    min_id, max_id = min(user_ids), max(user_ids)
    print(f"\n👥 Available user ids: {min_id} - {max_id}")
    user_id = int(input("Enter UserID: ").strip())
    recommendations = get_top_n_recommendations(user_id, 10)
    
    if recommendations:
        print("\n🎥 Top 10 movie recommendations:")
        for movie_id, rating in recommendations:
            movie_title = movie_id_to_title.get(movie_id, "Unknown Movie")
            print(f"⭐ {movie_title} (Predicted rating: {rating:.2f})")
except ValueError:
    print("❌ Invalid input! Enter a numeric UserID.")
