In [None]:
!pip install boto3
!pip install s3fs

# Section 1 — Data Loading

In this section, I load the MovieLens 20M dataset from AWS S3.
The `rating.csv` and `movie.csv` files are read into pandas DataFrames for preprocessing.

Key steps:
- Read CSV files into pandas
- Inspect columns and sample rows
- Verify data integrity


In [None]:
# AWS SDK for Python; enables direct API interaction with S3
import boto3
import s3fs
import pandas as pd

In [None]:
# Credentials and region are deliberately NOT written into the notebook.
# boto3 resolves them through its default provider chain: the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_DEFAULT_REGION environment
# variables, the shared ~/.aws/credentials and ~/.aws/config files, or an
# attached IAM role.
s3 = boto3.client("s3")

In [None]:
# Fetch the ratings file from S3; the response's "Body" is passed straight to
# pandas as a file-like object.
obj = s3.get_object(
    Bucket="movielens-rec-bucket",
    Key="raw/rating.csv"   # key must match the object name in the bucket exactly
)

ratings = pd.read_csv(obj["Body"])

# Same pattern for the movie metadata; `obj` is reused as a scratch variable.
obj = s3.get_object(
    Bucket="movielens-rec-bucket",
    Key="raw/movie.csv"
)

movies = pd.read_csv(obj["Body"])

# Quick visual check of the first ratings rows.
ratings.head()

## Section 2 — Preprocessing

This section prepares the dataset for training the ALS recommender.

Key steps:
- Convert explicit ratings to implicit feedback (rating >= 4 → interaction = 1)
- Encode `userId` and `movieId` into contiguous integer indices
- Build sparse user-item matrix (CSR) for efficient ALS training
- Split data into training and test sets using temporal split


In [None]:
#preprocessing data
from scipy.sparse import csr_matrix

def convert_to_implicit(ratings, threshold: float = 4.0):
  '''
  Convert explicit ratings to implicit (binary) feedback.

  Rows whose `rating` is >= `threshold` are kept and tagged with
  `interaction = 1`; every other row is dropped.

  parameters:
  - ratings: DataFrame with a numeric `rating` column (left unmodified)
  - threshold: float, minimum rating counted as a positive interaction

  returns:
  - new DataFrame holding only the positive rows plus an `interaction` column
  '''

  # Filter first, then copy: the caller's frame is never touched and the
  # result is an independent frame, so later column assignments are safe.
  positive = ratings[ratings['rating'] >= threshold].copy()
  positive['interaction'] = 1
  return positive

In [None]:
def encoding(df, user_map=None, item_map=None):
  '''
  Encode `userId` and `movieId` as contiguous integer indices.

  parameters:
  - df: DataFrame with `userId` and `movieId` columns (left unmodified)
  - user_map: optional dict userId -> index; built from `df` when None
  - item_map: optional dict movieId -> index; built from `df` when None

  returns:
  - encoded: new DataFrame with integer `user_idx` / `item_idx` columns;
    rows whose user or item is missing from the maps are removed
  - user_map, item_map: the mappings that were used
  '''
  if user_map is None:
    user_map = {user: idx for idx, user in enumerate(df['userId'].unique())}
  if item_map is None:
    item_map = {item: idx for idx, item in enumerate(df['movieId'].unique())}

  # assign() returns a new frame, so the caller's DataFrame is never mutated.
  encoded = df.assign(
      user_idx=df['userId'].map(user_map),
      item_idx=df['movieId'].map(item_map),
  )

  # Unmapped ids come back as NaN; drop them so every index is valid for the
  # sparse matrix, then restore the integer dtype lost to the NaNs.
  encoded = encoded.dropna(subset=['user_idx', 'item_idx'])
  encoded = encoded.astype({'user_idx': int, 'item_idx': int})

  return encoded, user_map, item_map

In [None]:
def build_sparse_matrix(ratings, n_users: int, n_items:int) -> csr_matrix:
  '''
  Assemble the user x item interaction matrix used to train implicit ALS.

  parameters:
  - ratings: DataFrame with `interaction`, `user_idx` and `item_idx` columns
  - n_users: int, number of rows of the matrix
  - n_items: int, number of columns of the matrix

  returns:
  - csr_matrix of shape (n_users, n_items)
  '''
  values = ratings['interaction']
  row_ids = ratings['user_idx']
  col_ids = ratings['item_idx']

  return csr_matrix((values, (row_ids, col_ids)), shape=(n_users, n_items))

In [None]:
# Smoke test: run the three preprocessing helpers end to end.
# The mappings here are built from the full implicit dataset (no train/test split yet).
ratings_imp = convert_to_implicit(ratings) # keep only positive (rating >= 4.0) interactions
ratings_encode_test, test_userId_map, test_itemId_map = encoding(ratings_imp)
print(ratings_encode_test.head())

# Matrix dimensions come from the sizes of the maps built just above
sparse_matrix = build_sparse_matrix(ratings_encode_test, len(test_userId_map), len(test_itemId_map))
print(sparse_matrix.shape)

## Section 3 — Training

Here I trained the Implicit ALS model using the preprocessed sparse matrix.

Key steps:
- Initialize ALS model with chosen hyperparameters (factors, regularization, iterations)
- Train the model on the training sparse matrix
- Save model metrics (optional) for evaluation
- Model is now ready to generate recommendations


In [None]:
!pip install implicit

In [None]:
import implicit
import pickle

In [None]:
def train_als(sparse_matrix, factors = 50, regularization = 0.1, iterations = 20, random_state = None):
  '''
  Train an implicit ALS model on the user-item sparse matrix.

  parameters:
  - sparse_matrix: csr_matrix, user-item interaction matrix (users as rows,
    items as columns)
  - factors: int, number of latent factors
  - regularization: float, regularization parameter
  - iterations: int, number of ALS iterations
  - random_state: optional seed forwarded to the model so the random factor
    initialisation (and therefore the fit) can be reproduced; None keeps the
    library default

  returns:
  - model: fitted implicit ALS model
  '''

  model = implicit.als.AlternatingLeastSquares(
      factors=factors,
      regularization=regularization,
      calculate_training_loss=True,
      iterations=iterations,
      random_state=random_state
  )

  # fit() expects a user x item matrix, which is the orientation
  # build_sparse_matrix produces, so the matrix is passed through unchanged.
  model.fit(sparse_matrix)

  return model

In [None]:
def save_model_to_s3(model, filename, bucket_name):
  '''
  Pickle the trained ALS model to a local file and upload it to S3.

  parameters:
  - model: object to pickle
  - filename: local path the pickle is written to
  - bucket_name: destination S3 bucket

  The object is stored under `models/<base name of filename>`, so a local
  path such as `/tmp/als_model.pkl` does not leak its directories into the key.
  '''
  import os  # local import: only needed to derive the S3 key

  with open(filename, "wb") as f:
    pickle.dump(model, f)

  # Credentials are resolved by boto3's default provider chain.
  s3 = boto3.client("s3")
  s3.upload_file(filename, bucket_name, f"models/{os.path.basename(filename)}")

In [None]:
# Train ALS on `sparse_matrix`, i.e. on every positive interaction (no held-out data).
# NOTE: `als_model` is reassigned further down by the fit on the training split only.
als_model = train_als(sparse_matrix, factors=100, iterations=20, regularization=0.05)

# Generate recommendations for user 0 (internal index, not the raw userId)
user_idx = 0
recommended = als_model.recommend(user_idx, sparse_matrix[user_idx], N=10)

# Save model to S3 (left disabled; uncomment to upload)
#save_model_to_s3(als_model, "als_model.pkl", bucket_name="movielens-rec-bucket")

In [None]:
def train_test_split_by_time(ratings, test_ratio=0.2):
  '''
  Split interactions into train/test sets per user, by timestamp.

  For each user the most recent `test_ratio` share of interactions goes to
  the test set and the rest to the train set.

  parameters:
  - ratings: DataFrame with `userId` and `timestamp` columns
  - test_ratio: float, fraction of each user's interactions held out

  returns:
  - train, test: DataFrames with the same columns as `ratings`
  '''
  ratings = ratings.sort_values("timestamp")

  train_rows = []
  test_rows = []

  for _, user_data in ratings.groupby("userId"):
    n_test = int(len(user_data) * test_ratio)
    if n_test == 0:
      # Too few interactions to hold any out: keep them all for training
      # instead of silently discarding the user.
      train_rows.append(user_data)
      continue

    train_rows.append(user_data.iloc[:-n_test])
    test_rows.append(user_data.iloc[-n_test:])

  # pd.concat([]) raises, so fall back to an empty frame with the same columns.
  empty = ratings.iloc[0:0]
  train = pd.concat(train_rows) if train_rows else empty
  test = pd.concat(test_rows) if test_rows else empty

  return train, test


In [None]:
# Keep only positive interactions (rating >= 4.0 by default)
ratings_imp = convert_to_implicit(ratings)

# Create global mappings from the entire implicit dataset so train and test share one index space
# Using .copy() to avoid SettingWithCopyWarning if ratings_imp is a view
# (`ratings_encoded_full` itself is not referenced again in the cells below; only the maps are)
ratings_encoded_full, global_userId_map, global_itemId_map = encoding(ratings_imp.copy())

# Per-user temporal split: the latest interactions of each user go to the test set
train_ratings, test_ratings = train_test_split_by_time(ratings_imp)

# Encode train_ratings using the global mappings
train_ratings_enc, _, _ = encoding(train_ratings.copy(), global_userId_map, global_itemId_map)

# Encode test_ratings using the global mappings
test_ratings_enc, _, _ = encoding(test_ratings.copy(), global_userId_map, global_itemId_map)

# Matrix is sized by the global maps, so its rows/columns line up with the global indices
train_matrix = build_sparse_matrix(train_ratings_enc, len(global_userId_map), len(global_itemId_map))

In [None]:
# Train the ALS model on the training split only, so the held-out interactions stay unseen.
# This replaces the earlier `als_model` that was fit on all interactions.
als_model = train_als(train_matrix, factors=100, iterations=20, regularization=0.05)

## Section 4 — Evaluation

I evaluated the model’s ranking quality and characteristics.

Key steps:
- Compute Recall@K and MAP@K to measure hit rate and ranking accuracy
- Analyze genre diversity in top-K recommendations
- Assess popularity bias (does the model over-recommend blockbuster movies?)
- Discuss cold-start limitations for users/items not seen during training


In [None]:
def recall_at_k(model, train_matrix, test_ratings, userId_map, itemId_map, K=10):
  '''
  Mean Recall@K over the users present in `test_ratings`.

  parameters:
  - model: fitted model exposing `recommend(userid=, user_items=, N=)` that
    returns an (item indices, scores) pair
  - train_matrix: user x item matrix indexed by internal user index
  - test_ratings: DataFrame of held-out rows with `userId` and `movieId`
  - userId_map: dict raw userId -> internal user index
  - itemId_map: dict raw movieId -> internal item index
  - K: int, number of recommendations per user

  returns:
  - float, average over users of |recommended & held-out| / |held-out|;
    0.0 when no user could be evaluated
  '''
  idx2item = {idx: movie_id for movie_id, idx in itemId_map.items()}
  recalls = []

  for user_id, user_data in test_ratings.groupby("userId"):
    if user_id not in userId_map:
      continue  # skip unknown users

    internal_uid = userId_map[user_id]  # ALS internal index

    # Pass only the current user's interaction vector
    rec_idx, _ = model.recommend(
      userid=internal_uid,
      user_items=train_matrix[internal_uid],
      N=K
    )
    rec_items = {idx2item[i] for i in rec_idx if i in idx2item}
    true_items = set(user_data["movieId"])
    recalls.append(len(rec_items & true_items) / len(true_items))

  # No evaluable user (empty test set or only unknown users): avoid dividing by zero.
  if not recalls:
    return 0.0

  return sum(recalls) / len(recalls)

In [None]:
# Recall@10 on the held-out interactions; the raw-id maps are the global ones
# that were used to build `train_matrix`.
recall_10 = recall_at_k(
    model=als_model,
    train_matrix=train_matrix,
    test_ratings=test_ratings_enc,
    userId_map=global_userId_map,
    itemId_map=global_itemId_map,
    K=10,
)

print("Recall@10:", recall_10)

In [None]:
# Use MAP@K to check whether the model ranks relevant items near the top of the list (positions 1-2)
import numpy as np

def average_precision_at_k(recommended, relevant, k=10):
  '''
  Average precision of one ranked recommendation list, cut off at k.

  recommended: ranked sequence of recommended item indices (best first)
  relevant: set/list of true item indices

  Returns 0.0 when `relevant` is empty; otherwise the sum of precision@rank
  over the ranks that hit a relevant item, divided by min(len(relevant), k).
  '''
  if not len(relevant):
    return 0.0

  cutoff = min(k, len(recommended))
  precisions = []
  n_hits = 0.0

  # Ranks are 1-based so precision at a hit is simply hits / rank.
  for rank in range(1, cutoff + 1):
    if recommended[rank - 1] in relevant:
      n_hits += 1
      precisions.append(n_hits / rank)

  return sum(precisions, 0.0) / min(len(relevant), k)



In [None]:
def map_at_k(model, train_matrix, test_dict, k = 10):
  '''
  Mean Average Precision@k over all users in `test_dict`.

  model: fitted model whose `recommend` returns an (item indices, scores) pair
  train_matrix: user x item matrix indexed by internal user index
  test_dict: dict internal user index -> iterable of held-out item indices
  '''
  def _user_ap(uid, held_out):
    # Recommendations are requested from the user's own training row only.
    top_items, _scores = model.recommend(
        userid=uid,
        user_items=train_matrix[uid],
        N=k)
    return average_precision_at_k(top_items, set(held_out), k)

  return np.mean([_user_ap(uid, held_out) for uid, held_out in test_dict.items()])

In [None]:
def build_test_dict(test_df):
  '''
  Group held-out interactions per user.

  test_df: DataFrame with `user_idx` and `item_idx` columns
  returns: dict user_idx -> list of that user's item_idx values
  '''
  items_by_user = test_df.groupby("user_idx")["item_idx"]
  item_lists = items_by_user.apply(list)
  return item_lists.to_dict()

# Held-out items per internal user index, taken from the encoded test split
test_dict = build_test_dict(test_ratings_enc)

In [None]:
# MAP@10 of the train-only model against the held-out items per user
map10 = map_at_k(
    model=als_model,
    train_matrix=train_matrix,
    test_dict=test_dict,
    k=10,
)
print("MAP@10:", map10)

Diversity Analysis:
Are my recommendations varied, or are they all similar movies?

In [None]:
# Prepare genre data: split the pipe-delimited `genres` string into one list per movie
movies["genre_list"] = [genre_string.split("|") for genre_string in movies["genres"]]

In [None]:
# Inverse of the global item map: internal item index -> raw movieId
reverse_item_map = dict(zip(global_itemId_map.values(), global_itemId_map.keys()))

In [None]:
# ~0.4-0.6 -> balanced, <0.3 -> repetitive, 0.7 -> very diverse
import random

def genre_diversity_sample(
    model,
    train_matrix,
    movies,
    reverse_item_map,
    k=10,
    sample_size=1000,
    seed=None
):
    '''
    Average genre diversity of top-k recommendations over a random user sample.

    For each sampled user the score is (distinct genres) / (total genre tags)
    across the recommended movies; the function returns the mean score.

    parameters:
    - model: fitted model whose `recommend` returns an (item indices, scores) pair
    - train_matrix: user x item matrix; row count defines the user population
    - movies: DataFrame with `movieId` and `genre_list` columns
    - reverse_item_map: dict internal item index -> raw movieId
    - k: int, recommendations per user
    - sample_size: int, maximum number of users to sample
    - seed: optional int; when given, the user sample is reproducible.
      When None the module-level `random` state is used, as before.

    returns:
    - float mean diversity, or NaN if no sampled user produced any genre
    '''
    rng = random if seed is None else random.Random(seed)

    n_users = train_matrix.shape[0]
    user_indices = rng.sample(range(n_users), min(sample_size, n_users))

    # Build the movieId -> genres lookup once instead of scanning the whole
    # DataFrame for every recommended item. setdefault keeps the first row
    # for a duplicated movieId, matching a first-match DataFrame lookup.
    genre_lookup = {}
    for movie_id, genre_list in zip(movies["movieId"], movies["genre_list"]):
        genre_lookup.setdefault(movie_id, genre_list)

    diversities = []

    for user_idx in user_indices:
        recommended, _ = model.recommend(
            userid=user_idx,
            user_items=train_matrix[user_idx],
            N=k
        )

        genres = []
        for item_idx in recommended:
            movie_id = reverse_item_map[item_idx]
            # Movies without metadata contribute no genres.
            genres.extend(genre_lookup.get(movie_id, ()))

        if len(genres) > 0:
            diversities.append(len(set(genres)) / len(genres))

    if not diversities:
        return float("nan")

    return np.mean(diversities)



In [None]:
# Genre diversity of the top-10 (default k) recommendations over 1000 sampled users
div_score = genre_diversity_sample(
    model=als_model,
    train_matrix=train_matrix,
    movies=movies,
    reverse_item_map=reverse_item_map,
    sample_size=1000,
)

print("Average Genre Diversity@10:", div_score)


Popularity Bias Analysis:
Is the model just recommending blockbuster movies?

In [None]:
# Popularity = number of positive interactions per movieId in the training split
movie_popularity = train_ratings.groupby("movieId").size()

In [None]:
# Baseline: mean number of training interactions per movie
print(movie_popularity.mean())

In [None]:
def recommendation_popularity(model, train_matrix, reverse_item_map, movie_popularity, k=10):
  '''
  Mean popularity of the items recommended to every user.

  model: fitted model whose `recommend` returns an (item indices, scores) pair
  train_matrix: user x item matrix; one recommendation list is drawn per row
  reverse_item_map: dict internal item index -> raw movieId
  movie_popularity: Series indexed by movieId holding interaction counts
  k: int, recommendations per user

  Recommended movies absent from `movie_popularity` are ignored.
  '''
  counts = []
  n_users = train_matrix.shape[0]

  for uid in range(n_users):
    top_items, _scores = model.recommend(uid, train_matrix[uid], N=k)
    movie_ids = (reverse_item_map[item] for item in top_items)
    # `in` on a Series tests the index, i.e. whether the movieId has a count.
    counts.extend(
        movie_popularity[movie_id]
        for movie_id in movie_ids
        if movie_id in movie_popularity
    )

  return np.mean(counts)

In [None]:
# Average training-set popularity of everything the model recommends (default k=10)
rec_popularity = recommendation_popularity(
    model=als_model,
    train_matrix=train_matrix,
    reverse_item_map=reverse_item_map,
    movie_popularity=movie_popularity,
)

print("Avg popularity in recommendations:", rec_popularity)
# If this is well above the dataset average printed earlier -> strong popularity bias

In [None]:
#Demo
def recommend_for_user(
    user_id,
    user_map,
    model,
    train_matrix,
    reverse_item_map,
    movies,
    k=10
):
    '''
    Return the top-k recommended movies for a raw `user_id`, best first.

    parameters:
    - user_id: raw userId
    - user_map: dict raw userId -> internal user index
    - model: fitted model whose `recommend` returns an (item indices, scores) pair
    - train_matrix: user x item matrix indexed by internal user index
    - reverse_item_map: dict internal item index -> raw movieId
    - movies: DataFrame with `movieId`, `title` and `genres` columns
    - k: int, number of recommendations

    returns:
    - DataFrame with `title` and `genres`, ordered by recommendation rank, or
      a message string when `user_id` is not in `user_map`
    '''

    if user_id not in user_map:
        return "Cold-start user. Use popularity fallback."

    user_idx = user_map[user_id]
    user_items = train_matrix[user_idx]

    recommended, _ = model.recommend(
        user_idx,
        user_items,
        N=k
    )

    movie_ids = [reverse_item_map[i] for i in recommended]
    rank_of = {movie_id: rank for rank, movie_id in enumerate(movie_ids)}

    # isin() returns rows in `movies` order, which would discard the ranking,
    # so the matched rows are re-sorted by their position in the recommendations.
    matched = movies[movies["movieId"].isin(movie_ids)]
    order = matched["movieId"].map(rank_of).to_numpy().argsort(kind="stable")

    return matched.iloc[order][["title", "genres"]]


In [None]:
# Demo: recommendations for raw userId 1
recommend_for_user(
    user_id=1,
    user_map=global_userId_map,
    model=als_model,
    train_matrix=train_matrix,
    reverse_item_map=reverse_item_map,
    movies=movies,
)


## Section 5 — Model Saving

After training and evaluation, I saved all artifacts needed for deployment.

Key steps:
- Save trained ALS model (`als_model.pkl`)
- Save user and item mappings (`user_map.pkl` and `reverse_item_map.pkl`)
- Save sparse training matrix (`train_matrix.npz`) for inference
- Upload all files to AWS S3 to be used by the EC2 inference API

This ensures reproducibility and allows deployment without retraining.
