In [1]:
from schemas import MAL_Schemas

import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load datasets
def load_datasets():
    """Read the rating list and the anime metadata CSVs into polars frames.

    Both files are read from the relative ``dataset/`` directory with the
    column schemas supplied by ``MAL_Schemas``. ``ignore_errors=True`` is
    passed to ``pl.read_csv`` for both reads.

    Returns:
        A ``(animelist, anime)`` tuple of polars DataFrames, in that order.
    """
    def _read(csv_path, schema):
        # Single place for the reader options shared by both files.
        return pl.read_csv(csv_path, schema=schema, ignore_errors=True)

    return (
        _read('dataset/animelist.csv', MAL_Schemas.animelist()),
        _read('dataset/anime.csv', MAL_Schemas.anime()),
    )

# Basic Preprocessing
def preprocess_data(animelist):
    """Return only the rows of ``animelist`` whose ``rating`` is above 0.

    Args:
        animelist: polars DataFrame with a ``rating`` column.

    Returns:
        The filtered polars DataFrame; the input frame is not modified.
    """
    # Rows with a rating of 0 (if present) are dropped by this predicate.
    has_positive_rating = pl.col("rating") > 0
    return animelist.filter(has_positive_rating)

# Split data into train and test sets
def split_data(animelist):
    """Convert ``animelist`` to pandas and split it 80/20 into train and test.

    The split is delegated to ``train_test_split`` with ``test_size=0.2`` and
    a fixed ``random_state=42``.

    Args:
        animelist: frame exposing ``to_pandas()`` (a polars DataFrame here).

    Returns:
        A ``(train, test)`` tuple of pandas DataFrames.
    """
    ratings_pd = animelist.to_pandas()
    train_part, test_part = train_test_split(
        ratings_pd,
        test_size=0.2,
        random_state=42,
    )
    return train_part, test_part

In [3]:
# Create a popularity-based baseline model
def popularity_recommendation(train, anime, top_n=10, min_ratings=1):
    """Rank anime by their mean training rating and return the best ``top_n``.

    Args:
        train: pandas DataFrame with ``anime_id`` and ``rating`` columns.
        anime: anime metadata with ``MAL_ID``, ``Name`` and ``Genres`` columns.
            Either an object exposing ``to_pandas()`` (e.g. a polars
            DataFrame) or an already-converted pandas DataFrame.
        top_n: number of rows to return.
        min_ratings: minimum number of non-null ratings an anime needs in
            ``train`` to be eligible. The default of 1 keeps every anime that
            has at least one rating; raise it to stop titles with only a
            handful of high scores from dominating the ranking.

    Returns:
        pandas DataFrame with columns ``Name``, ``mean_rating`` and
        ``Genres``, sorted by ``mean_rating`` descending. Ties keep ascending
        ``anime_id`` order, so the result is deterministic.
    """
    # Accept both polars and pandas metadata frames.
    anime_meta = anime.to_pandas() if hasattr(anime, "to_pandas") else anime

    # Mean rating and number of ratings per anime; groupby sorts by anime_id.
    rating_stats = (
        train.groupby("anime_id")["rating"]
        .agg(mean_rating="mean", num_ratings="count")
        .reset_index()
    )

    # Apply the support threshold, then drop the helper count column so it
    # cannot collide with a metadata column during the merge.
    eligible = rating_stats.loc[
        rating_stats["num_ratings"] >= min_ratings,
        ["anime_id", "mean_rating"],
    ]

    # Merge with anime metadata for better readability. A stable sort keeps
    # the anime_id order among equal means instead of an arbitrary one.
    ranked = eligible.merge(
        anime_meta,
        left_on="anime_id",
        right_on="MAL_ID",
        how="left",
    ).sort_values(by="mean_rating", ascending=False, kind="mergesort")

    # Select top N recommendations
    return ranked[["Name", "mean_rating", "Genres"]].head(top_n)

In [None]:
# Load the raw frames, drop rows without a positive rating, then split.
animelist, anime = load_datasets()
animelist = preprocess_data(animelist)
train, test = split_data(animelist)

# Generate Top-N Recommendations
top_recommendations = popularity_recommendation(train, anime)

# Header and table are emitted on separate lines by a single print call.
print("Top Recommendations:", top_recommendations, sep="\n")