In [2]:
# Run `dvc pull` through `uv run` so DVC-tracked files are present locally.
# NOTE(review): presumably this is what provides the archive loaded further down — confirm.
!uv run dvc pull

Everything is up to date.


In [4]:
from pathlib import Path

import pandas as pd

from anime_recommender.scripts.setup import DatasetLoader, DatasetProcessor
from anime_recommender.scripts.callbacks import EventsCallback

In [5]:
# Both paths are resolved from the kernel's current working directory, so
# this cell assumes the notebook is started from its own folder.
config_file = Path.cwd().parent.parent.joinpath("config", "log-config.yaml")
archive_path = Path.cwd().parent.joinpath("data", "archive.zip")

# Configure logging from the YAML file, then reuse the callback's logger
# for every project object created below.
callback = EventsCallback()
callback.reconfigure_from_file(config_file=config_file)
log = callback.logger

# Load the two source frames from the archive and hand them to the processor.
ds_loader = DatasetLoader(archive_path=archive_path, log=log)
anime_pd, ratings_pd = ds_loader.load_pandas_data_frames()
ds_processor = DatasetProcessor(ratings_pd=ratings_pd, anime_pd=anime_pd, log=log)

# NOTE(review): `_merge` has a leading underscore (private by convention);
# prefer a public entry point if DatasetProcessor exposes one.
tables_merge = ds_processor._merge()

EventsCallback [INFO]: ===== Unpack Archive Job =====
EventsCallback [DEBUG]: Unlinking file animelist.csv
EventsCallback [DEBUG]: Unlinking file anime_with_synopsis.csv
EventsCallback [DEBUG]: Removing tree html folder
EventsCallback [DEBUG]: Unlinking file watching_status.csv
EventsCallback [INFO]: ===== Join Tables Job =====
EventsCallback [DEBUG]: Total records: 603_038


In [6]:
# Log the row count of the merged table, then show a small random preview.
log.debug(f"Records: {tables_merge.shape[0]:_}")
# Fixed seed: without it every Restart & Run All displays different rows,
# so the saved output could never be reproduced.
tables_merge.sample(3, random_state=42)

EventsCallback [DEBUG]: Records: 603_038


Unnamed: 0,rating,user_id,anime_id,name,genres
272587,8.0,160267,32281,Kimi no Na wa.,"Romance, Supernatural, School, Drama"
17336,10.0,10397,28851,Koe no Katachi,"Drama, School, Shounen"
433879,8.0,254091,32281,Kimi no Na wa.,"Romance, Supernatural, School, Drama"


In [7]:
# How many rows carry each rating value.
tables_merge["rating"].value_counts()

rating
10.0    194136
9.0     191974
8.0     133064
7.0      53712
6.0      17178
5.0       7033
4.0       3111
3.0       1336
1.0        845
2.0        649
Name: count, dtype: int64

In [8]:
def get_unique_genres(genre_series: pd.Series) -> list[str]:
    """Collect the distinct genre labels found in a Series of genre strings.

    Each non-null entry is expected to be a string of labels joined by
    ``", "`` (for example ``"Drama, School, Shounen"``).

    Args:
        genre_series: Series of genre strings. The Series may carry any
            name (or none); missing values are skipped.

    Returns:
        The distinct labels, sorted in ascending order. An empty Series
        yields an empty list.
    """
    genres: set[str] = set()

    # Iterate over the distinct strings rather than every row: the same
    # genre combination repeats for each rating of a title, so splitting
    # it once is enough. `dropna()` keeps NaN entries away from `.split`.
    for entry in genre_series.dropna().unique():
        genres.update(entry.split(", "))

    return sorted(genres)

In [10]:
# Distinct genre labels across the whole merged table, logged for reference.
unique_genres = get_unique_genres(tables_merge["genres"])
log.info(f"Unique Genres:\n{unique_genres}")

EventsCallback [INFO]: Unique Genres:
['Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy', 'Historical', 'Magic', 'Mystery', 'Parody', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shounen', 'Slice of Life', 'Supernatural', 'Vampire']


In [None]:
# Number of distinct values in each id column (users first, then titles).
# `dropna=False` keeps the count identical to `len(col.unique())`.
feature_dim = [
    tables_merge[column].nunique(dropna=False)
    for column in ("user_id", "anime_id")
]

# The logged figure is the sum of the two per-column counts.
log.info(f"feature_dim: {sum(feature_dim)}")

EventsCallback [INFO]: feature_dim: 182794
