In [1]:
# Pull the DVC-tracked data into the workspace via the uv-managed environment
# (the recorded run below added ..\data\archive.zip).
!uv run dvc pull

A       ..\data\archive.zip
1 file added


In [3]:
from pathlib import Path

import pandas as pd

from anime_recommender.scripts.setup import DatasetLoader, DatasetProcessor
from anime_recommender.scripts.callbacks import EventsCallback

In [4]:
# Paths are resolved from the current working directory: the logging config is
# looked up two levels above it (config/), the dataset archive one level above
# it (data/).
config_file = Path.cwd().parent.parent / "config" / "log-config.yaml"
archive_path = Path.cwd().parent / "data" / "archive.zip"
# Reconfigure the callback from the config file and reuse its logger below.
callback = EventsCallback()
callback.reconfigure_from_file(config_file=config_file)
log = callback.logger

# Load the two data frames (anime, ratings) from the archive and pass them to
# the processor.
ds_loader = DatasetLoader(log=log, archive_path=archive_path)
anime_pd, ratings_pd = ds_loader.load_pandas_data_frames()
ds_processor = DatasetProcessor(log=log, anime_pd=anime_pd, ratings_pd=ratings_pd)

# NOTE(review): _merge() is underscore-prefixed, i.e. private by convention.
# The logged output suggests it filters users and joins the tables — confirm,
# and prefer a public entry point on DatasetProcessor if one exists.
tables_merge = ds_processor._merge()

EventsCallback [INFO]: ===== Unpack Archive Job =====
EventsCallback [DEBUG]: Unlinking file animelist.csv
EventsCallback [DEBUG]: Unlinking file anime_with_synopsis.csv
EventsCallback [DEBUG]: Removing tree html folder
EventsCallback [DEBUG]: Unlinking file watching_status.csv
EventsCallback [INFO]: ===== Filter users Job =====
EventsCallback [INFO]: ===== Join Tables Job =====
EventsCallback [DEBUG]: Total Records: 752_850


In [5]:
log.debug(f"Records: {tables_merge.shape[0]:_}")
# Fixed seed so the three preview rows are the same on every Restart & Run All;
# an unseeded sample makes the rendered output unreproducible.
tables_merge.sample(3, random_state=42)

EventsCallback [DEBUG]: Records: 752_850


Unnamed: 0,user_id,rating,anime_id,name,genres
600923,300428,1.0,29949,Nami,Dementia
672733,327150,10.0,1825,Hadashi no Gen 2,"Drama, Historical, Shounen"
330339,176954,6.0,40733,Zhen Hun Jie: Bei Luo Shi Men Pian Part 1,"Action, Supernatural"


In [6]:
# Number of records per rating value, most frequent first.
tables_merge["rating"].value_counts()

rating
7.0     172657
6.0     135045
5.0     108034
8.0      99217
4.0      57766
10.0     49852
9.0      39938
3.0      36257
1.0      28859
2.0      25225
Name: count, dtype: int64

In [7]:
def get_unique_genres(genre_series: pd.Series) -> list[str]:
    """Return the alphabetically sorted unique genres found in ``genre_series``.

    Each entry is expected to be a string of genre names separated by ", "
    (for example "Drama, Historical, Shounen"). Missing entries are skipped.

    Args:
        genre_series: Series of ", "-separated genre strings. The Series name
            is irrelevant; the values are read directly.

    Returns:
        Sorted list of the distinct genre names; empty when the Series holds
        no non-missing entries.

    Raises:
        AttributeError: If a non-missing entry is not a string.
    """
    genres: set[str] = set()

    # Iterate over the values themselves rather than building a frame and
    # reading a ``genres`` attribute off each row tuple, which only worked
    # when the Series happened to be named "genres".
    for entry in genre_series.dropna():
        genres.update(entry.split(", "))

    return sorted(genres)

In [10]:
# Collect the distinct genres and log them one per line.
unique_genres = get_unique_genres(genre_series=tables_merge["genres"])
unique_genres_str = "\n".join(unique_genres)
log.info(f"Unique Genres:\n{unique_genres_str}")

EventsCallback [INFO]: Unique Genres:
Action
Adventure
Cars
Comedy
Dementia
Demons
Drama
Ecchi
Fantasy
Game
Harem
Hentai
Historical
Horror
Josei
Kids
Magic
Martial Arts
Mecha
Military
Music
Mystery
Parody
Police
Psychological
Romance
Samurai
School
Sci-Fi
Seinen
Shoujo
Shoujo Ai
Shounen
Shounen Ai
Slice of Life
Space
Sports
Super Power
Supernatural
Thriller
Unknown
Vampire
Yaoi
Yuri


In [11]:
# Distinct-value count of each id column, in (user_id, anime_id) order.
# dropna=False keeps a missing value counted as its own entry, matching
# len(Series.unique()).
feature_dim = [
    tables_merge[column].nunique(dropna=False)
    for column in ("user_id", "anime_id")
]

log.info(f"feature_dim: {sum(feature_dim)}")

EventsCallback [INFO]: feature_dim: 16972
