# MyAnimeStats

Explore insightful statistics about a [MyAnimeList](https://myanimelist.net/) user's anime list.

### List Import

In [None]:
import httpx
from pathlib import Path
from rich import print
from src.jupyter_utils import *
from src.user_list import UserList
from src.actions import get_user_animes
from src.franchises import get_user_franchises

# Input data locations; everything lives under ./data.
data = Path('data')
user_list_xml_path = data / 'animelist.xml.gz'
# anime_db_path = data / 'anime_db.parquet'
anime_db_path = data / 'anime_db.franchise.parquet'
# manga_db_path = data / 'manga_db.parquet'
character_db_path = data / 'character_db.parquet'
people_db_path = data / 'people_db.parquet'

# Cache files read and written by get_user_name() / get_user_list().
cache = data / 'cache'
# parents=True: with a missing data/ directory, execution still reaches the
# anime_db check below instead of stopping here on an unrelated mkdir error.
cache.mkdir(parents=True, exist_ok=True)
user_name_cache = cache / 'user_name.txt'
user_list_cache = cache / 'user_list.parquet'

# The anime database is the one input that cannot be recreated by this cell.
if not anime_db_path.exists():
	raise FileNotFoundError(f"Missing anime_db {anime_db_path}")

def get_user_name():
	"""Return the MAL user name from the cache file or an interactive prompt.

	A non-blank cached value is preferred. Otherwise the user is prompted and
	a non-blank answer is written to ``user_name_cache`` for later runs.

	Returns:
		The user name with surrounding whitespace removed, or None when the
		prompt is answered with a blank line.
	"""
	if user_name_cache.exists():
		cached = user_name_cache.read_text().strip()
		# An empty cache file is treated like a missing one so the prompt below
		# still gets a chance to supply a name.
		if cached:
			print(f"[green]Loading user_name from cache {user_name_cache}[/green]")
			return cached
	print(f"[red]Failed to load user_name from cache {user_name_cache}[/red]")

	# Strip so a whitespace-only answer counts as empty, and so the value used
	# in this run matches what a later cached read returns.
	user = input('Enter your MAL user_name: ').strip()
	if user:
		print(f"[green]Got user_name from user input, saving to cache {user_name_cache}[/green]")
		user_name_cache.write_text(user)
		return user
	print("[red]Failed to get user_name from user input[/red]")

	return None

async def get_user_list():
	"""Load the user's list from the first source that is available.

	Sources are tried in this order: the parquet cache, the MAL XML export,
	then scraping MAL for the name returned by get_user_name(). Only a scraped
	list is written back to the parquet cache.

	Raises:
		Exception: when none of the three sources yields a list.
	"""
	# 1) Parquet cache from a previous run.
	if user_list_cache.exists():
		print(f"[green]Loading user_list from cache {user_list_cache}[/green]")
		return pl.read_parquet(user_list_cache)
	print(f"[red]Failed to load user_list from cache {user_list_cache}[/red]")

	# 2) MAL XML export; the result is intentionally not written to the cache
	#    (the write_parquet call for this path is disabled).
	if user_list_xml_path.exists():
		print(f"[green]Loading user_list from MAL export {user_list_xml_path}[/green]")
		return UserList.from_xml(user_list_xml_path)
	print(f"[red]Failed to load user_list from MAL export {user_list_xml_path}[/red]")

	# 3) Scrape MAL, which needs a user name.
	user_name = get_user_name()
	if not user_name:
		print("[red]Failed to get user_name to scrape user_list[/red]")
		raise Exception(f"Failed to get user_list, need {user_list_xml_path} or user_name")

	print(f"[green]Scraping user_list from MAL user_name {user_name}[/green]")
	async with httpx.AsyncClient() as http_client:
		scraped = await UserList.from_user_name(http_client, user_name)
		scraped.write_parquet(user_list_cache)
	return scraped

# Resolve the user's list via get_user_list(); the top-level await is
# notebook-cell syntax and is not valid in a plain Python module.
user_list = await get_user_list()

# Build user_animes from the user list and the anime database at anime_db_path.
user_animes = get_user_animes(user_list, anime_db_path)
describe(user_animes, "User Animes")

# Derive user_franchises from user_animes; .height is the number of rows.
user_franchises = get_user_franchises(user_animes)
print(f"[green]Found {user_franchises.height} franchises[/green]")
describe(user_franchises, "User Franchises")

### List Analysis

In [None]:
import tzlocal
from src.actions import get_stats

# Get local timezone
# (its name is passed to get_stats below)
local_tz = tzlocal.get_localzone_name()
print(f"[green]Detected local timezone: {local_tz}[/green]")

# stats is iterated below as (name, value) pairs.
stats = get_stats(user_animes, user_franchises, local_tz)

# Print every stat with its name as the title argument of print_df.
for name, stat in stats.items():
	print_df(stat, name)

In [None]:
import hvplot.polars, hvplot.pandas
import holoviews as hv
# Use the 'dark_minimal' theme for the bokeh renderer.
hv.renderer('bokeh').theme = 'dark_minimal'

# The two plots are combined with `+` and displayed together.
display(
	# KDE of the 'scored_avg' and 'user_scored' columns
	# NOTE(review): the title says 'Franchises' but the data is user_animes — confirm which is intended.
	user_animes.hvplot.kde(y=['scored_avg', 'user_scored'], alpha=0.5, title='Franchises Score Distribution', legend='top_right', height=500, width=800, xlim=(0, 10)) +
	# Scatter of 'user_scored' against 'air_start', with 'title_english' shown on hover
	user_animes.hvplot.scatter(x='air_start', y='user_scored', title='User Score Distribution by Air Year', height=500, width=800, hover_cols=['title_english'])
)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def score_box_plot(key: str, threshold: int = 8):
	"""Box plot of the user's scores grouped by the values of a list column.

	Args:
		key: Name of a column of ``user_animes`` that is exploded to one row
			per value before grouping (e.g. 'genres').
		threshold: Minimum number of scored entries a value needs in order to
			be kept. Defaults to 8, the previously hard-coded value.

	Returns:
		The hvplot box plot built from rows sorted by descending median score.
	"""
	box_data = user_animes.filter(
		# Only animes the user has scored contribute to the distribution.
		pl.col('user_scored').is_not_null()
	).select("user_scored", key).explode(key).group_by(key).all().filter(
		# After group_by(...).all(), 'user_scored' is a list per key value.
		pl.col(key).is_not_null() & (pl.col('user_scored').list.len() >= threshold)
	).cast({
		# Removes filtered keys from the plot
		key: pl.String
	}).with_columns(
		median_score=pl.col('user_scored').list.median()
	).explode('user_scored').sort('median_score', key, descending=True)

	return box_data.hvplot.box(
		y='user_scored', by=key, title=f'User Score Distribution by {key.capitalize()}',
		height=500, width=1200, rot=45, legend='top_right'
	)

# Show one score box plot per column that score_box_plot explodes and groups by.
display(
	score_box_plot('genres'),
	score_box_plot('themes'),
	score_box_plot('studios'),
	score_box_plot('demographics')
)

In [None]:
# Scale scores to remove the bias in how MAL users score and in how this user scores
def scale_scores(col: str):
	"Build an expression mapping a column to 1 - (descending rank - 1) / (count - 1)."
	column = pl.col(col)
	# Zero-based position when ordered from highest to lowest value.
	zero_based_rank = column.rank(descending=True) - 1
	# Largest possible zero-based position.
	last_position = column.count() - 1
	return pl.lit(1) - zero_based_rank / last_position

# Keep animes that have both a MAL average and a user score, rank-scale both
# scores, then order rows by how far the two scaled scores are apart.
unpopular_data = (
	user_animes
	.filter(pl.col('scored_avg').is_not_null() & pl.col('user_scored').is_not_null())
	.with_columns(
		scale_scores('user_scored').alias('user_scored_scaled'),
		scale_scores('scored_avg').alias('scored_avg_scaled'),
	)
	.with_columns(
		# Positive when the user's scaled score is above the MAL scaled score.
		(pl.col('user_scored_scaled') - pl.col('scored_avg_scaled')).alias('score_difference'),
	)
	.with_columns(
		pl.col('score_difference').abs().alias('score_difference_abs'),
	)
	.sort("score_difference_abs", descending=True)
)

# First ten rows, i.e. the largest absolute differences.
print_df(
	unpopular_data.select("title_english", "score_difference", "scored_avg", "user_scored").head(10),
	"Most unpopular opinions",
)

In [None]:
# Summary metric: 1 minus twice the mean of the 'score_difference_abs' column
# of unpopular_data, printed as a percentage.
# NOTE(review): if unpopular_data has no rows, mean() presumably returns None
# and the multiplication fails — confirm.
normie_ness = 1 - (unpopular_data.get_column("score_difference_abs").mean() * 2)
print(f"[green]Normie-ness: {normie_ness:.2%}[/green]")

In [None]:
# Bucket each anime by the absolute scaled-score difference:
# <= 0.05 'green', <= 0.15 'orange', otherwise 'red'.
unpopular_data_colored = unpopular_data.with_columns(
	color = pl.when(pl.col('score_difference_abs') <= 0.05)
		.then(pl.lit('green'))
		.when(pl.col('score_difference_abs') <= 0.15)
		.then(pl.lit('orange'))
		.otherwise(pl.lit('red'))
)

display(
	unpopular_data_colored.hvplot.scatter(
		x='scored_avg_scaled', y='user_scored_scaled', c='color', title='User Score vs MyAnimeList Score',
		height=500, width=1200, grid=True,
		hover_cols=['title_english']
	)
	# Reference diagonal where both scaled scores agree. hv.Curve's second and
	# third positional arguments are kdims/vdims, so the label and colour are
	# passed explicitly instead of positionally.
	* hv.Curve([(0, 0), (1, 1)], label='y=x').opts(color='black')
)

In [None]:
from itertools import combinations

def co_occurrence(data: pl.Series):
	"""Count how often each pair of features appears together in the same row.

	Args:
		data: Series whose rows are collections of sortable features. Null
			rows are skipped.

	Returns:
		DataFrame with columns 'feature1', 'feature2' and 'count', sorted by
		descending count. Within a pair, feature1 comes first in sorted order.
		The frame is empty (but has all three columns) when no row holds at
		least two features.
	"""
	co_occurrences = []
	for row in data:
		# A null row has no features to pair; sorted(None) would raise.
		if row is None:
			continue
		# Sorting first gives every unordered pair a single canonical order.
		for feature1, feature2 in combinations(sorted(row), 2):
			co_occurrences.append({
				'feature1': feature1,
				'feature2': feature2,
			})

	# Without any pair the frame would have no columns and the group_by below
	# would fail, so return an explicitly typed empty result instead.
	if not co_occurrences:
		return pl.DataFrame(schema={
			'feature1': pl.String,
			'feature2': pl.String,
			'count': pl.UInt32,
		})

	df = pl.DataFrame(
		co_occurrences,
	)

	# Count co-occurrences
	co_occurrence_counts = df.group_by(['feature1', 'feature2']).agg(
		pl.len().alias('count')
	).sort('count', descending=True)

	return co_occurrence_counts

def draw_co_occurrence(feature: str):
	"Draw a titled heatmap of co-occurrence counts for one column of user_animes (no triangle masking yet)."
	pair_counts = co_occurrence(user_animes.get_column(feature))
	plot_title = f"{feature.capitalize()} Co-occurrence Matrix"

	# TODO format data into a matrix with lower triangle masked
	return pair_counts.hvplot.heatmap(
		x='feature1',
		y='feature2',
		C='count',
		title=plot_title,
		width=800,
		height=800,
		rot=45,
	)

# Show the co-occurrence heatmaps for the 'genres' and 'themes' columns.
display(
	draw_co_occurrence('genres'),
	draw_co_occurrence('themes')
)