# MyAnimeStats

Explore insightful statistics about a [MyAnimeList](https://myanimelist.net/) user's anime list.

## List Import

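Import the anime list. A local XML export (`data/animelist.xml.gz`) is preferred; if it is not available, the username is read from `data/username.txt` or asked for interactively.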

In [None]:
import pandas as pd
from IPython.display import display, HTML
from pathlib import Path
from src.log import logger
from src.api import import_data

# Child logger used for the messages emitted by this notebook.
log = logger.getChild("main")
# Directory (relative to the working directory) that holds the optional
# input files 'username.txt' and 'animelist.xml.gz'.
data = Path('data')
# Never truncate the rows of a displayed DataFrame.
pd.set_option('display.max_rows', None)

# Get username
def get_username():
    """Return the MAL username, or None when none could be obtained.

    The name is taken from ``data/username.txt`` when that file can be read
    and is not blank; otherwise the user is prompted for it. Surrounding
    whitespace is removed in both cases, and a blank answer counts as
    "no username".
    """
    file = data / 'username.txt'
    try:
        username = file.read_text().strip()
    except (OSError, UnicodeDecodeError):
        # Missing, unreadable or undecodable file: fall back to the prompt.
        log.warning(f"Failed to load {file}")
    else:
        if username:
            return username
        # A blank file is not a usable username; fall back to the prompt.
        log.warning(f"Failed to load {file}")

    try:
        username = input('Enter your MAL username: ').strip()
    except Exception:
        # Best effort: input() can fail when no interactive stdin is
        # available. KeyboardInterrupt is deliberately not caught so the
        # user can still abort the cell.
        log.warning('Failed to get username')
        return None

    return username or None

def get_xml():
    """Open ``data/animelist.xml.gz`` in binary mode.

    Returns the open file object, or None when the file cannot be opened.
    The file is opened as-is (no decompression happens here), and the caller
    owns the returned handle.
    """
    file = data / 'animelist.xml.gz'
    try:
        return file.open('rb')
    except OSError:
        # Covers a missing file, a directory in its place, permission errors, ...
        log.warning(f"Failed to load {file}")
        return None

# Prioritize xml
xml = get_xml()

# Get username
if xml is None:
    username = get_username()

# Load data
animes, staff = await import_data(username=username, export_file=xml)
df_animes = pd.DataFrame(animes)

display(HTML("<h3>MyAnimeStats successfully imported data.</h3>"))

## Best scored franchises

Which anime franchises do you like the most?

In [None]:
from src.franchises import get_franchises

# Tabulate the entries returned by get_franchises(), one row per entry.
franchises = get_franchises(animes)
df = pd.DataFrame(franchises)

# Columns shown in the ranking table, in display order.
top_10_columns = [
    'title_english',
    'my_score',
    'score',
    'genres',
    'themes',
    'demographics',
    'episodes',
    'studios',
]

# Get the top 10 franchises by the user's own score
top_10 = df.sort_values('my_score', ascending=False).head(10)
top_10 = top_10[top_10_columns]

header = "<h3>Best scored franchises</h3>"
table_html = top_10.to_html(index=False)
display(HTML(header + table_html))

## Air Schedule

When does the next episode air?

In [None]:
import time
from src.schedule import get_schedule

# Air schedule for the imported list; the code below treats None as
# "nothing currently airing".
schedule = get_schedule(animes)

def get_local_tz_name():
    """Return the local timezone name, or 'Unknown' if the DST flag is neither 0 nor 1."""
    # tm_isdst doubles as the index into time.tzname: 0 = standard, 1 = DST.
    dst_flag = time.localtime().tm_isdst
    if dst_flag not in (0, 1):
        return 'Unknown'
    return time.tzname[dst_flag]

# Show the schedule table, or a notice when there is no schedule to show
if schedule is not None:
    header = f"<h3>Currently airing shows</h3><h4>Japan air time converted to {get_local_tz_name()}</h4>"
    schedule_html = schedule.to_html(index=False)
    display(HTML(header + schedule_html))
else:
    display(HTML("<h3>User is not watching anything currently airing</h3>"))

## Scores stats

How do you score anime and how does it compare to others?

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side histograms: community score vs. the user's own score
fig, axs = plt.subplots(1, 2, figsize=(14, 6))

# histplot's default statistic is the per-bin count (the KDE curve is scaled
# to match), so the y-axes are labelled as counts.
sns.histplot(df['score'], kde=True, bins=30, ax=axs[0])
axs[0].set_title('MyAnimeList Score Distribution of Franchises')
axs[0].set_xlabel('MyAnimeList Score')
axs[0].set_ylabel('Count')

# Round the user's franchise scores to whole numbers before binning.
df['my_score_rounded'] = df['my_score'].round(0)
sns.histplot(df['my_score_rounded'], kde=True, bins=30, ax=axs[1])
axs[1].set_title('User Score Distribution of Franchises')
axs[1].set_xlabel('User Score')
axs[1].set_ylabel('Count')

# tight_layout only affects one figure, so apply it to this one before the
# next figure is created.
fig.tight_layout()

# 2-D histogram of the user's score against air year, built from the
# per-anime frame. The y-axis shows the scores themselves, not an average.
(fig, axs) = plt.subplots(1, 1, figsize=(14, 6))
sns.histplot(df_animes, x='year', y='my_score', bins=30, ax=axs)
axs.set_title('User Score Distribution by Air Year')
axs.set_xlabel('Start Year')
axs.set_ylabel('User Score')

fig.tight_layout()
plt.show()

## Genres, Themes, and Studios stats

What are your favorite genres, themes, and studios?

In [None]:
# Helper function for generating box plots
def box_plot(data, x, y, title, xlabel, ylabel, order):
    """Draw a box plot of column ``y`` grouped by column ``x`` on a new figure.

    ``order`` is passed to seaborn as the category order along the x-axis.
    Nothing is drawn when column ``x`` contains only null values.
    """
    # Guard clause: skip the plot entirely when there is nothing to group by.
    if not data[x].notna().any():
        return

    plt.figure(figsize=(14, 6))
    sns.boxplot(data=data, x=x, y=y, order=order)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Category names can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

# Unnest the 'genres', 'themes', 'studios', and 'demographics' fields and
# create one box plot per field.
threshold = 8  # a category needs at least this many rows to be plotted
for field in ('genres', 'themes', 'studios', 'demographics'):
    exploded = df.explode(field).reset_index(drop=True)

    # Filter out entries with too few data points
    df_unnested = exploded.groupby(field).filter(lambda group: len(group) >= threshold)

    # Sort by descending median my_score
    medians = df_unnested.groupby(field)['my_score'].median()
    order = medians.sort_values(ascending=False).index

    label = field.capitalize()
    box_plot(df_unnested, field, 'my_score', f'User Score Distribution by {label}', label, 'My Score', order)


## Most unpopular opinions

Which franchises do you score very differently from the community?

In [None]:
# Rank scaled score
def scale_scores(scores: pd.Series) -> pd.Series:
    """Scale scores to a range of 0 to 1 using rank scaling.

    The highest score maps to 1 and the lowest to 0; tied scores share their
    average rank. A DataFrame is also accepted and is ranked column by column.

    With a single entry there is no spread to scale over, so that entry maps
    to 1 (the value the top rank always gets) instead of the NaN a 0/0
    division would produce. NaN inputs stay NaN.
    """
    ranks = scores.rank(ascending=False)
    span = len(scores) - 1
    if span <= 0:
        # Zero or one entry: avoid dividing by zero.
        return ranks.where(ranks.isna(), 1.0)
    return 1 - (ranks - 1) / span

# Ignore NaN scores: keep only rows where both scores are present. The
# explicit copy makes df_notna an independent frame, so the column assignments
# below do not write to a slice of df (no SettingWithCopyWarning).
df_notna = df.dropna(subset=['score', 'my_score']).copy()

# Scaled score to remove my own bias
df_notna['score_scaled'] = scale_scores(df_notna['score'])
df_notna['my_score_scaled'] = scale_scores(df_notna['my_score'])

# Positive difference: the user ranks the franchise higher than the community.
df_notna['score_difference'] = df_notna['my_score_scaled'] - df_notna['score_scaled']
df_notna['score_difference_abs'] = df_notna['score_difference'].abs()
df_sorted_abs = df_notna.sort_values(by='score_difference_abs', ascending=False)

header = "<h3>Most unpopular opinions</h3>"
df_sorted_abs_head = df_sorted_abs[['title_english', 'score_difference']].head(10)
display(HTML(header + df_sorted_abs_head.to_html(index=False)))

## Normie-ness

How popular are your opinions?

In [None]:
# Twice the mean absolute scaled-score difference, subtracted from 1 and
# shown as a percentage.
mean_abs_difference = df_sorted_abs["score_difference_abs"].mean()
normie_ness = 1 - (mean_abs_difference * 2)
display(HTML(f"<h3>Normie-ness: {normie_ness:.2%}</h3>"))

## My opinion vs the world's

Is your scoring similar to the community?

In [None]:
# Color thresholds as a list of (upper bound, color) tuples, in ascending
# order of bound. get_color() returns the color of the first bound that the
# absolute score difference does not exceed.
color_thresholds = [
    (0.05, 'green'),
    (0.15, 'orange'),
]

# Define a function to get color based on score difference and the List of Tuples representation
def get_color(score_diff, thresholds):
    """Return the color of the first threshold that ``score_diff`` does not exceed.

    ``thresholds`` is an iterable of ``(upper_bound, color)`` pairs checked in
    order. 'red' is returned when no bound matches.
    """
    matching_colors = (color for bound, color in thresholds if score_diff <= bound)
    return next(matching_colors, 'red')

# Color each row by its absolute scaled-score difference
df_notna['Color'] = df_notna['score_difference_abs'].apply(get_color, args=(color_thresholds,))

# Scatter plot of the user's scaled score against the community's scaled score
ax = df_notna.plot.scatter(
    x='my_score_scaled',
    y='score_scaled',
    c=df_notna['Color'],
    figsize=(10, 10),
    alpha=0.7,
    s=50,
    edgecolor='k',
)

# y=x guide line
ax.plot([0, 1], [0, 1], 'k--', linewidth=0.5)

ax.set_title('User Score vs MyAnimeList Score')
ax.set_xlabel('User Score')
ax.set_ylabel('MyAnimeList Score')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()

## Co-occurrence stats

What are the most common combinations of genres and themes?

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

def draw_co_occurrence_matrix(data, title):
    """Plot the column-by-column co-occurrence counts of ``data`` as a heatmap.

    The matrix is the product of ``data`` transposed with ``data``, labelled
    on both axes with the column names of ``data``. The diagonal and
    everything above it are masked, so only the lower triangle is drawn.
    """
    labels = data.columns
    values = data.to_numpy()
    counts = pd.DataFrame(values.T @ values, index=labels, columns=labels)

    # True cells are hidden by the heatmap: the diagonal and the upper triangle.
    hidden = np.triu(np.ones_like(counts, dtype=bool))

    plt.figure(figsize=(10, 10))
    sns.heatmap(counts, mask=hidden, cmap='coolwarm', center=0, annot=False, fmt='d')
    plt.title(title)
    plt.show()

# One binarizer, refit on every pass of the loop below
mlb = MultiLabelBinarizer()

# One-hot encode the genres and themes of every franchise and plot how often
# the labels of each feature occur together
for feature in ('genres', 'themes'):
    feature_list = [entry[feature] for entry in franchises]
    feature_encoded = mlb.fit_transform(feature_list)
    # classes_ is read after fit_transform, so it matches the encoded columns.
    df_feature = pd.DataFrame(feature_encoded, columns=mlb.classes_)
    plot_title = f'{feature.capitalize()} Co-occurrence Matrix'
    draw_co_occurrence_matrix(df_feature, plot_title)

## Reoccurring staff members

Who worked on a lot of anime you loved?

In [None]:
# Show staff: render the result of reoccurring_staff() as HTML.
# NOTE(review): show_top=10 presumably limits the output to ten entries;
# confirm against src.staff.
from src.staff import reoccurring_staff
display(HTML(reoccurring_staff(staff, show_top=10)))