## Import libraries

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

## Read the dataset

In [None]:
# load the anime table from CSV, print its (rows, columns) shape and preview the first rows
df_anime = pd.read_csv('data/anime-dataset-2023.csv')
print(df_anime.shape)
df_anime.head()

In [None]:
# load the user-details table from CSV, print its (rows, columns) shape and preview the first rows
df_user = pd.read_csv('data/users-details-2023.csv')
print(df_user.shape)
df_user.head()

In [None]:
# load the user-score table from CSV, print its (rows, columns) shape and preview the first rows
df_score = pd.read_csv('data/users-score-2023.csv')
print(df_score.shape)
df_score.head()

## Explore the data

In [None]:
# check the information of df_anime
df_anime.info()

In [None]:
# check the amount of values in Score column
df_anime['Score'].value_counts()

### Many rows in the Score column have the value UNKNOWN. These rows are useless and need to be dropped.

In [None]:
# drop the rows whose Score is UNKNOWN; .copy() turns the filtered result into an
# independent DataFrame, so the column assignments in the following cells
# (Rank, Episodes, dtype conversion) do not raise SettingWithCopyWarning
df_anime = df_anime[df_anime['Score'] != 'UNKNOWN'].copy()
df_anime['Score'].value_counts()

In [None]:
df_anime.info()

In [None]:
# check the amount of values in Rank column
df_anime['Rank'].value_counts()

### The Rank column has 2991 rows marked as UNKNOWN. These should be changed to null values instead of being dropped, because dropping the rows would discard other important data about those anime.

In [None]:
# mark unknown ranks as missing instead of dropping the rows
df_anime['Rank'] = df_anime['Rank'].replace('UNKNOWN', np.nan) # change the value to nan
# value_counts() skips NaN by default, so the replaced entries no longer show up here
df_anime['Rank'].value_counts()

In [None]:
# check the amount of values in Episodes column
df_anime['Episodes'].value_counts()

### The Episodes column has 88 rows marked as UNKNOWN. These should be changed to null values as well.

In [None]:
df_anime['Episodes'] = df_anime['Episodes'].replace('UNKNOWN', np.nan) # change the value to nan

In [None]:
df_anime.info()

### Score, Scored By, Rank and Episodes are of object type. They need to be converted to a numerical data type.

In [None]:
# cast every numeric-looking object column to float64, one column at a time
for numeric_col in ('Score', 'Scored By', 'Rank', 'Episodes'):
    df_anime[numeric_col] = df_anime[numeric_col].astype(np.float64)

In [None]:
df_anime.info()

In [None]:
# check the amount of values in Duration column
df_anime['Duration'].value_counts()

### The Duration column is not purely numerical and needs to be converted entirely to numerical values, with the unit being minutes.

In [None]:
import re # support for regular expressions

# compiled once at module level so that .apply() does not rebuild them for every row
_HOUR_PATTERN = re.compile(r'(\d+)\s*hr')
_MINUTE_PATTERN = re.compile(r'(\d+)\s*min')


def convert_duration_to_minutes(duration):
    """Convert a duration string such as '1 hr 30 min' to whole minutes.

    Only the first '<n> hr' and the first '<n> min' occurrence are used;
    any other text (for example 'per ep' or a seconds part) is ignored.

    Args:
        duration: the raw duration value. Anything that is not a string
            (NaN, None, a number) is treated as "no duration found".

    Returns:
        int: hours * 60 + minutes, or 0 when neither part is present.
    """
    # missing values reach .apply() as float NaN; the regex would raise TypeError on them
    if not isinstance(duration, str):
        return 0

    hour_match = _HOUR_PATTERN.search(duration)
    minute_match = _MINUTE_PATTERN.search(duration)

    hours = int(hour_match.group(1)) if hour_match else 0
    minutes = int(minute_match.group(1)) if minute_match else 0

    return hours * 60 + minutes # return the total minutes

In [None]:
# overwrite Duration with the value returned by convert_duration_to_minutes for each row
df_anime['Duration'] = df_anime['Duration'].apply(convert_duration_to_minutes) # apply the above function to Duration column
df_anime.head()

In [None]:
df_anime.info()

### What is the overall distribution of anime ratings on MyAnimeList, and what key features does it exhibit? (RQ1)


In [None]:
# extract the Score column and count it, then merge it into a new dataframe
df_score_counts = df_anime['Score'].value_counts().reset_index()
df_score_counts.columns = ['Score', 'Count']
df_score_counts.head()

In [None]:
# check the distribution of Score (Animation Ratings)
fig = px.histogram(
    df_score_counts,
    x='Score',
    y='Count',
    labels={'Ratings': 'Ratings', 'Count': 'Rating Amount'}
)
fig.update_layout(
    title={'text': 'Distribution of Animation Ratings', 'x': 0.5}, # set the title and make it in the center
    width=1000,
    height=600, 
    showlegend=False # hide the legend
)
fig.update_xaxes(dtick=0.1) # set the x-axis interval to 0.1
fig.update_traces(xbins=dict(size=0.1)) # set the width of bars to 0.1
fig.show()

#### **(1) explore the relationship between Type and Ratings**

In [None]:
# check the amount of values in Type column
df_anime['Type'].value_counts()

In [None]:
# drop the rows whose Type is UNKNOWN; .copy() keeps the result independent of the
# unfiltered frame so later column assignments do not raise SettingWithCopyWarning
df_anime = df_anime[df_anime['Type'] != 'UNKNOWN'].copy()

In [None]:
# draw the violin picture for Type and Score (Rating)
fig = px.violin(
    df_anime, 
    x='Type', 
    y='Score', 
    labels={'Type':'Anime Type', 'Score':'Rating'},
    color='Type'
)
fig.update_layout(
    title={'text': 'Distribution of Anime Rating by Type', 'x': 0.5}, # set the title and make it in the center
    width=800,
    height=500,
    showlegend=True # show the legend (in the right part of the picture)
)
fig.show()

In [None]:
# handle NaNs and infinite values
df_anime = df_anime.replace([np.inf, -np.inf], np.nan)
df_anime_num_cat1 = df_anime.dropna(subset=['Score', 'Type'])
# filter out 'Type' categories with insufficient data
min_elements = 2  # Set the minimum number of elements required in each group
filtered_types = df_anime_num_cat1.groupby('Type').filter(lambda x: len(x) >= min_elements)
# prepare data for KDE plot
types = filtered_types['Type'].unique()
hist_data = [filtered_types[filtered_types['Type'] == t]['Score'].values for t in types]
# check if there is sufficient data to plot
if all(len(data) > 1 for data in hist_data):
    # create KDE plot
    fig = ff.create_distplot(hist_data, types, show_hist=False, show_rug=False)

    fig.update_layout(
        title={'text': "KDE Plot of Rating by Type", "x": 0.5}, # set the title and make it in the center
        xaxis_title="Rating",
        yaxis_title="Density",
        legend_title="Anime Type",
        width=1200,
        height=600
    )

    fig.show()
else:
    print("No enough data to create a KDE plot for each Type.")

#### **(2) exploring the relationship between the number of episodes, episode duration, popularity, number of favorites, and ratings of an animation**

In [None]:
# Score vs Episodes
fig1 = go.Figure()
fig1.add_trace(
    go.Scatter(
        x=df_anime['Score'],
        y=df_anime['Episodes'],
        mode='markers',
        name='Rating vs Episodes',
        marker=dict(
            size=5,  # set the size of the point
            color=df_anime['Score'],  # set the color of the point using value of Score column
            colorscale='Viridis',  # set the color mapping
            showscale=True  # show the color bar
        )
    )
)
fig1.update_layout(
    title={'text': 'Rating vs Episodes', 'x': 0.5},
    xaxis_title='Rating',
    yaxis_title='Episodes',
    width=450,
    height=400,
    showlegend=False
)
fig1.show()

In [None]:
# Score vs Duration
fig2 = go.Figure()
fig2.add_trace(
    go.Scatter(
        x=df_anime['Score'],
        y=df_anime['Duration'],
        mode='markers',
        name='Rating vs Duration',
        marker=dict(
            size=5,
            color=df_anime['Duration'],
            colorscale='Cividis',
            showscale=True
        )
    )
)
fig2.update_layout(
    title={'text': 'Rating vs Duration', 'x': 0.5},
    xaxis_title='Rating',
    yaxis_title='Duration',
    width=450,
    height=400,
    showlegend=False
)
fig2.show()

In [None]:
# Score vs Popularity
fig3 = go.Figure()
fig3.add_trace(
    go.Scatter(
        x=df_anime['Score'],
        y=df_anime['Popularity'],
        mode='markers',
        name='Rating vs Popularity',
        marker=dict(
            size=5,
            color=df_anime['Popularity'],
            colorscale='Bluered',
            showscale=True
        )
    )
)
fig3.update_layout(
    title={'text': 'Rating vs Popularity', 'x': 0.5},
    xaxis_title='Rating',
    yaxis_title='Popularity',
    width=450,
    height=400,
    showlegend=False
)
fig3.show()

In [None]:
# Score vs Favorites
fig4 = go.Figure()
fig4.add_trace(
    go.Scatter(
        x=df_anime['Score'],
        y=df_anime['Favorites'],
        mode='markers',
        name='Rating vs Favorites',
        marker=dict(
            size=5,
            color=df_anime['Favorites'],
            colorscale='Portland',
            showscale=True
        )
    )
)
fig4.update_layout(
    title={'text': 'Rating vs Favorites', 'x': 0.5},
    xaxis_title='Rating',
    yaxis_title='Favorites',
    width=450,
    height=400,
    showlegend=False
)
fig4.show()

#### **(3) explore the relationship between the premiere year and ratings**

In [None]:
# crate a new dataframe to process the data about premiered year
df_premiered = pd.DataFrame(df_anime)
# extract the year and drop UNKNOWN rows
df_premiered['Year'] = df_premiered['Premiered'].apply(lambda x: x.split()[-1] if x != 'UNKNOWN' else None)
# filter the null values
df_premiered = df_premiered.dropna(subset=['Year'])
# convert the data type to int64
df_premiered['Year'] = df_premiered['Year'].astype(np.int64)

# create the picture
fig = px.scatter(
    df_premiered, 
    x='Year', 
    y='Score', 
    labels={'Year': 'Premiered Year', 'Score': 'Score'}, 
    title='Premiered Year vs Score'
)

fig.update_layout(
    title={'text': 'Premiered Year vs Score', 'x': 0.5}
)

fig.update_xaxes(dtick=1) # set the interval of x-axis to 1 year

fig.show()

In [None]:
# calculate the average score for each year
average_scores = df_premiered.groupby('Year')['Score'].mean().reset_index()

# create line chart
fig = px.line(
    average_scores, 
    x='Year', 
    y='Score', 
    labels={'Year': 'Premiered Year', 'Score': 'Average Rating'}, 
    title='Average Score by Premiered Year'
)

fig.update_layout(
    title={'text': 'Average Rating by Premiered Year', 'x': 0.5},
    width=900,
    height=600,
)

fig.update_xaxes(dtick=1)

fig.update_traces(line=dict(color='blue')) # set the color of the line

fig.show()

#### **(4) explore the differences in ratings for animations of various genres**

In [None]:
# process NaN and outliers
df_anime = df_anime.replace([np.inf, -np.inf], np.nan)
df_anime_num_cat2 = df_anime.dropna(subset=['Score', 'Genres'])
# split the Genres column
df_anime_num_cat2 = df_anime_num_cat1.assign(Genres=df_anime_num_cat2['Genres'].str.split(', ')).explode('Genres')
# filter out categories with insufficient samples
min_elements = 2  # eet the minimum number of samples for each category
filtered_genres = df_anime_num_cat2.groupby('Genres').filter(lambda x: len(x) >= min_elements)
# prepare data for KDE chart
genres = filtered_genres['Genres'].unique()
hist_data = [filtered_genres[filtered_genres['Genres'] == genre]['Score'].values for genre in genres]
# check if there is enough data to create a chart
if all(len(data) > 1 for data in hist_data):
    # create KDE chart
    fig = ff.create_distplot(hist_data, genres, show_hist=False, show_rug=False)

    fig.update_layout(
        title={'text': "KDE Plot of 'Score' by 'Genres'", "x": 0.5},
        xaxis_title="Score",
        yaxis_title="Density",
        legend_title="Genres",
        width=1200,
        height=700,
    )

    fig.show()
else:
    print("Not enough data to create a KDE plot for each 'Genres'.")

## Clean up user-details-2023

In [None]:
df_user.info()

In [None]:
# names of the numerical columns of df_user
df_user_num_list = list(df_user.select_dtypes(include=['number']).columns)
# names of the object / categorical columns of df_user
df_user_cat_list = list(df_user.select_dtypes(include=['object', 'category']).columns)
print(df_user_num_list)
print(df_user_cat_list)

In [None]:
df_user.isnull().sum() / df_user.shape[0] * 100 # check the amount of null values

### Over 69% of the fields for Gender, Birthday, and Location are empty.

In [None]:
df_user.describe() # check the data distribution of df_user

### It is shocking that over 50% of the accounts are useless; they are merely registered and have not fully watched any anime. These entries must be cleared.

In [None]:
# remove useless accounts
drop_id = ((df_user['Days Watched']==0) & (df_user['Mean Score']==0) & (df_user['Watching']==0) & (df_user['Completed']==0) 
           & (df_user['On Hold']==0) & (df_user['Dropped']==0) & (df_user['Plan to Watch']==0) & (df_user['Total Entries']==0) 
           & (df_user['Rewatched']==0))
df_user = df_user[~drop_id]
df_user.shape

In [None]:
df_user.describe()

In [None]:
df_user.info()

## How do fundamental features of anime, such as genre, premiere year, and number of episodes, correlate with ratings? (RQ2)


In [None]:
# extract the Mean Score of users and count them
df_user_mean_score_counts = df_user['Mean Score'].value_counts().reset_index()
df_user_mean_score_counts.columns = ['Mean Score', 'Count']
df_user_mean_score_counts = df_user_mean_score_counts[df_user_mean_score_counts['Mean Score'] != 0] # drop 0 rows
df_user_mean_score_counts.head()

In [None]:
# check the distribution of Mean Score
fig = px.histogram(
    df_user_mean_score_counts,
    x='Mean Score',
    y='Count',
    labels={'Mean Score': 'Mean Rating', 'Count': 'Amount'}
)
fig.update_layout(
    title={'text': 'Distribution of Users\' Mean Ratings', 'x': 0.5},
    width=1200,
    height=600,
    showlegend=False # hide the legend
)
fig.update_xaxes(range=[0, 10], dtick=0.1) # set the range of x-axis as 0 to 10, and the interval to 0.1
fig.update_traces(xbins=dict(size=0.1)) # set the width of bars to 0.1
fig.show()

## How are user ratings distributed on the platform, and are there observable biases or tendencies in user rating behaviors? (RQ3)

In [None]:
# Mean Score and gender
df_user_gender_score = df_user.dropna(subset=['Mean Score', 'Gender'])
# filter out 'Gender' categories with insufficient data
min_elements = 2  # set the minimum number of elements required in each group
filtered_genders = df_user_gender_score.groupby('Gender').filter(lambda x: len(x) >= min_elements)
# prepare data for KDE plot
genders = filtered_genders['Gender'].unique()
hist_data = [filtered_genders[filtered_genders['Gender'] == t]['Mean Score'].values for t in genders]
# check if there is sufficient data to plot
if all(len(data) > 1 for data in hist_data):
    # create KDE plot
    fig = ff.create_distplot(hist_data, genders, show_hist=False, show_rug=False) # hide the bar and rug plot

    fig.update_layout(
        title={'text': "KDE Plot of Mean Rating by Gender", "x": 0.5},
        xaxis_title="Mean Rating",
        yaxis_title="Density",
        legend_title="Gender",
        width=1200,
        height=600
    )

    fig.show()
else:
    print("Not enough data to create a KDE plot for each Gender.")

In [None]:
# Mean Score and age
df_user_age = df_user.dropna(subset=['Birthday', 'Mean Score'])
df_user_age['Birthday'] = pd.to_datetime(df_user_age['Birthday'], errors='coerce') # ensure Birthday is the type of datatime

current_year = 2023 # set current year
df_user_age['Age'] = current_year - df_user_age['Birthday'].dt.year # calculate the age

age_slice = [0, 18, 25, 35, 45, 55, 65, 120] # split the age
labels = ['0-18', '18-25', '25-35', '35-45', '45-55', '55-65', '65+'] # set the age group
df_user_age['Age Group'] = pd.cut(df_user_age['Age'], age_slice, labels=labels, right=False)
# filter out 'Age Group' categories with insufficient data
min_elements = 2  # set the minimum number of elements required in each group
filtered_age_groups = df_user_age.groupby('Age Group').filter(lambda x: len(x) >= min_elements)
# prepare data for KDE plot
age_groups = filtered_age_groups['Age Group'].unique()
hist_data = [filtered_age_groups[filtered_age_groups['Age Group'] == t]['Mean Score'].values for t in age_groups]
# check if there is sufficient data to plot
if all(len(data) > 1 for data in hist_data):
    # create KDE plot
    fig = ff.create_distplot(hist_data, age_groups, show_hist=False, show_rug=False)

    fig.update_layout(
        title={'text': "KDE Plot of Mean Rating by Age Group", "x": 0.5},
        xaxis_title="Mean Rating",
        yaxis_title="Density",
        legend_title="Age Group",
        width=1200,
        height=600
    )

    fig.show()
else:
    print("Not enough data to create a KDE plot for each Age Group.")