In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Phase 1 - ETL

Here I will load and transform the data I will be using.
Originally, there were three datasets:
- anime_cleaned.csv (contains data about the anime);
- animelists_cleaned.csv (contains each user's ratings of each anime);
- users_cleaned.csv (contains user data).

To keep the experiment within my system’s processing capacity, I needed to reduce the data volume, so I opted to:
- Remove anime whose total number of ratings was below the median across all anime;
- Remove users who had watched fewer shows than the median, or more than the median plus one standard deviation, in order to handle outliers.

In this phase I also normalize the rating values (each score is divided by the highest score).

## Step 1 - Function declaration

Here I will declare the functions that will be used for loading and transforming the data.


In [5]:
# Paths for the raw files (the folder is relative to the working directory)
datasets_folder_path: str = "./DataSets"  # folder that holds the three CSV files below
anime_file_name: str = "anime_cleaned.csv"  # anime data
users_file_name: str = "users_cleaned.csv"  # user data
ratings_file_name: str = "animelists_cleaned.csv"  # users' ratings of each anime


def generate_key_user_atributes() -> pd.DataFrame:
    """Loads the users file, keeping only the user attributes that will be used.

    The file location is built from the notebook-level ``datasets_folder_path``
    and ``users_file_name``.

    Returns:
        users: pd.DataFrame with the columns username, user_id and
            user_completed, in that order
    Raises:
        ValueError: if one of the required columns is missing from the file
    """
    columns = ["username", "user_id", "user_completed"]
    # usecols lets the parser skip every other column instead of loading the
    # whole file and throwing most of it away afterwards.
    users = pd.read_csv(f"{datasets_folder_path}/{users_file_name}", usecols=columns)

    # usecols keeps the file's own column order, so reselect to pin the order.
    return users[columns]


def generate_key_ratings_atributes() -> pd.DataFrame:
    """Loads the ratings file, keeping only the rating attributes that will be used.

    The file location is built from the notebook-level ``datasets_folder_path``
    and ``ratings_file_name``.

    Returns:
        ratings: pd.DataFrame with the columns username, anime_id and
            my_score, in that order
    Raises:
        ValueError: if one of the required columns is missing from the file
    """
    columns = ["username", "anime_id", "my_score"]
    # usecols lets the parser skip every other column instead of loading the
    # whole file and throwing most of it away afterwards.
    ratings = pd.read_csv(f"{datasets_folder_path}/{ratings_file_name}", usecols=columns)

    # usecols keeps the file's own column order, so reselect to pin the order.
    return ratings[columns]


def generate_key_anime_atributes() -> pd.DataFrame:
    """Loads the anime file, keeping only the anime attributes that will be used.

    The file location is built from the notebook-level ``datasets_folder_path``
    and ``anime_file_name``.

    Returns:
        animes: pd.DataFrame with the columns anime_id and scored_by (the
            number of ratings), in that order
    Raises:
        ValueError: if one of the required columns is missing from the file
    """
    columns = ["anime_id", "scored_by"]
    # usecols lets the parser skip every other column instead of loading the
    # whole file and throwing most of it away afterwards.
    animes = pd.read_csv(f"{datasets_folder_path}/{anime_file_name}", usecols=columns)

    # usecols keeps the file's own column order, so reselect to pin the order.
    return animes[columns]


def add_user_data_to_rating(ratings: pd.DataFrame, user_data: pd.DataFrame) -> pd.DataFrame:
    """Attaches the user attributes to every rating via an inner join on "username"
    Args:
        ratings: pd.DataFrame with the ratings (left side of the join)
        user_data: pd.DataFrame with the user data (right side of the join)
    Returns:
        pd.DataFrame with the rows whose "username" is present in both inputs"""
    return pd.merge(ratings, user_data, how="inner", on="username")


def normalize_ratings(ratings: pd.DataFrame) -> pd.DataFrame:
    """Normalizes the column "my_score" by dividing all values by the biggest value

    The input frame is not modified; the extra column only exists on the
    returned frame.

    Args:
        ratings: pd.DataFrame with the original data (needs a "my_score" column)
    Returns:
        pd.DataFrame with the same rows and columns plus an additional column
        called "normalized_score"
    """
    highest_score = ratings["my_score"].max()
    # assign() builds a new frame, so the caller's DataFrame keeps its original
    # columns instead of silently gaining one as a side effect.
    return ratings.assign(normalized_score=ratings["my_score"] / highest_score)


def remove_users_with_low_number_of_ratings(users: pd.DataFrame) -> pd.DataFrame:
    """Keeps only the users whose "user_completed" lies between the median and the median plus one standard deviation (both ends included)
    Args:
        users: pd.DataFrame with user data (needs a "user_completed" column)
    Returns:
        pd.DataFrame with the rows of users that fall inside that range"""
    completed = users["user_completed"]
    lower_bound = completed.median()
    upper_bound = lower_bound + completed.std()

    # between() includes both bounds by default
    return users.loc[completed.between(lower_bound, upper_bound)]


def remove_anime_with_few_ratings(animes: pd.DataFrame) -> pd.DataFrame:
    """Removes anime with a number of ratings below the median
    Args:
        animes: pd.DataFrame with all anime data (needs a "scored_by" column)
    Returns:
        animes: pd.DataFrame with all anime data, except for those excluded
            above; anime exactly at the median are kept"""
    median = animes["scored_by"].median()
    # ">=" (not ">"): only anime strictly below the median are dropped, which
    # matches the description above and the lower bound of the user filter.
    animes = animes.loc[animes["scored_by"] >= median]

    return animes

def drop_unused_info(df: pd.DataFrame) -> pd.DataFrame:
    """Drops the columns that are not needed after the join
    Args:
        df: pd.DataFrame containing username, my_score, user_completed and scored_by
    Returns:
        pd.DataFrame without those four columns"""
    no_longer_needed = ["username", "my_score", "user_completed", "scored_by"]
    return df.drop(columns=no_longer_needed)

## Step 2 - Defining a pipeline

Here I'll create a pipeline that loads, normalizes and joins the data using the functions above.

In [6]:
def load_normalize_and_join() -> pd.DataFrame:
    """Runs the whole ETL: loads the 3 main dataframes, filters, normalizes and joins them
    Returns:
        rating_data: pd.DataFrame containing anime_id, normalized_score and user_id"""
    # Users, restricted to the accepted range of completed shows
    kept_users = remove_users_with_low_number_of_ratings(generate_key_user_atributes())

    # Ratings, with the normalized score added
    scored_ratings = normalize_ratings(generate_key_ratings_atributes())

    # Anime, restricted to those with enough ratings
    kept_animes = remove_anime_with_few_ratings(generate_key_anime_atributes())

    # Both joins are inner joins, so ratings from removed users and ratings of
    # removed anime disappear here.
    ratings_with_users = add_user_data_to_rating(scored_ratings, kept_users)
    joined = ratings_with_users.merge(kept_animes, on="anime_id", how="inner")

    return drop_unused_info(joined)

## Step 2.1 - Running the pipeline and storing the results
Run the cell below to execute the load, normalize and join pipeline. The following cell stores the results in a .csv file, which makes it possible to load only that file later instead of repeating the first steps.

In [7]:
# Run the full pipeline: read the three raw CSV files, filter, normalize and join them
rating_data = load_normalize_and_join()

In [8]:
# Store the pipeline output on disk; index=False keeps the row index out of the file
rating_data.to_csv("./normalized_joined_data.csv", index=False)

In [None]:
# Alternative to running the pipeline: reload the result stored by the cell above
rating_data = pd.read_csv("./normalized_joined_data.csv")

## Step 3 - Generating a Cosine Similarity Matrix

Here I'll create a matrix with all users as rows and all anime as columns, filled with the normalized score each user gave each anime.

In [None]:
def create_user_anime_matrix(rating_data: pd.DataFrame) -> pd.DataFrame:
    """Pivots the ratings into a user x anime matrix: one row per user_id, one column per anime_id, each cell holding the normalized score (0 where the user has no rating for that anime)
    Args:
        rating_data: pd.DataFrame containing user_ids, anime_ids and normalized ratings
    Returns:
        pd.DataFrame indexed by user_id with one column per anime_id"""
    # pivot_table averages the scores if a (user_id, anime_id) pair occurs more than once
    pivoted = pd.pivot_table(
        rating_data,
        values="normalized_score",
        index="user_id",
        columns="anime_id",
    )

    # Pairs without any rating come out of the pivot as NaN
    return pivoted.fillna(0)


def create_similarity_matrix(user_anime_matrix: pd.DataFrame) -> pd.DataFrame:
    """Computes the cosine similarity between every pair of rows of a user-anime-rating matrix
    Args:
        user_anime_matrix: pd.DataFrame with a matrix of user/anime/rating (one row per user)
    Returns:
        pd.DataFrame with the level of similarity between all users, labelled on both axes with the index of the input matrix"""
    user_ids = user_anime_matrix.index

    return pd.DataFrame(
        cosine_similarity(user_anime_matrix),
        index=user_ids,
        columns=user_ids,
    )

## Step 4 - instantiate the matrix

Here I'll simply instantiate the matrices as notebook-level variables, so they can be used later on.

In [11]:
# Build both matrices once; get_similar_users and suggest_anime read them as notebook-level variables
user_anime_matrix = create_user_anime_matrix(rating_data)
similarity_matrix = create_similarity_matrix(user_anime_matrix)

## Step 5 - Define the suggestion methods

Below are the functions used to find similar users and, from them, build a list of anime suggestions for a given user_id.

In [None]:
def get_similar_users(user_id: int, top_n: int=5) -> pd.Series:
    """Get the N users most similar to a given user

    Reads the notebook-level ``similarity_matrix``.

    Args:
        user_id: int id of the user to compare against
        top_n: int number of most-similar users returned
    Returns:
        similar: pd.Series indexed by user_id whose values are the similarity
            to ``user_id``, sorted from most to least similar; the user itself
            is excluded
    Raises:
        KeyError: if ``user_id`` is not present in the similarity matrix
    """
    if user_id not in similarity_matrix.columns:
        # Same exception type pandas would raise, but with a message that says
        # what is actually wrong.
        raise KeyError(f"user_id {user_id!r} is not present in the similarity matrix")

    similar = similarity_matrix[user_id].sort_values(ascending=False)
    # A user is always its own best match, so take it out of the ranking.
    similar = similar.drop(user_id)
    return similar.head(top_n)

def suggest_anime(user_id: int, top_n: int = 10, n_similar_users: int = 5) -> pd.Series:
    """Get the N anime with the best average rating among the most similar users

    Reads the notebook-level ``user_anime_matrix`` and calls
    ``get_similar_users``.

    Args:
        user_id: int id of the user
        top_n: int number of anime suggestions
        n_similar_users: int number of most-similar users whose ratings are
            averaged (defaults to 5, the value that was previously implied)
    Returns:
        pd.Series indexed by anime_id with the average rating given by the
        similar users, highest first, restricted to the anime whose entry in
        the user's own row of the matrix is 0
    """
    similar_users = get_similar_users(user_id, top_n=n_similar_users)

    # Rows of the similar users only. The mean runs over all of those rows, so
    # a 0 entry pulls an anime's average down.
    neighbour_ratings = user_anime_matrix.loc[similar_users.index]
    average_ratings = neighbour_ratings.mean(axis=0)

    # NOTE(review): an entry of 0 is treated as "not watched"; a stored score
    # of 0 would be indistinguishable from a missing rating - confirm intended.
    own_ratings = user_anime_matrix.loc[user_id]
    non_watched_anime = average_ratings[own_ratings == 0]

    return non_watched_anime.sort_values(ascending=False).head(top_n)

## Step 6 - Run the suggestion

Below, the function can be executed to return N suggestions for a given user_id.

In [15]:
# Top 20 suggestions for user 66 (requires the matrices from Step 4 to be instantiated)
suggest_anime(user_id=66, top_n=20)

anime_id
60      0.86
488     0.84
1887    0.84
30      0.82
120     0.80
387     0.80
1       0.78
50      0.78
2904    0.78
28      0.78
889     0.74
245     0.72
45      0.70
61      0.70
2001    0.68
202     0.66
467     0.64
97      0.62
227     0.62
2025    0.60
dtype: float64