In [1]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Phase 1 - ETL

Here i will load and transform the data i will be using;
Originally, that were datasets:
- anime_cleaned.csv; (cointains data about the animes)
- animelists_cleaned.csv; (contains the data about the users ratings of each anime)
- users_cleaned.csv (contains user data)

In order to ensure the experiment remains compatible with my system’s processing capacity, i needed to reduce the data volume, so i opted to:
- Remove animes in which the total number of ratings were bellow the median of all anime;
- Remove users who had whatch less than the median number of shows or more than the median plus the standart deviation, in order to handle outliers.

In this phase i also normalized the values.

## Step 1 - Function declaration

Here i will declare the functions that will be used for loading and transformation


In [2]:

datasets_folder_path: str = "./DataSets"
anime_file_name: str = "anime_cleaned.csv"
users_file_name: str = "users_cleaned.csv"
ratings_file_name: str = "animelists_cleaned.csv"
def generate_key_user_atributes() -> pd.DataFrame :
    """Generates a dataframe keeping only the user atributes that will be used for trainning the model
    Args:
        path: String with the original dataset path
    Returns:
        users: pd.DataFrame containning users, number or completed shows and gender
    """
    users = pd.read_csv(f"{datasets_folder_path}/{users_file_name}")
    users = users[["username","user_id","user_completed","gender"]]
    return users
def generate_key_ratings_atributes() -> pd.DataFrame :
    """Generates a dataframe keeping only the user atributes that will be used for trainning the model
    Args:
        path: String with the original dataset path
    Returns:
        ratings: pd.DataFrame containning users, number or completed shows and gender"""
    ratings = pd.read_csv(f"{datasets_folder_path}/{ratings_file_name}")
    ratings = ratings[["username","anime_id","my_score"]]
    return ratings
def generate_key_anime_atributes() -> pd.DataFrame :
    """Generates a dataframe keeping only the user atributes that will be used for trainning the model
    Args:
        path: String with the original dataset path
    Returns:
        ratings: pd.DataFrame containning users, number or completed shows and gender"""
    animes = pd.read_csv(f"{datasets_folder_path}/{anime_file_name}")
    animes = animes [["anime_id","scored_by","duration_min","episodes","genre"]]

    return animes

def remove_unfinished_anime_ratings(ratings: pd.DataFrame, animes: pd.DataFrame) -> pd.DataFrame:
    """Removes all anime that the user did not see all episodes from the dateframe
    Args:
        ratings: pd.DataFrame with the users ratings
        animes: pd.DataFrame with the anime data
    Returns:
        completed_animes_ratings: pd.DataFrame with all ratings from completed animes"""
    ratings = ratings.merge(animes,on="anime_id",how="left")
    ratings["completed"] = ratings.apply(
        lambda row: "Y" if row["my_watched_episodes"] == row["episodes"] else "N", axis=1)
    completed_anime_ratings = ratings.loc[ratings["completed"] == "Y"]
    return completed_anime_ratings

def add_user_data_to_rating(ratings: pd.DataFrame, user_data: pd.DataFrame) -> pd.DataFrame:
    """Merge the ratings with user data
    Args:
        ratings: pd.DataFrame with the ratings
        user_data: pd.DataFrame with the user data
    Returns:
        full_ratings: pd.DataFrame with the merging of ratings and user data"""
    full_ratings = ratings.merge(user_data, on="username", how="inner")
    return full_ratings


def normalize_anime_genres(animes: pd.DataFrame) -> pd.DataFrame:
    """Split the genre list and adds one column for each genre, setting those as true or false for each genre present in the genre list for each anime"
    Args:
        animes: pd.DataFrame with anime data
    Returns:
        animes_genres: pd.DataFrame with all anime data and the normalized genre data
    """
    animes["genre_list"] = animes["genre"].str.split(", ")
    animes["genre_list"] = animes["genre_list"].apply(
        lambda x: x if isinstance(x, list) else []
    )
    mlb = MultiLabelBinarizer()
    genres_encoded = mlb.fit_transform(animes["genre_list"])
    animes_genres = pd.DataFrame(genres_encoded, columns=mlb.classes_)

    animes = pd.concat([animes, animes_genres], axis=1)
    
    return animes

def normalize_duration(animes: pd.DataFrame) -> pd.DataFrame:
    """Normalizes the episode duration by dividing all values by the highest value
    Args:
        anime: pd.DataFrame with all anime data
    Returns: 
        animes: pd.DataFrame with the normalized values"""
    animes["normalized_duration_min"] = animes["duration_min"] / animes["duration_min"].max()

    return animes

def normalize_episodes(animes: pd.DataFrame) -> pd.DataFrame:
    """Normalizes the number of episodes by dividing all values by the highest value
    Args:
        anime: pd.DataFrame with all anime data
    Returns: 
        animes: pd.DataFrame with the normalized values"""
    animes["normalized_episodes"] = animes["episodes"] / animes["episodes"].max()

    return animes


def normalize_ratings(ratings: pd.DataFrame) -> pd.DataFrame:
    """Normalizes the ratings by dividing all values by the highest possible value (10)
    Args:
        ratings: pd.DataFrame with all ratiings data
    Returns: 
        ratings: pd.DataFrame with the normalized values"""
    ratings["normalized_score"] = ratings["my_score"] / ratings["my_score"].max()

    return ratings


def normalize_user_completed(users: pd.DataFrame) -> pd.DataFrame:
    """Normalzies the number of anime completed by the user
    Args:
        user: pd.DataFrame with all user data
    Returns: 
        users: pd.DataFrame with the normalized values"""
    users["normalized_user_completed"] =  users["user_completed"] / users["user_completed"].max()

    return users

def normalize_user_gender(users: pd.DataFrame) -> pd.DataFrame:
    """Normalzies the user gender, using 1 for male and 0 for female
    Args:
        user: pd.DataFrame with all user data
    Returns: 
        users: pd.DataFrame with the normalized values"""
    users["normalized_gender"] =  users["gender"].apply(lambda x: 1 if "Male" else 0)

    return users

def remove_users_with_low_number_of_ratings(users: pd.DataFrame) -> pd.DataFrame:
    """Removes users with a low number of ratings from the dataframe
    Args:
        users: pd.DataFrame with all data
    Returns:
        users: pd.DataFrame cleanned"""
    std = users["user_completed"].std()
    median = users["user_completed"].median()
    upper_limit = median + std

    users = users.loc[
        (users["user_completed"] >= median) & 
        (users["user_completed"] <= upper_limit)
    ]

    return users


def remove_anime_with_few_ratings(animes: pd.DataFrame):
    """Removes animes with a low number of ratings from the dataframe
    Args:
        animes: pd.DataFrame with all data
    Returns:
        animes: pd.DataFrame cleanned"""
    median = animes["scored_by"].median()
    animes = animes.loc[animes["scored_by"]> median]

    return animes

def drop_unused_info(df: pd.DataFrame) -> pd.DataFrame:
    """Drop columns that wont be used for trainning the model
    Args:
        df: pd.DataFrame with the data
    Returns:
        df: pd.DataFrame without the unnecessary columns"""
    df = df.drop(columns=["username",
                          "my_score",
                          "user_completed",
                          "scored_by",
                          "genre_list",
                          "duration_min",
                          "episodes",
                          "genre",
                          "gender"])
    return df

## Step 2 - Defining a pipeline

Here i'll create a pipeline for loading, normalizing and joining the data by using the functions above

In [3]:
def normalize_user_data() -> pd.DataFrame:
    """Run the pipeline to normalize user data
    Returns:
        users: pd.DataFrame with normalized user data"""
    users = generate_key_user_atributes()
    users = remove_users_with_low_number_of_ratings(users)
    users = normalize_user_completed(users)
    users = normalize_user_gender(users)
    return users

def normalize_rating_data(users: pd.DataFrame) -> pd.DataFrame:
    """Run the pipeline to normalize rating data and joins the data with user data
    Args:
        users: pd.DataFrame with user data
    Returns:
        user_and_ratings: pd.DataFrame with normalized rating data"""
    ratings = generate_key_ratings_atributes()
    ratings = normalize_ratings(ratings)
    user_and_ratings = add_user_data_to_rating(ratings, users)
    return user_and_ratings

def normalize_anime_data() -> pd.DataFrame:
    """Run the pipeline to normalize anime data
    Returns:
        animes: pd.DataFrame with normalized anime data"""
    animes = generate_key_anime_atributes()
    animes = remove_anime_with_few_ratings(animes)
    animes = normalize_duration(animes)
    animes = normalize_episodes(animes)
    animes = normalize_anime_genres(animes)
    return animes



## Step 2.1 - runing the pipeline and storing the results
Run the cell below to execute the load, normalize and join pipeline.

The following cell drops the unused information from the dataframe and also adds a column that will be the result we will seek for on the model, if the user liked the anime or note, for default, the "liked" rating was set to 0.8, so only ratings with a value above or equal to that will be considered anime the user liked.

The last cell can be used to store the results in a .csv file, making it possible to only load the csv file instead of repeting the first steps.

In [4]:
user_data = normalize_user_data()
rating_data = normalize_rating_data(user_data)
anime_data = normalize_anime_data()

In [5]:
rating_data = drop_unused_info(rating_data.merge(anime_data, on="anime_id", how="inner"))
rating_data["high_rated"] = rating_data["normalized_score"].apply(lambda x: 1 if x >= 0.8 else 0)

In [None]:
rating_data.to_csv("./normalized_joined_data_xgb.csv", index=False)

# Step 3 - spliting Test and Train data

Bellow you can split our data into test and train.

- For our parameters we will have all data, except the normalized_score and the high_rated columns, the define if the user liked the show.
- For our results there is the high_rated boolean value

The Data Set will be split into 80% for training the model and 20% for testing the model.

The random_state value is set to make sure we can replicate the results everytime we train the model, note that another random_state value can possible show better accuracy.

In [11]:
X = rating_data.loc[:, ~rating_data.columns.isin(['high_rated', 'normalized_score'])]
y = rating_data["high_rated"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4 - Training the model

By using the previosly split data, we can train the model on the cell bellow.

The accuracy can be tested on the following one.

In [13]:
model = XGBClassifier()
model.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
y_pred = model.predict(X_test)
print(f"accuracy: {accuracy_score(y_test, y_pred)*100} %" )


accuracy: 67.11303222585273 %


# Optional Step, Save/Load the model

So you don't need to train it again

In [18]:
model.save_model("AInimeSuggestions_xgb.json")

In [None]:
model = XGBClassifier()
model.load_model("AInimeSuggestions_xgb.json")

# Step 5 - Predicting good shows for an user

I'll define a function the allows us to predict all shows an user did not watch yet to see if the user might like then.

In [15]:
def predict_user(user_id: int = int(12345678), user_completed_animes: int = 0, gender: str = "Male") -> pd.DataFrame:
      """Predicts if the user will like all anime the used had never rate and returns a dataframe with the names of the ones he might like
      Args:
            user_id: int (optional) value with the user id
            user_completed_animes: int (optional) number of anime seen by the user
            gender: str (optional) user gender, default value is Male """
      
      if user_id in user_data["user_id"]:
            this_user_data = user_data[user_data["user_id"]== user_id]
            user_ratings = rating_data[rating_data["user_id"] == user_id]
            user_animes = user_ratings["anime_id"].unique()
            non_watched = anime_data[~anime_data['anime_id'].isin(user_animes)].dropna()

            non_watched["user_id"] = user_id
            non_watched["normalized_user_completed"] = this_user_data["user_completed"] /user_data["user_completed"].max()
            non_watched["normalized_gender"] = int(this_user_data["gender"] == "Male")
      else:
            non_watched = anime_data

            non_watched["user_id"] = user_id
            non_watched["normalized_user_completed"] = user_completed_animes /user_data["user_completed"].max()
            non_watched["normalized_gender"] = int(gender == "Male")


      non_watched = non_watched.drop(columns=[
                          "scored_by",
                          "genre_list",
                          "duration_min",
                          "episodes",
                          "genre"])
      non_watched = non_watched.reindex(columns=X.columns)
      predict = model.predict(non_watched)

      predict_df = pd.DataFrame({
      "anime_id": non_watched["anime_id"].values,
      "score": predict
})
      anime = pd.read_csv(f"{datasets_folder_path}/{anime_file_name}")
      predict_df = predict_df.merge(anime[["anime_id","title"]], on="anime_id", how="left")
      predict_df = predict_df[predict_df["score"]==1]

      return predict_df


In [16]:
predict_user(120)

  non_watched["normalized_gender"] = int(this_user_data["gender"] == "Male")


Unnamed: 0,anime_id,score,title
3,12365.0,1,Bakuman. 3rd Season
6,2787.0,1,Shakugan no Shana II (Second)
7,4477.0,1,Nodame Cantabile: Paris-hen
8,4814.0,1,Junjou Romantica 2
9,7054.0,1,Kaichou wa Maid-sama!
...,...,...,...
1486,30503.0,1,Noragami Aragoto
1501,18897.0,1,Nisekoi
1507,31339.0,1,Drifters
1512,33352.0,1,Violet Evergarden


In [17]:
predict_user(
    user_completed_animes= 100,
    gender= "Female"    
)

Unnamed: 0,anime_id,score,title
4,12365.0,1,Bakuman. 3rd Season
11,7054.0,1,Kaichou wa Maid-sama!
12,11123.0,1,Sekaiichi Hatsukoi 2
17,1735.0,1,Naruto: Shippuuden
19,4224.0,1,Toradora!
...,...,...,...
3283,6547.0,1,Angel Beats!
3298,33674.0,1,No Game No Life: Zero
3308,1575.0,1,Code Geass: Hangyaku no Lelouch
3316,23441.0,1,Love Stage!!: Chotto Janakutte
