**Import required data processing libraries**

In [28]:
import pandas as pd
import numpy as np

**Read in the anime dataset from the data folder**

In [29]:
# Load the raw catalogue, discard every row that has a missing value, and
# remove the "members" column, which is not used further in this notebook.
anime_df = (
    pd.read_csv("./data/anime.csv")
    .dropna()
    .drop(columns=["members"])
)
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16


**Cleaning data**

**Creating a genre vector for each anime: find the unique genres, then encode each one as 1 if the anime belongs to that genre and 0 otherwise**

**Factorizing the anime types**

**Converting the rating and episodes columns to the appropriate NumPy data types, and giving a default value of 0 to unknown episode counts**

In [30]:
# Parse each row's comma-separated genre string into a set of stripped labels
# once, so that collecting the labels and building the indicator columns both
# work from the same parsed values.
genre_labels_per_row = [
    {label.strip() for label in genre_label.split(",")}
    for genre_label in anime_df["genre"]
]

# Every distinct genre label that appears anywhere in the dataset.
genre_set = set().union(*genre_labels_per_row)

# One 0/1 indicator column per genre. Membership is tested against the parsed
# labels rather than the raw string: a substring test on the raw string would
# also flag any genre whose name is contained in a longer label (e.g. a
# "Shounen" label would match inside "Shounen Ai").
# Column order follows the iteration order of genre_set.
for genre in genre_set:
    anime_df[genre] = [int(genre in row_labels) for row_labels in genre_labels_per_row]

# Factorize the anime type: each distinct type value gets an integer code,
# numbered in the order returned by unique().
unique_types = anime_df["type"].unique()
TYPE_MAP = {unique_type: index for index, unique_type in enumerate(unique_types)}

# Every value of "type" is a key of TYPE_MAP by construction, so map() never
# produces a missing code.
anime_df["num_type"] = anime_df["type"].map(TYPE_MAP)

anime_df["rating"] = anime_df["rating"].astype(np.float64)

# Episode counts equal to the placeholder "Unknown" default to 0 before the
# column is converted to integers.
anime_df["episodes"] = (
    anime_df["episodes"]
    .mask(anime_df["episodes"] == "Unknown", 0)
    .astype(np.int64)
)

anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,Dementia,Samurai,Yuri,Magic,...,Slice of Life,Mecha,Harem,Horror,Cars,Shounen Ai,Psychological,Comedy,Yaoi,num_type
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
12290,5543,Under World,Hentai,OVA,1,4.28,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


**Import Sklearn classes and functions**

In [31]:
from sklearn.neighbors import NearestNeighbors 
import joblib

**Create the feature dataframe**

In [32]:
# Feature frame: everything in anime_df except the identifier column and the
# raw text columns listed here.
X = anime_df.drop(columns=["anime_id", "name", "genre", "type"])

**Make the Nearest Neighbors models, one for each value of n_neighbors from 1 to 10, using the ball tree algorithm to optimize distance calculations, given the high feature dimensionality**

In [33]:
# Fit one ball-tree NearestNeighbors model per neighbour count from 1 to 10;
# models[k - 1] is the model configured with n_neighbors=k.
# fit() returns the fitted estimator itself, so it can be collected directly.
models = [
    NearestNeighbors(n_neighbors=neighbor_count, algorithm="ball_tree").fit(X)
    for neighbor_count in range(1, 11)
]


**Save the cleaned anime dataset and models to disk**

In [34]:
import json
import os

# Make sure the output folders exist before writing; without this the dumps
# below raise FileNotFoundError when the notebook runs on a fresh checkout.
os.makedirs("./models", exist_ok=True)
os.makedirs("./data", exist_ok=True)

# Persist each fitted model as neighbors<k>.pkl, numbered from 1 in list order.
for neighbor_count, model in enumerate(models, start=1):
    joblib.dump(model, f'./models/neighbors{neighbor_count}.pkl')

anime_df.to_csv("./data/cleaned_anime.csv",index=False)

# save the genres and unique anime types to a local json file, for usage in other backend endpoints
with open("./data/misc.json","w") as file:
    all_types = list(TYPE_MAP.keys())
    all_genres = list(genre_set)

    json.dump({"genres" : all_genres, "types" : all_types}, file)

