In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ast
import pickle

In [2]:
# Function to print version of each required package
def print_package_versions():
    print("Package Versions:")
    print(f"Pandas: {pd.__version__}")
    print(f"Numpy: {np.__version__}")
    print(f"Scikit-Learn: {sklearn.__version__}")
    print("AST: Built-in")
    print("Pickle: Built-in")

print_package_versions()

Package Versions:
Pandas: 2.2.3
Numpy: 2.1.3
Scikit-Learn: 1.5.2
AST: Built-in
Pickle: Built-in


In [3]:
# Function to load datasets
def load_data():
    anime_data = pd.read_csv(r"../data/anime-dataset-2023.csv", low_memory=False,usecols=['Name', 'Genres','Studios','Synopsis','Rating','Image URL'])
    return anime_data
# Load the data and inspect
raw_data = load_data()
raw_data.head()  # Display the first few rows to check

Unnamed: 0,Name,Genres,Synopsis,Studios,Rating,Image URL
0,Cowboy Bebop,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",Sunrise,R - 17+ (violence & profanity),https://cdn.myanimelist.net/images/anime/4/196...
1,Cowboy Bebop: Tengoku no Tobira,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Bones,R - 17+ (violence & profanity),https://cdn.myanimelist.net/images/anime/1439/...
2,Trigun,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",Madhouse,PG-13 - Teens 13 or older,https://cdn.myanimelist.net/images/anime/7/203...
3,Witch Hunter Robin,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,Sunrise,PG-13 - Teens 13 or older,https://cdn.myanimelist.net/images/anime/10/19...
4,Bouken Ou Beet,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,Toei Animation,PG - Children,https://cdn.myanimelist.net/images/anime/7/215...


In [4]:
# Split  columns text into lists for merging
raw_data["GenresList"] = raw_data["Genres"].apply(lambda x: x.split() if isinstance(x, str) else [])
raw_data["Synopsis"] = raw_data["Synopsis"].apply(lambda x: x.split() if isinstance(x, str) else [])
raw_data["Studios"] = raw_data["Studios"].apply(lambda x: x.split() if isinstance(x, str) else [])
raw_data[["GenresList","Synopsis","Studios"]].head() # Check the result

Unnamed: 0,GenresList,Synopsis,Studios
0,"[Action,, Award, Winning,, Sci-Fi]","[Crime, is, timeless., By, the, year, 2071,, h...",[Sunrise]
1,"[Action,, Sci-Fi]","[Another, day,, another, bounty—such, is, the,...",[Bones]
2,"[Action,, Adventure,, Sci-Fi]","[Vash, the, Stampede, is, the, man, with, a, $...",[Madhouse]
3,"[Action,, Drama,, Mystery,, Supernatural]","[Robin, Sena, is, a, powerful, craft, user, dr...",[Sunrise]
4,"[Adventure,, Fantasy,, Supernatural]","[It, is, the, dark, century, and, the, people,...","[Toei, Animation]"


In [5]:
# Combine relevant columns into a single 'tags' column
raw_data['tags'] = raw_data['GenresList'] + raw_data['Synopsis'] + raw_data['Studios']
raw_data[["Name","tags"]].head()  # Check the result

Unnamed: 0,Name,tags
0,Cowboy Bebop,"[Action,, Award, Winning,, Sci-Fi, Crime, is, ..."
1,Cowboy Bebop: Tengoku no Tobira,"[Action,, Sci-Fi, Another, day,, another, boun..."
2,Trigun,"[Action,, Adventure,, Sci-Fi, Vash, the, Stamp..."
3,Witch Hunter Robin,"[Action,, Drama,, Mystery,, Supernatural, Robi..."
4,Bouken Ou Beet,"[Adventure,, Fantasy,, Supernatural, It, is, t..."


In [6]:
# Select relevant columns and prepare final dataset
final_data = raw_data[['Name', 'Genres', 'Rating', 'Image URL', 'tags']]
final_data['tags'] = final_data['tags'].apply(lambda x: " ".join(x))  # Convert list to single string
final_data['tags'] = final_data['tags'].apply(lambda x: x.lower()) # Lower the string
final_data.head()  # Check the final dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['tags'] = final_data['tags'].apply(lambda x: " ".join(x))  # Convert list to single string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['tags'] = final_data['tags'].apply(lambda x: x.lower()) # Lower the string


Unnamed: 0,Name,Genres,Rating,Image URL,tags
0,Cowboy Bebop,"Action, Award Winning, Sci-Fi",R - 17+ (violence & profanity),https://cdn.myanimelist.net/images/anime/4/196...,"action, award winning, sci-fi crime is timeles..."
1,Cowboy Bebop: Tengoku no Tobira,"Action, Sci-Fi",R - 17+ (violence & profanity),https://cdn.myanimelist.net/images/anime/1439/...,"action, sci-fi another day, another bounty—suc..."
2,Trigun,"Action, Adventure, Sci-Fi",PG-13 - Teens 13 or older,https://cdn.myanimelist.net/images/anime/7/203...,"action, adventure, sci-fi vash the stampede is..."
3,Witch Hunter Robin,"Action, Drama, Mystery, Supernatural",PG-13 - Teens 13 or older,https://cdn.myanimelist.net/images/anime/10/19...,"action, drama, mystery, supernatural robin sen..."
4,Bouken Ou Beet,"Adventure, Fantasy, Supernatural",PG - Children,https://cdn.myanimelist.net/images/anime/7/215...,"adventure, fantasy, supernatural it is the dar..."


In [7]:
# Initialize CountVectorizer and compute similarity matrix
vectorizer = CountVectorizer(max_features=6000, token_pattern=r'(?u)\b[a-zA-Z]+\b', stop_words="english")
anime_vectors = vectorizer.fit_transform(final_data['tags']).toarray()
similarity_matrix = cosine_similarity(anime_vectors)

# Check the similarity matrix shape and a sample
similarity_matrix.shape, similarity_matrix[:5, :5]  # Shape and a small sample

((24905, 24905),
 array([[1.        , 0.19762125, 0.18264184, 0.13920485, 0.06910737],
        [0.19762125, 1.        , 0.14287172, 0.05444655, 0.02316827],
        [0.18264184, 0.14287172, 1.        , 0.07156563, 0.11419812],
        [0.13920485, 0.05444655, 0.07156563, 1.        , 0.08703883],
        [0.06910737, 0.02316827, 0.11419812, 0.08703883, 1.        ]]))

In [8]:
# Define paths for saving in the 'model' folder
final_data_path = "../models/anime_list.pkl"
similarity_matrix_path = "../models/anime_similarity.pkl"

# Save processed data and similarity matrix to the specified paths
pickle.dump(final_data, open(final_data_path, 'wb'))
pickle.dump(similarity_matrix, open(similarity_matrix_path, 'wb'))

# Confirm data saved
print("Data saved in the model folder as anime_list.pkl and anime_similarity.pkl")

Data saved in the model folder as anime_list.pkl and anime_similarity.pkl
