# Anime Recommendation System using Cosine Similarity

Colab / Jupyter Notebook Ready

In [1]:

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from scipy.sparse import hstack


## Load Dataset

In [2]:

df = pd.read_csv('anime.csv')
df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Dataset Exploration

In [3]:

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:

df.describe(include='all')


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
count,12294.0,12294,12232,12269,12294.0,12064.0,12294.0
unique,,12292,3264,6,187.0,,
top,,Saru Kani Gassen,Hentai,TV,1.0,,
freq,,2,823,3787,5677.0,,
mean,14058.221653,,,,,6.473902,18071.34
std,11455.294701,,,,,1.026746,54820.68
min,1.0,,,,,1.67,5.0
25%,3484.25,,,,,5.88,225.0
50%,10260.5,,,,,6.57,1550.0
75%,24794.5,,,,,7.18,9437.0


## Data Preprocessing

In [5]:

df['genre'] = df['genre'].fillna('Unknown')
df['rating'] = df['rating'].fillna(df['rating'].mean())

df['episodes'] = df['episodes'].replace('Unknown', np.nan)
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

df['members'] = df['members'].fillna(df['members'].median())

df.isnull().sum()


Unnamed: 0,0
anime_id,0
name,0
genre,0
type,25
episodes,0
rating,0
members,0


## Feature Extraction

In [6]:

tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df['genre'])


In [7]:

scaler = MinMaxScaler()
numeric_features = df[['rating', 'episodes', 'members']]
numeric_scaled = scaler.fit_transform(numeric_features)


In [8]:

final_features = hstack([genre_matrix, numeric_scaled])


## Cosine Similarity

In [9]:

cosine_sim = cosine_similarity(final_features)


## Recommendation Function

In [10]:

def recommend_anime(anime_title, top_n=10, similarity_threshold=0.3):
    if anime_title not in df['name'].values:
        return "Anime not found in dataset."

    idx = df[df['name'] == anime_title].index[0]
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    recommendations = []
    for i, score in similarity_scores[1:]:
        if score >= similarity_threshold:
            recommendations.append((df.iloc[i]['name'], score))
        if len(recommendations) == top_n:
            break

    return pd.DataFrame(recommendations, columns=['Recommended Anime', 'Similarity Score'])


## Example Recommendation

In [11]:

recommend_anime('Naruto', top_n=5, similarity_threshold=0.4)


Unnamed: 0,Recommended Anime,Similarity Score
0,Naruto: Shippuuden,0.991495
1,Dragon Ball Z,0.94279
2,Dragon Ball,0.915894
3,Naruto: Shippuuden Movie 4 - The Lost Tower,0.905891
4,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,0.905552


## Evaluation

In [12]:

def evaluate_model(anime_title, k=5):
    recommendations = recommend_anime(anime_title, top_n=k)
    if isinstance(recommendations, str):
        return None

    target_genres = set(df[df['name'] == anime_title]['genre'].values[0].split(', '))
    relevant = 0

    for anime in recommendations['Recommended Anime']:
        rec_genres = set(df[df['name'] == anime]['genre'].values[0].split(', '))
        if target_genres.intersection(rec_genres):
            relevant += 1

    precision = relevant / k
    recall = relevant / len(target_genres)
    f1 = 2 * precision * recall / (precision + recall + 1e-6)

    return precision, recall, f1


In [13]:

precision, recall, f1 = evaluate_model('Naruto', k=5)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")


Precision: 1.00
Recall: 1.00
F1-score: 1.00
