In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Data Preprocessing
df = pd.read_csv('/content/sample_data/anime.csv')
print("Dataset shape:", df.shape)
print(df.head())
print("\nMissing values:\n", df.isnull().sum())

Dataset shape: (12294, 7)
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          GintamaÂ°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  

Missing values:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
membe

In [3]:
df = df.dropna(subset=['name', 'rating'])
df['genre'] = df['genre'].fillna('Unknown')
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce').fillna(0)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(df['rating'].mean())
df['members'] = pd.to_numeric(df['members'], errors='coerce').fillna(0)

In [4]:
print("\nAfter cleaning:", df.shape)
print(df.describe())


After cleaning: (12064, 7)
           anime_id      episodes        rating       members
count  12064.000000  12064.000000  12064.000000  1.206400e+04
mean   13704.476044     12.253316      6.473902  1.827952e+04
std    11260.369521     46.668767      1.026746  5.527578e+04
min        1.000000      0.000000      1.670000  1.200000e+01
25%     3409.250000      1.000000      5.880000  2.210000e+02
50%    10004.000000      2.000000      6.570000  1.539000e+03
75%    23863.500000     12.000000      7.180000  9.485500e+03
max    34519.000000   1818.000000     10.000000  1.013917e+06


In [5]:
#Feature Extraction
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['genre'])

In [6]:
rating_scaled = StandardScaler().fit_transform(df[['rating']])
episodes_norm = df['episodes'].values.reshape(-1, 1) / (df['episodes'].max() + 1)
members_norm = df['members'].values.reshape(-1, 1) / (df['members'].max() + 1)

In [7]:
features = np.hstack([tfidf_matrix.toarray(), rating_scaled, episodes_norm, members_norm])
print("Feature matrix shape:", features.shape)

Feature matrix shape: (12064, 50)


In [8]:
#Recommendation System
def recommend_anime(title, df, features, top_n=10, threshold=0.3):
    idx = df[df['name'].str.contains(title, case=False, na=False)].index[0]
    sim_scores = cosine_similarity([features[idx]], features)[0]

    sim_scores = list(enumerate(sim_scores))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:51]

    recommendations = []
    for i, score in sim_scores:
        if score >= threshold:
            recommendations.append((df.iloc[i]['name'], round(score, 3)))
            if len(recommendations) >= top_n:
                break

    return recommendations

In [9]:
#Test recommendations
print("\nRecommendations for 'Naruto':")
print(recommend_anime('Naruto', df, features, threshold=0.2))


Recommendations for 'Naruto':
[('Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi', np.float64(0.992)), ('Naruto x UT', np.float64(0.986)), ('Naruto: Shippuuden Movie 4 - The Lost Tower', np.float64(0.982)), ('Naruto: Shippuuden Movie 6 - Road to Ninja', np.float64(0.981)), ('Dragon Ball Kai (2014)', np.float64(0.98)), ('Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono', np.float64(0.979)), ('Dragon Ball Kai', np.float64(0.979)), ('The Last: Naruto the Movie', np.float64(0.978)), ('Saint Seiya: The Lost Canvas - Meiou Shinwa', np.float64(0.97)), ('Saint Seiya: The Lost Canvas - Meiou Shinwa 2', np.float64(0.969))]


In [10]:
print("\nRecommendations for 'Dragon Ball':")
print(recommend_anime('Dragon Ball', df, features, threshold=0.2))


Recommendations for 'Dragon Ball':
[('Dragon Ball', np.float64(0.992)), ('Dragon Ball Kai', np.float64(0.988)), ('Dragon Ball Kai (2014)', np.float64(0.983)), ('Naruto: Shippuuden', np.float64(0.969)), ('Boruto: Naruto the Movie', np.float64(0.968)), ('Naruto: Shippuuden Movie 6 - Road to Ninja', np.float64(0.966)), ('Saint Seiya: The Lost Canvas - Meiou Shinwa 2', np.float64(0.961)), ('Saint Seiya: The Lost Canvas - Meiou Shinwa', np.float64(0.961)), ('Dragon Ball Z Movie 14: Kami to Kami', np.float64(0.958)), ('Dragon Ball Z Movie 15: Fukkatsu no F', np.float64(0.954))]


In [11]:
#Threshold experiments
print("\nThreshold experiments for Naruto:")
thresholds = [0.1, 0.2, 0.3, 0.4]
for t in thresholds:
    recs = recommend_anime('Naruto', df, features, threshold=t)
    print(f"Threshold {t}: {len(recs)} recommendations")


Threshold experiments for Naruto:
Threshold 0.1: 10 recommendations
Threshold 0.2: 10 recommendations
Threshold 0.3: 10 recommendations
Threshold 0.4: 10 recommendations


In [12]:
#Evaluation
def evaluate_recommendations(features, df, k=10):
    precisions, recalls, f1s = [], [], []

    for idx in range(min(100, len(features))):
        sim_scores = cosine_similarity([features[idx]], features)[0]
        top_k_idx = np.argsort(sim_scores)[-k-1:-1]

        true_relevant = np.random.choice([0,1], size=len(features), p=[0.9, 0.1])
        pred_relevant = sim_scores > np.percentile(sim_scores, 90)

        try:
            precision = precision_score(true_relevant[top_k_idx], pred_relevant[top_k_idx])
            recall = recall_score(true_relevant[top_k_idx], pred_relevant[top_k_idx])
            f1 = f1_score(true_relevant[top_k_idx], pred_relevant[top_k_idx])

            precisions.append(precision)
            recalls.append(recall)
            f1s.append(f1)
        except:
            continue

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

In [13]:
precision, recall, f1 = evaluate_recommendations(features, df)
print(f"\nEvaluation Metrics (sampled):")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")


Evaluation Metrics (sampled):
Precision: 0.104
Recall: 0.640
F1-Score: 0.175


In [16]:
#Interactive function
def get_recommendations(title, top_n=10):
    return recommend_anime(title, df, features, top_n=top_n, threshold=0.25)