In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

df=pd.read_csv("anime.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
df.shape

(12294, 7)

In [7]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [29]:
# Step 1: Data Preprocessing
# Drop rows with missing essential data
df.dropna(subset=["name","genre","rating"],inplace=True)

In [30]:
# Fill missing episodes with median
df["episodes"]=pd.to_numeric(df["episodes"],errors='coerce')
df["episodes"].fillna(df["episodes"].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["episodes"].fillna(df["episodes"].median(),inplace=True)


In [31]:
# Clean genres
df["genre"]=df["genre"].str.lower().str.replace(" ", "")

In [32]:
# Step 2: Feature Extraction
# Using TF-IDF on genre
tfidf=TfidfVectorizer(token_pattern=r"[^,]+")
tfidf_matrix=tfidf.fit_transform(df["genre"])

In [33]:
# Add rating and episodes as additional features
numeric_features=df[["rating", "episodes"]]
numeric_features=(numeric_features-numeric_features.mean())/numeric_features.std()

In [34]:
# Combine genre TF-IDF with numeric features
from scipy.sparse import hstack
features_combined=hstack([tfidf_matrix,numeric_features])

In [35]:
# Step 3: Recommendation System
# Cosine Similarity Matrix
cos_sim=cosine_similarity(features_combined,features_combined)

In [15]:
# Function to recommend anime
def recommend_anime(title, top_n=5, threshold=0.3):
    if title not in df["name"].values:
        return "Anime not found."

    idx = df[df["name"] == title].index[0]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    recommended = []
    for i, score in sim_scores[1:]:  # skip itself
        if score >= threshold:
            recommended.append((df.iloc[i]["name"], score))
        if len(recommended) >= top_n:
            break
    return recommended

In [16]:
# Try a recommendation
print("Recommended Anime:")
print(recommend_anime("Naruto"))

Recommended Anime:
[('Dragon Ball Z', 0.9951780748155952), ('Bleach', 0.9832627363578021), ('Katekyo Hitman Reborn!', 0.9806521879098526), ('Keroro Gunsou', 0.9745870890831057), ('Dragon Ball', 0.9666902215217162)]


In [36]:
# Step 4: Evaluation
# Binary relevance matrix for genre prediction 
X=features_combined
y=df.copy()

In [37]:
# Split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [38]:
# Recompute cosine similarity only for test vs train
cos_sim_eval=cosine_similarity(X_test,X_train)

In [39]:
# Genre labels
genre_cols=list(all_genres)
y_train_genres=y_train[genre_cols].values
y_test_genres=y_test[genre_cols].values

In [40]:
# Predict genre of test item based on most similar item in train
pred_labels=[]

for i in range(X_test.shape[0]):
    sim_scores=cos_sim_eval[i]
    top_idx=np.argmax(sim_scores)
    pred_labels.append(y_train_genres[top_idx])

In [41]:
# Convert to numpy array
pred_labels=np.array(pred_labels)

In [42]:
precision=precision_score(y_test_genres,pred_labels,average="micro")
recall=recall_score(y_test_genres,pred_labels,average="micro")
f1=f1_score(y_test_genres,pred_labels,average="micro")

In [43]:
print("\nEvaluation Metrics:")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")


Evaluation Metrics:
Precision: 0.939
Recall: 0.897
F1 Score: 0.918


In [1]:
#1. What’s the difference between user-based and item-based collaborative filtering?

#User-based finds users similar to you and recommends what they liked.
#Item-based finds items similar to what you liked and recommends those.
#Item-based is usually more stable and scalable.

In [2]:
#2. What is collaborative filtering, and how does it work?

#Collaborative filtering recommends items by finding patterns in user behavior (like ratings or purchases).
#It uses either similar users (user-based) or similar items (item-based) to make predictions—without needing item details.