In [99]:
import pandas as pd
import numpy as np

In [145]:
df = pd.read_csv('D:\\ExcelR Assignments\\Assignment 11\\anime.csv')
df.shape

(12294, 7)

In [147]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [149]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [107]:
# There are some null values. I am filling those null values witg h zeroes.

In [153]:
# Handle missing values
df['genre'] = df['genre'].fillna("Unknown")
most_common_type = df['type'].mode()[0]
df['type'] = df['type'].fillna(most_common_type)
median_rating = df['rating'].median()
df['rating'] = df['rating'].fillna(median_rating)
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
median_episodes_per_type = df.groupby('type')['episodes'].transform('median')
df['episodes'] = df['episodes'].fillna(median_episodes_per_type)

In [155]:
# Multi-hot encode genres

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre'].str.split(", "))
genre_encoded_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

In [159]:
# Normalize numerical columns

scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(df[['rating', 'members', 'episodes']])
normalized_features_df = pd.DataFrame(normalized_features, columns=['rating', 'members', 'episodes'])

In [165]:
# Combine all features for similarity

features_df = pd.concat([genre_encoded_df, normalized_features_df], axis=1)
features_df.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri,rating,members,episodes
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0.92437,0.197872,0.0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0.911164,0.78277,0.034673
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.909964,0.112689,0.027518
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0.90036,0.664325,0.012658
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.89916,0.149186,0.027518


In [171]:
# Function to recommend anime
from sklearn.metrics.pairwise import cosine_similarity

def recommend_anime(target_anime_name, anime_data, features_df, top_n=5):
    # Find the index of the target anime
    target_index = anime_data[anime_data['name'] == target_anime_name].index[0]
     
    # Compute cosine similarity
    similarity = cosine_similarity(features_df)
    
    # Get similarity scores for the target anime
    similarity_scores = list(enumerate(similarity[target_index]))
    
    # Sort by similarity score in descending order
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    #Recommend top_n anime
    top_recommendations = sorted_scores[1:top_n+1]
    recommended_anime = [anime_data.iloc[i[0]]['name'] for i in top_recommendations]
    
    return recommended_anime

# Example: Recommend similar anime for "Kimi no Na wa."
recommendations = recommend_anime("Kimi no Na wa.", df, features_df, top_n=5)
print("Recommended Anime:", recommendations)

Recommended Anime: ['Wind: A Breath of Heart OVA', 'Wind: A Breath of Heart (TV)', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen', 'Kokoro ga Sakebitagatterunda.']


In [181]:
from sklearn.model_selection import train_test_split
import numpy as np

# Split data into train and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Recompute features for train data
train_genre_encoded = mlb.fit_transform(train_data['genre'].str.split(", "))
train_genre_encoded_df = pd.DataFrame(train_genre_encoded, columns=mlb.classes_)
train_normalized_features = scaler.fit_transform(train_data[['rating', 'members', 'episodes']])
train_normalized_features_df = pd.DataFrame(train_normalized_features, columns=['rating', 'members', 'episodes'])
train_features_df = pd.concat([train_genre_encoded_df, train_normalized_features_df], axis=1)

In [183]:
# Initialize lists to store precision and recall values
precision_list = []
recall_list = []

In [185]:
# Evaluate the recommendation system
for test_anime in test_data['name'].head(10):  # Limit to a subset for faster evaluation
    try:
        recommended = recommend_anime(test_anime, train_data, train_features_df, top_n=5)
        actual_genres = set(test_data[test_data['name'] == test_anime]['genre'].iloc[0].split(", "))
        
        # Get genres for recommended anime
        recommended_genres = set(
            anime_data[df['name'].isin(recommended)]['genre']
            .str.split(", ")
            .explode()
        )
        
        # Calculate the intersection of genres
        intersection = len(actual_genres & recommended_genres)
        
        # Avoid division by zero for precision and recall
        precision = intersection / len(recommended_genres) if len(recommended_genres) > 0 else 0
        recall = intersection / len(actual_genres) if len(actual_genres) > 0 else 0
        
        precision_list.append(precision)
        recall_list.append(recall)
    except IndexError:
        continue

In [187]:
# Calculate average metrics
avg_precision = np.mean(precision_list) if precision_list else 0
avg_recall = np.mean(recall_list) if recall_list else 0
f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0

# Print results
print(f"Precision: {avg_precision:.2f}, Recall: {avg_recall:.2f}, F1 Score: {f1_score:.2f}")

Precision: 0.00, Recall: 0.00, F1 Score: 0.00


In [1]:
'''
Observations:

Precision:
   A low precision value suggests the system is recommending many irrelevant anime. This could be due to:
   Overweighting of less relevant features (e.g., members, episodes).
   Insufficient granularity in genre matching (e.g., multiple genres sharing partial overlap).

Recall:
   A low recall value indicates the system is failing to recommend a significant number of relevant anime.
Causes:
   Highly specific genre combinations (e.g., action + romance + supernatural) are not matched well.
   Popular anime dominating recommendations, reducing diversity.

F1-Score:
   A low F1-score reflects the combined effect of poor precision and recall.
   Indicates the need to improve both relevance and completeness of recommendations.
'''

'\nObservations:\n\nPrecision:\n   A low precision value suggests the system is recommending many irrelevant anime. This could be due to:\n   Overweighting of less relevant features (e.g., members, episodes).\n   Insufficient granularity in genre matching (e.g., multiple genres sharing partial overlap).\n\nRecall:\n   A low recall value indicates the system is failing to recommend a significant number of relevant anime.\nCauses:\n   Highly specific genre combinations (e.g., action + romance + supernatural) are not matched well.\n   Popular anime dominating recommendations, reducing diversity.\n\nF1-Score:\n   A low F1-score reflects the combined effect of poor precision and recall.\n   Indicates the need to improve both relevance and completeness of recommendations.\n'

Interview Questions:

In [None]:
'''
1. Can you explain the difference between user-based and item-based collaborative filtering?
User-based Collaborative Filtering: This method finds users with similar preferences (user-user similarity) based on 
    their ratings or interactions. It recommends items that similar users have liked. For example, if User A and User B have similar tastes,
    User A will be recommended items liked by User B.
Item-based Collaborative Filtering: This approach identifies items that are similar (item-item similarity) based on
    how users interact with them. It recommends items similar to those the user has previously liked. For instance, 
    if two movies are often liked together, recommending one suggests the other.

2. What is collaborative filtering, and how does it work?
Collaborative filtering is a recommendation system technique that suggests items based on user interactions or preferences. 
It operates by leveraging the collective behavior of a group of users to make recommendations.
How it Works:
      Collaborative filtering identifies patterns in user-item interactions, such as ratings, clicks, or purchases.
User-based: Finds users with similar preferences and recommends items they liked.
Item-based: Identifies similar items based on user interactions and suggests items similar to those the user engaged with.
It assumes that users with similar past behaviors will have similar future preferences. No prior item information is required, 
making it domain-independent.
'''