In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

warnings.filterwarnings("ignore")  # Ignore warnings for clean output

# Load the dataset
file_path = "/content/anime.csv"  # Update with your dataset path in Google Colab
anime_df = pd.read_csv(file_path)

# Display dataset info
print("✅ Dataset Loaded Successfully!\n")
print("🔹 Dataset Information:")
print(anime_df.info(), "\n")
print("🔹 First 5 Rows:")
print(anime_df.head(), "\n")

# Handle missing values (replace missing genres with empty string)
anime_df['genre'] = anime_df['genre'].fillna('')

# Drop rows with missing 'name' or 'rating' values
anime_df.dropna(subset=['name', 'rating'], inplace=True)

# Convert 'genre' column into a numerical format using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(anime_df['genre'])

print("✅ Genre Text Converted into Numerical Format using TF-IDF!")

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(genre_matrix)

print("✅ Cosine Similarity Matrix Computed!")

# Function to recommend anime
def recommend_anime(anime_title, top_n=5):
    if anime_title not in anime_df['name'].values:
        return f"❌ Anime '{anime_title}' not found in dataset."

    # Get index of the anime
    idx = anime_df[anime_df['name'] == anime_title].index[0]

    # Get similarity scores and sort them
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top-N recommendations
    recommended_indices = [i[0] for i in sim_scores[1:top_n+1]]

    # Display recommendations in a clear format
    recommendations = anime_df.iloc[recommended_indices][['name', 'genre', 'rating']]

    print(f"\n🎬 **Top {top_n} Recommended Anime for '{anime_title}':**\n")
    for i, row in recommendations.iterrows():
        print(f"🔹 **{row['name']}**")
        print(f"   - Genre: {row['genre']}")
        print(f"   - Rating: ⭐ {row['rating']}\n")

    return recommendations

# Try recommending an anime
anime_name = "Boruto: Naruto the Movie"  # Change this to any anime name in dataset
recommend_anime(anime_name, top_n=5)




✅ Dataset Loaded Successfully!

🔹 Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None 

🔹 First 5 Rows:
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, R

Unnamed: 0,name,genre,rating
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94
841,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",7.81
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.68
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",7.58
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",7.53


In [None]:
interview_questions_answers = {
    "1. Can you explain the difference between user-based and item-based collaborative filtering?":
    "User-based collaborative filtering recommends items by finding users with similar tastes and suggesting what they liked. "
    "Item-based collaborative filtering, on the other hand, focuses on finding similarities between items and recommends based on similar item interactions.",

    "2. What is collaborative filtering, and how does it work?":
    "Collaborative filtering is a recommendation technique that predicts user preferences based on past interactions and similar users/items. "
    "It works by identifying patterns in user-item relationships and suggesting new items accordingly."
}

for question, answer in interview_questions_answers.items():
    print(question)
    print(answer, "\n")





1. Can you explain the difference between user-based and item-based collaborative filtering?
User-based collaborative filtering recommends items by finding users with similar tastes and suggesting what they liked. Item-based collaborative filtering, on the other hand, focuses on finding similarities between items and recommends based on similar item interactions. 

2. What is collaborative filtering, and how does it work?
Collaborative filtering is a recommendation technique that predicts user preferences based on past interactions and similar users/items. It works by identifying patterns in user-item relationships and suggesting new items accordingly. 



In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

anime = pd.read_csv('anime.csv')
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [8]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [9]:
anime.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [10]:
# Handle missing values (e.g., imputation)

anime['genre'].fillna(anime['genre'].mode()[0], inplace=True)
anime['type'].fillna(anime['type'].mode()[0], inplace=True)
anime['rating'] = anime['rating'].fillna(anime['rating'].median())

In [11]:
# After imputating the missing values
anime.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [12]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12294.0,12294.0
mean,14058.221653,6.4757,18071.34
std,11455.294701,1.017179,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.9,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.17,9437.0
max,34527.0,10.0,1013917.0


In [13]:
# Get the unique values in categorical columns
print("\nUnique values in 'type' and 'genre' columns:")
print(f"Unique 'type' values: {anime['type'].unique()}")
print(f"Unique 'genre' values: {anime['genre'].unique()[:10]}")


Unique values in 'type' and 'genre' columns:
Unique 'type' values: ['Movie' 'TV' 'OVA' 'Special' 'Music' 'ONA']
Unique 'genre' values: ['Drama, Romance, School, Supernatural'
 'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen'
 'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen'
 'Sci-Fi, Thriller' 'Comedy, Drama, School, Shounen, Sports'
 'Action, Adventure, Shounen, Super Power'
 'Drama, Military, Sci-Fi, Space'
 'Drama, Fantasy, Romance, Slice of Life, Supernatural'
 'Drama, School, Shounen'
 'Action, Drama, Mecha, Military, Sci-Fi, Super Power']


In [14]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
#  Genre Feature Extraction
# Split the genre column into a list of genres for each anime
anime['genre'] = anime['genre'].fillna('Unknown').apply(lambda x: x if isinstance(x, list) else x.split(', ') if isinstance(x, str) else [])

# Use MultiLabelBinarizer to convert genres into a binary matrix
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime['genre'])

# Convert the binary matrix back into a DataFrame for easy merging
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)

#Rating Feature
# Rating is already numeric; we'll use it directly

#Type Feature
# Convert the 'type' column to categorical numerical values (e.g., TV, Movie, OVA, etc.)
anime['type'] = anime['type'].fillna('Unknown')
anime['type'] = pd.Categorical(anime['type'])
anime['type_code'] = anime['type'].cat.codes

# Combine the extracted features into a single DataFrame
# We keep the 'rating' and 'type_code', and concatenate the genre features
anime_features = pd.concat([anime[['rating', 'type_code']], genre_df], axis=1)

# Now the anime_features DataFrame contains all the extracted features
anime_features.head()

Unnamed: 0,rating,type_code,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,9.37,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,9.26,5,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,9.25,5,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.17,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9.16,5,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Type Feature - Label Encoding
# Fill missing values in 'type' column and convert it to categorical codes
anime['type'] = anime['type'].cat.add_categories('Unknown')

# Apply LabelEncoder to convert 'type' into numerical labels
le = LabelEncoder()
anime['type_encoded'] = le.fit_transform(anime['type'])

# 3. Combine the transformed features back with the original dataset
# Drop original 'genre' and 'type' columns, and concatenate new features
anime_encoded = pd.concat([anime.drop(['genre', 'type'], axis=1), genre_df, anime['type_encoded']], axis=1)

# Show the first few rows of the encoded dataset
print(anime_encoded.head())

   anime_id                              name episodes  rating  members  \
0     32281                    Kimi no Na wa.        1    9.37   200630   
1      5114  Fullmetal Alchemist: Brotherhood       64    9.26   793665   
2     28977                          Gintama°       51    9.25   114262   
3      9253                       Steins;Gate       24    9.17   673572   
4      9969                     Gintama&#039;       51    9.16   151266   

   type_code  type_encoded  Action  Adventure  Cars  ...  Slice of Life  \
0          0             0       0          0     0  ...              0   
1          5             5       1          1     0  ...              0   
2          5             5       1          0     0  ...              0   
3          5             5       0          0     0  ...              0   
4          5             5       1          0     0  ...              0   

   Space  Sports  Super Power  Supernatural  Thriller  Vampire  Yaoi  Yuri  \
0      0       0    

In [16]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# List of numerical features to be normalized
numerical_features = ['rating', 'members', 'type_encoded']

#  Min-Max Scaling
min_max_scaler = MinMaxScaler()
anime[numerical_features] = min_max_scaler.fit_transform(anime[numerical_features])

# Show the first few rows of the normalized dataset
print(anime.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  \
0             [Drama, Romance, School, Supernatural]  Movie        1   
1  [Action, Adventure, Drama, Fantasy, Magic, Mil...     TV       64   
2  [Action, Comedy, Historical, Parody, Samurai, ...     TV       51   
3                                 [Sci-Fi, Thriller]     TV       24   
4  [Action, Comedy, Historical, Parody, Samurai, ...     TV       51   

     rating   members  type_code  type_encoded  
0  0.924370  0.197872          0           0.0  
1  0.911164  0.782770          5           1.0  
2  0.909964  0.112689          5           1.0  
3  0.900360  0.664325          5           1.0  
4  0.899160  0.149186      

In [17]:
#  Standardization
scaler = StandardScaler()
anime[numerical_features] = scaler.fit_transform

# Show the first few rows of the normalized dataset
print(anime.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  \
0             [Drama, Romance, School, Supernatural]  Movie        1   
1  [Action, Adventure, Drama, Fantasy, Magic, Mil...     TV       64   
2  [Action, Comedy, Historical, Parody, Samurai, ...     TV       51   
3                                 [Sci-Fi, Thriller]     TV       24   
4  [Action, Comedy, Historical, Parody, Samurai, ...     TV       51   

                                              rating  \
0  <bound method TransformerMixin.fit_transform o...   
1  <bound method TransformerMixin.fit_transform o...   
2  <bound method TransformerMixin.fit_transform o...   
3  <bound method TransformerMixin.fit_transform 

In [20]:
print(anime[numerical_features].dtypes)


rating          object
members         object
type_encoded    object
dtype: object


In [21]:
                                            #Recommendation System:

from sklearn.metrics.pairwise import cosine_similarity
# Normalize Numerical Features
numerical_features = ['rating', 'members', 'type_encoded']
# Convert to numeric, invalid parsing will be set as NaN
anime[numerical_features] = anime[numerical_features].apply(pd.to_numeric, errors='coerce')

# Fill any NaNs with 0 or a better imputation if needed
anime[numerical_features] = anime[numerical_features].fillna(0)

# Now safe to scale
scaler = MinMaxScaler()
anime[numerical_features] = scaler.fit_transform(anime[numerical_features])

# Combine all features (rating, members, type_encoded, and genres)
anime_features = pd.concat([anime[numerical_features], genre_df], axis=1)

# Cosine Similarity Matrix
cosine_sim = cosine_similarity(anime_features)

# 6. Function to Recommend Anime
def recommend_anime(anime_title, anime, cosine_sim, top_n=10):
    # Get the index of the anime that matches the title
    indices = pd.Series(anime.index, index=anime['name']).drop_duplicates()

    if anime_title not in indices:
        return "Anime not found in the dataset."

    idx = indices[anime_title]

    # Get the pairwise similarity scores for that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime by similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top-N most similar anime
    sim_scores = sim_scores[1:top_n+1]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top-N most similar anime
    return anime['name'].iloc[anime_indices]

# Example usage:
recommended_anime = recommend_anime('Naruto', anime, cosine_sim, top_n=10)
print(recommended_anime)

615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
175                                Katekyo Hitman Reborn!
7628                              Kyutai Panic Adventure!
Name: name, dtype: object


In [22]:
                   #Function to Recommend Anime with a Similarity Threshold
def recommend_anime(anime_title, anime, cosine_sim, top_n=5, threshold=0.2):
    # Get the index of the anime that matches the title
    indices = pd.Series(anime.index, index=anime['name']).drop_duplicates()

    if anime_title not in indices:
        return "Anime not found in the dataset."

    idx = indices[anime_title]

    # Get the pairwise similarity scores for that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime by similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Filter based on the similarity threshold
    sim_scores = [i for i in sim_scores if i[1] >= threshold]

    # Get the scores of the top-N most similar anime
    sim_scores = sim_scores[1:top_n+1]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top-N most similar anime
    return anime['name'].iloc[anime_indices], [i[1] for i in sim_scores]

# Example usage:
# Experiment with a threshold value of 0.2 (e.g., recommend anime that have a similarity score of at least 0.2)
recommended_anime, similarity_scores = recommend_anime('Naruto', anime, cosine_sim, top_n=5, threshold=0.2)
print("Recommended Anime:", recommended_anime)
print("Similarity Scores:", similarity_scores)

Recommended Anime: 615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object
Similarity Scores: [np.float64(0.9999999999999999), np.float64(0.9999999999999999), np.float64(0.9999999999999999), np.float64(0.9999999999999999), np.float64(0.9999999999999999)]


In [23]:
from sklearn.model_selection import train_test_split

# You can adjust this depending on the target variable for evaluation
X = anime_features  # Feature matrix (already created from previous steps)
y = anime['name']  # Target label (for recommendation, this could be different based on your task)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting sets
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

Training set size: (9835, 46)
Testing set size: (2459, 46)


In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Function to evaluate the recommendation system
def evaluate_recommendation_system(recommended_anime, relevant_anime):
    # Convert the anime lists into binary vectors (1 if recommended/relevant, 0 otherwise)
    all_anime = list(set(recommended_anime + relevant_anime))  # Combine unique anime IDs for evaluation
    y_true = [1 if anime in relevant_anime else 0 for anime in all_anime]
    y_pred = [1 if anime in recommended_anime else 0 for anime in all_anime]

    # Calculate Precision, Recall, and F1 Score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return precision, recall, f1

# Assuming 'Naruto' is our test anime, with recommended and relevant anime
recommended_anime = recommend_anime('Naruto', anime, cosine_sim, top_n=5)[0]  # Replace with actual recommendation function
relevant_anime = ['One Piece', 'Bleach', 'Dragon Ball Z', 'Fairy Tail', 'Attack on Titan']  # Example ground truth (relevant anime)

precision, recall, f1 = evaluate_recommendation_system(recommended_anime.tolist(), relevant_anime)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Precision: 0.00
Recall: 0.00
F1 Score: 0.00


In [26]:
# Experimenting with thresholds
threshold_values = [0.1, 0.3, 0.7]  # Lower thresholds for better recall, higher for precision
for threshold in threshold_values:
    print(f"\nThreshold: {threshold}")
    precision, recall, f1 = evaluate_recommendation_system(recommended_anime.tolist(), relevant_anime)
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")



Threshold: 0.1
Precision: 0.00, Recall: 0.00, F1-Score: 0.00

Threshold: 0.3
Precision: 0.00, Recall: 0.00, F1-Score: 0.00

Threshold: 0.7
Precision: 0.00, Recall: 0.00, F1-Score: 0.00


                                        #Areas of Improvement:
1.Feature Engineering:
More Descriptive Features: Consider improving the feature engineering process by extracting more informative features, such as user reviews, text descriptions of anime, or even tags that users provide.

2.Hybrid Recommendation Approach:
Content-based + Collaborative Filtering: The current approach seems to be purely content-based (using features like genre, ratings, etc.). You can integrate collaborative filtering techniques that recommend anime based on user behavior patterns (i.e., users who liked the same anime). This can help increase both precision and recall by leveraging user preference data.

3.Personalization:
User Profiles: If user data is available, create user profiles based on their preferences (e.g., favorite genres, ratings). Personalized recommendations based on individual preferences can improve recall by suggesting anime that are highly relevant to a specific user.

5.Threshold Tuning:
Dynamic Threshold: The current system uses a fixed similarity threshold to filter recommendations. Implementing a dynamic threshold based on the specific input anime or user can provide more relevant recommendations. For example, popular anime could have a stricter threshold, while less-known ones might need a more lenient threshold.




In [27]:
interview_questions_answers = {
    "1. Can you explain the difference between user-based and item-based collaborative filtering?":
    "User-based collaborative filtering recommends items by finding users with similar tastes and suggesting what they liked. "
    "Item-based collaborative filtering, on the other hand, focuses on finding similarities between items and recommends based on similar item interactions.",

    "2. What is collaborative filtering, and how does it work?":
    "Collaborative filtering is a recommendation technique that predicts user preferences based on past interactions and similar users/items. "
    "It works by identifying patterns in user-item relationships and suggesting new items accordingly."
}

for question, answer in interview_questions_answers.items():
    print(question)
    print(answer, "\n")





1. Can you explain the difference between user-based and item-based collaborative filtering?
User-based collaborative filtering recommends items by finding users with similar tastes and suggesting what they liked. Item-based collaborative filtering, on the other hand, focuses on finding similarities between items and recommends based on similar item interactions. 

2. What is collaborative filtering, and how does it work?
Collaborative filtering is a recommendation technique that predicts user preferences based on past interactions and similar users/items. It works by identifying patterns in user-item relationships and suggesting new items accordingly. 

