In [1]:
import pandas as pd
import numpy as np

## Data Preprocessing

In [2]:
# Load the dataset
anime_df = pd.read_csv('anime.csv')

In [3]:
# Display the first few rows of the dataset
print(anime_df.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [4]:
# Check for missing values
print(anime_df.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [5]:
# Handle missing values (e.g., fill or drop)
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
anime_df.dropna(inplace=True)

In [6]:
# Verify that there are no missing values left
print(anime_df.isnull().sum())

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [8]:
# Split the genres into a list
anime_df['genre'] = anime_df['genre'].apply(lambda x: x.split(', '))

In [9]:
# Use MultiLabelBinarizer to create binary columns for each genre
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(anime_df['genre'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)

In [10]:
# Normalize the ratings
scaler = StandardScaler()
ratings_normalized = scaler.fit_transform(anime_df[['rating']])

In [11]:
# Combine the genres and normalized ratings
features = np.hstack([genres_encoded, ratings_normalized])
features_df = pd.DataFrame(features, columns=list(mlb.classes_) + ['normalized_rating'])

## Recommendation System

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Compute cosine similarity matrix
cos_sim = cosine_similarity(features_df)

In [14]:
def recommend_anime(anime_title, num_recommendations=10):
    if anime_title not in anime_df['name'].values:
        print(f"Anime title '{anime_title}' not found in the dataset.")
        return []
    
    # Find the index of the target anime
    target_index = anime_df[anime_df['name'] == anime_title].index[0]
    
    # Get similarity scores for the target anime
    similarity_scores = list(enumerate(cos_sim[target_index]))
    
    # Sort the anime based on similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the most similar anime
    similar_anime_indices = [i[0] for i in similarity_scores[1:num_recommendations+1]]
    
       # Ensure indices are within bounds
    for idx in similar_anime_indices:
        if idx >= len(anime_df):
            print(f"Index out of bounds: {idx} (max: {len(anime_df)-1})")
            return []
  
    # Return the titles of the most similar anime
    return anime_df.iloc[similar_anime_indices]['name'].tolist()

In [15]:
# Example usage
print(recommend_anime('Naruto', 5))

['Naruto: Shippuuden', 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi', 'Boruto: Naruto the Movie', 'Naruto x UT', 'Naruto: Shippuuden Movie 4 - The Lost Tower']


In [16]:
def recommend_anime(anime_title, num_recommendations=10):
    if anime_title not in anime_df['name'].values:
        print(f"Anime title '{anime_title}' not found in the dataset.")
        return []
    
    # Find the index of the target anime
    target_index = anime_df[anime_df['name'] == anime_title].index[0]
    print(f"Target index: {target_index}")
    
    # Get similarity scores for the target anime
    similarity_scores = list(enumerate(cos_sim[target_index]))
    print(f"Similarity scores calculated for index: {target_index}")
    
    # Sort the anime based on similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    print(f"Sorted similarity scores: {similarity_scores[:num_recommendations+1]}")
    
    # Get the indices of the most similar anime
    similar_anime_indices = [i[0] for i in similarity_scores[1:num_recommendations+1]]
    print(f"Similar anime indices: {similar_anime_indices}")
    
    # Ensure indices are within bounds
    for idx in similar_anime_indices:
        if idx >= len(anime_df):
            print(f"Index out of bounds: {idx} (max: {len(anime_df)-1})")
            return []
    
    # Return the titles of the most similar anime
    return anime_df.iloc[similar_anime_indices]['name'].tolist()


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [18]:
# Split the dataset into training and testing sets
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)

In [19]:
def evaluate_recommendations(test_df, num_recommendations=10):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    for index, row in test_df.iterrows():
        recommendations = recommend_anime(row['name'], num_recommendations)
        if recommendations:
            if row['name'] in recommendations:
                true_positives += 1
            else:
                false_negatives += 1
            false_positives += num_recommendations - 1
        else:
            false_negatives += 1
    
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1


In [20]:
# Evaluate the recommendation system
precision, recall, f1 = evaluate_recommendations(test_df)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Target index: 2920
Similarity scores calculated for index: 2920
Sorted similarity scores: [(2920, 0.9999999999999999), (4032, 0.9867572997306525), (4254, 0.9819190082412707), (4954, 0.957854911050755), (5936, 0.8979212386746862), (6493, 0.8408161814648587), (10435, 0.8407744690406534), (2899, 0.8407605316316803), (3007, 0.8407178393443708), (3043, 0.8405456154794319), (10155, 0.8404150682028495)]
Similar anime indices: [4032, 4254, 4954, 5936, 6493, 10435, 2899, 3007, 3043, 10155]
Target index: 4563
Similarity scores calculated for index: 4563
Sorted similarity scores: [(4563, 0.9999999999999999), (4536, 0.9999843241909292), (9176, 0.9999370468331003), (4780, 0.99960513984027), (4807, 0.999430779823258), (4845, 0.9992244177677011), (4165, 0.9984548215002492), (4945, 0.9984126591302771), (4128, 0.9981337379718271), (4048, 0.997783134395567), (4070, 0.997783134395567)]
Similar anime indices: [4536, 9176, 4780, 4807, 4845, 4165, 4945, 4128, 4048, 4070]
Target index: 5208
Similarity scores

IndexError: index 12292 is out of bounds for axis 0 with size 12210