In [8]:
import pandas as pd

# Load the dataset to explore its structure and attributes
anime_df = pd.read_csv('anime.csv')

# Show the first few rows of the dataset to understand its structure
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [12]:
#Check for null columns
print(anime_df.isnull().sum())
anime_df.dtypes

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [14]:
# Data Exploration and Preprocessing

# Handle missing values
# For 'genre' and 'type', fill missing values with 'Unknown'
anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)

# For 'rating', fill missing values with the mean rating
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)

# Replace 'Unknown' in 'episodes' with 0, convert to int (we'll treat unknown episodes as 0 for now)
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce').fillna(0).astype(int)

# Confirm the missing values are handled
anime_df.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['genre'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [32]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

# Convert 'genre' into a list of genres for each anime
genres_encoded = anime_df['genre'].str.get_dummies(sep=', ')

# Normalize 'rating' and 'members'
scaler = MinMaxScaler()
anime_df[['rating', 'members']] = scaler.fit_transform(anime_df[['rating', 'members']])

# Combine the normalized rating, members, and encoded genres into a feature matrix
import numpy as np

features = pd.concat([anime_df[['rating', 'members']], genres_encoded], axis=1)

# Confirm the feature extraction
print(features.shape, features[:2])  # Check the shape and a sample of the features
features

(12294, 46)      rating   members  Action  Adventure  Cars  Comedy  Dementia  Demons  \
0  0.924370  0.197872       0          0     0       0         0       0   
1  0.911164  0.782770       1          1     0       0         0       0   

   Drama  Ecchi  ...  Slice of Life  Space  Sports  Super Power  Supernatural  \
0      1      0  ...              0      0       0            0             1   
1      1      0  ...              0      0       0            0             0   

   Thriller  Unknown  Vampire  Yaoi  Yuri  
0         0        0        0     0     0  
1         0        0        0     0     0  

[2 rows x 46 columns]


Unnamed: 0,rating,members,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,0.924370,0.197872,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,0.911164,0.782770,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.909964,0.112689,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.900360,0.664325,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0.899160,0.149186,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0.297719,0.000203,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0.313325,0.000176,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0.385354,0.000211,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0.397359,0.000168,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

anime_df['genre'].fillna('Unknown', inplace=True)
anime_df['type'].fillna('Unknown', inplace=True)
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['episodes'].fillna(anime_df['episodes'].mean(), inplace=True)
genres_encoded = anime_df['genre'].str.get_dummies(sep=', ')

# Normalize the 'rating' and 'members' columns to bring them into a similar scale (0 to 1 range)
scaler = MinMaxScaler()
anime_df[['rating', 'members']] = scaler.fit_transform(anime_df[['rating', 'members']])

# Combine the normalized 'rating' and 'members' columns with the encoded 'genres' columns into a single DataFrame
features = pd.concat([anime_df[['rating', 'members']], genres_encoded], axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['genre'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [50]:
# Function to recommend anime based on cosine similarity
def recommend_anime(target_anime, features, anime_df, threshold=0.5):
    if target_anime not in anime_df['name'].values:
        return []
    cosine_sim = cosine_similarity(features)
    target_idx = anime_df[anime_df['name'] == target_anime].index[0]
    sim_scores = list(enumerate(cosine_sim[target_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores if i[1] > threshold and i[0] != target_idx]
    return anime_df.iloc[sim_indices]['name'].tolist()
    
# Example usage:
target_anime = "Kimi no Na wa."
recommended_anime = recommend_anime(target_anime, features, anime_df)

# Print the recommended anime list for the target anime
print("Recommended anime for {}: {}".format(target_anime, recommended_anime))

Recommended anime for Kimi no Na wa.: ['Wind: A Breath of Heart OVA', 'Wind: A Breath of Heart (TV)', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen', 'Kokoro ga Sakebitagatterunda.', 'Angel Beats!: Another Epilogue', 'True Tears', 'Myself; Yourself', 'Kimikiss Pure Rouge', 'Koi to Senkyo to Chocolate', 'Harmonie', 'Air Movie', '&quot;Bungaku Shoujo&quot; Memoire', 'Koi to Senkyo to Chocolate Special', 'Otome wa Boku ni Koishiteru: Futari no Elder', 'Myself ; Yourself Specials', 'Koi to Senkyo to Chocolate: Ikenai Hazuki-sensei', 'Touka Gettan', 'Venus Versus Virus', 'Mizuiro (2003)', 'To Heart 2 Special', 'School Days ONA', 'Tokimeki Memorial: Forever With You', 'Little Busters!: Refrain', 'Kokoro Connect', 'Kokoro Connect: Michi Random', 'Shakugan no Shana', 'Shakugan no Shana II (Second)', 'Clannad', 'Little Busters!: EX', 'Shakugan no Shana S', 'Hotarubi no Mori e', 'Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku', 'Kanon (

In [49]:
# Evaluation of Recommendation Systems

# Evaluation function for recommendation systems
def evaluate_recommendations(anime_df, features, threshold=0.5):
    train_data, test_data = train_test_split(anime_df, test_size=0.2, random_state=42)
    train_features = features.loc[train_data.index]
    test_features = features.loc[test_data.index]
    y_true = []
    y_pred = []
    # Step 4: Iterate through each anime in the test dataset
    for idx, row in test_data.iterrows():
        target_anime = row['name']
        true_anime = test_data[test_data['genre'] == row['genre']]['name'].tolist()
        recommended_anime = recommend_anime(target_anime, train_features, train_data, threshold)
        y_true.extend([1 if anime in true_anime else 0 for anime in test_data['name']])
        y_pred.extend([1 if anime in recommended_anime else 0 for anime in test_data['name']])
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    f1 = f1_score(y_true, y_pred, average='micro')

    return precision, recall, f1
    
# Step 11: Call the evaluation function and print the precision, recall, and F1-score
precision, recall, f1 = evaluate_recommendations(anime_df, features)
print("Precision: ",precision)
print("Recall: ", recall)
print("F1-Score: ", f1)

Precision:  0.9919084535797407
Recall:  0.9919084535797407
F1-Score:  0.9919084535797407
