# Data Preprocessing:

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [3]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [4]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [6]:
df['genre'].fillna('Unknown',inplace =True)

In [7]:
print(df['genre'].value_counts())

genre
Hentai                                                  823
Comedy                                                  523
Music                                                   301
Kids                                                    199
Comedy, Slice of Life                                   179
                                                       ... 
Adventure, Drama, Fantasy, Game, Sci-Fi                   1
Adventure, Demons, Fantasy, Historical                    1
Action, Comedy, Drama, Mecha, Music, Sci-Fi, Shounen      1
Action, Comedy, Fantasy, Mecha, Sci-Fi, Shounen           1
Hentai, Slice of Life                                     1
Name: count, Length: 3265, dtype: int64


# Feature Extraction:

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer
features_df = df[['genre','rating']]
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(features_df['genre'].str.split(',')),
                            columns = mlb.classes_,
                            index=features_df.index)
features_df = pd.concat([features_df,genre_encoded],axis=1)
features_df.drop('genre',axis=1,inplace = True)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features_df["rating"] = scaler.fit_transform(features_df[["rating"]])
print(features_df.head())

     rating   Adventure   Cars   Comedy   Dementia   Demons   Drama   Ecchi  \
0  0.924370           0      0        0          0        0       0       0   
1  0.911164           1      0        0          0        0       1       0   
2  0.909964           0      0        1          0        0       0       0   
3  0.900360           0      0        0          0        0       0       0   
4  0.899160           0      0        1          0        0       0       0   

    Fantasy   Game  ...  Shounen  Slice of Life  Space  Sports  Super Power  \
0         0      0  ...        0              0      0       0            0   
1         1      0  ...        0              0      0       0            0   
2         0      0  ...        0              0      0       0            0   
3         0      0  ...        0              0      0       0            0   
4         0      0  ...        0              0      0       0            0   

   Supernatural  Thriller  Unknown  Vampire  Yaoi 

# Recommendation System:

In [17]:
# Given a target anime, recommend a list of similar anime based on cosine similarity scores.
# Experiment with different threshold values for similarity scores to adjust the recommendation list size.

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_anime(anime_title, features_df, similarity_threshold=0.7):
  """
  Recommends anime based on cosine similarity.

  Args:
    anime_title: The title of the target anime.
    features_df: A DataFrame with anime features (e.g., genre, rating).
    similarity_threshold: The minimum cosine similarity score for a recommendation.

  Returns:
    A list of recommended anime titles.
  """

  if anime_title not in df['name'].values:
    return "Anime not found in the dataset."

  target_anime_index = df[df['name'] == anime_title].index[0]
  target_anime_features = features_df.iloc[target_anime_index].values.reshape(1, -1)

  # Check for NaN values in target_anime_features and features_df and handle them (e.g., impute with 0)
  target_anime_features = np.nan_to_num(target_anime_features)
  features_df = features_df.fillna(0)

  cosine_similarities = cosine_similarity(target_anime_features, features_df)
  similar_anime_indices = np.where(cosine_similarities > similarity_threshold)[1]
  similar_anime_scores = cosine_similarities[0][similar_anime_indices]

  recommended_anime = []
  for index, score in zip(similar_anime_indices, similar_anime_scores):
    recommended_anime.append((df['name'].iloc[index], score))

  # Sort recommendations by similarity score in descending order
  recommended_anime.sort(key=lambda x: x[1], reverse=True)

  return [anime[0] for anime in recommended_anime]

# Example usage:
target_anime = "Naruto"
recommendations = recommend_anime(target_anime, features_df)
print(f"Recommendations for '{target_anime}':")
for anime in recommendations:
  print(anime)

Recommendations for 'Naruto':
Naruto
Naruto: Shippuuden
Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi
Boruto: Naruto the Movie
Naruto x UT
Naruto: Shippuuden Movie 4 - The Lost Tower
Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono
Naruto Shippuuden: Sunny Side Battle
Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!
Katekyo Hitman Reborn!
Kyutai Panic Adventure!
Battle Spirits: Ryuuko no Ken
Dragon Ball Z
Dragon Ball Kai (2014)
Dragon Ball Kai
Medaka Box Abnormal
Dragon Ball Z Movie 15: Fukkatsu no F
Dragon Ball Super
Medaka Box
Tenjou Tenge
Dragon Ball Z: Summer Vacation Special
Dragon Ball Z: Atsumare! Gokuu World
Dragon Ball GT: Goku Gaiden! Yuuki no Akashi wa Suushinchuu
Dragon Ball Z Movie 11: Super Senshi Gekiha!! Katsu no wa Ore da
Boku no Hero Academia
Shijou Saikyou no Deshi Kenichi
Shijou Saikyou no Deshi Kenichi OVA
Bleach
The Last: Naruto the Movie
Naruto: Shippuuden Movie 6 - Road to Ninja
Bleach Movie 4: Jigoku-hen
Ranma ½: Akumu! S

In [18]:
from sklearn.model_selection import train_test_split
train_df , test_df = train_test_split(features_df,test_size = 0.2,random_state =42)

print("Training set shape:",train_df.shape)
print("Testing set shape:",test_df.shape)

Training set shape: (9835, 84)
Testing set shape: (2459, 84)


# Evaluation:

In [21]:
def evaluate_recommendations(df,features_df,recommend_anime_func,test_size =0.2, random_state =42):
    train_df,test_df = train_test_split(df,test_size = test_size,random_state = random_state)
    true_positives = 0
    false_positives = 0
    false_negatives= 0
    for index,row in test_df.iterrows():
        anime_title = row['name']
        recommendations = recommend_anime_func(anime_title,features_df)
        if isinstance(recommendations,str):
            continue
        for recommended_anime in recommendations:
            if recommended_anime in test_df["name"].values:
                true_positives += 1
                break
        else:
            false_negatives +=1
    precision = true_positives / (true_positives + false_positives) if(true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if(true_positives + false_negatives) > 0 else 0
    f1_score = 2*(precision *recall) / (precision +recall) if (precision + recall) > 0 else 0
    
    return {'precision': precision,'recall':recall,'f1score':f1_score}

evaluation_results = evaluate_recommendations(df,features_df,recommend_anime)
print("Evaluation Results:")
print(evaluation_results)

Evaluation Results:
{'precision': 1.0, 'recall': 1.0, 'f1score': 1.0}


In [22]:
print("Evaluation Results:")
print(evaluation_results)

Evaluation Results:
{'precision': 1.0, 'recall': 1.0, 'f1score': 1.0}
