# Library

In [1]:
from pandas import read_csv
from pandas import DataFrame

from numpy import sort
from numpy import unique
from numpy import delete

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Data Understanding

## 1.1. Downloading Data

In [2]:
# !kaggle datasets download CooperUnion/anime-recommendations-database

## 1.2. Extracting Data

In [3]:
# !unzip anime-recommendations-database.zip

## 1.3. Data Loading

In [4]:
anime_df = read_csv('./anime.csv')
rating_df = read_csv('./rating.csv')

print("Dataframe Anime")
print("Jumlah Data Anime  :", len(anime_df.anime_id.unique()))

print("\nDataframe Rating")
print("Jumlah Data Rating :", len(rating_df.user_id.unique()))
print("Jumlah Anime yang di rating", len(rating_df.anime_id.unique()))

Dataframe Anime
Jumlah Data Anime  : 12294

Dataframe Rating
Jumlah Data Rating : 73515
Jumlah Anime yang di rating 11200


## 1.5. Anime Variabel

In [5]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [7]:
print("Jumlah Anime :", len(anime_df.name.unique()))
print("Judul Anime  :", anime_df.name.unique())
print("\nJumlah Genre :", len(anime_df.genre.unique()))
print("Genre Anime  :", anime_df.genre.unique())

Jumlah Anime : 12292
Judul Anime  : ['Kimi no Na wa.' 'Fullmetal Alchemist: Brotherhood' 'Gintama°' ...
 'Violence Gekiga David no Hoshi'
 'Violence Gekiga Shin David no Hoshi: Inma Densetsu'
 'Yasuji no Pornorama: Yacchimae!!']

Jumlah Genre : 3265
Genre Anime  : ['Drama, Romance, School, Supernatural'
 'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen'
 'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen' ...
 'Hentai, Sports' 'Drama, Romance, School, Yuri' 'Hentai, Slice of Life']


## 1.6. Rating Variabel

In [8]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [9]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [10]:
print("Jumlah User     :", len(rating_df.user_id.unique()))
print("Jumlah Anime    :", len(rating_df.anime_id.unique()))
print("Panjang Rating  :", len(rating_df.rating.unique()))
print("Rentang Rating  :", sort(rating_df.rating.unique()))

Jumlah User     : 73515
Jumlah Anime    : 11200
Panjang Rating  : 11
Rentang Rating  : [-1  1  2  3  4  5  6  7  8  9 10]


In [11]:
rating_df['rating'].describe()

count    7.813737e+06
mean     6.144030e+00
std      3.727800e+00
min     -1.000000e+00
25%      6.000000e+00
50%      7.000000e+00
75%      9.000000e+00
max      1.000000e+01
Name: rating, dtype: float64

# 2. Data Preprocessing

# 3. Data Preparation

## 3.1. Missing Value

### 3.1.1. Anime Variabel

In [12]:
anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Menampilkan baris yang memiliki missing value

In [13]:
anime_df[anime_df.isnull().any(axis=1)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
2844,33242,IS: Infinite Stratos 2 - Infinite Wedding,,Special,1,7.15,6604
3541,33589,ViVid Strike!,,TV,12,6.96,12345
6040,29765,Metropolis (2009),,Movie,1,6.27,313
6646,32695,Match Shoujo,,ONA,1,6.02,242
7018,33187,Katsudou Shashin,,Movie,1,5.79,607
...,...,...,...,...,...,...,...
12274,34492,Nuki Doki! Tenshi to Akuma no Sakusei Battle -...,Hentai,OVA,Unknown,,392
12279,34491,Sagurare Otome The Animation,Hentai,OVA,1,,79
12280,34312,Saimin Class,Hentai,OVA,Unknown,,240
12282,34388,Shikkoku no Shaga The Animation,Hentai,OVA,Unknown,,195


Terdapat 277 data yang memilik missing value. Karena jumlahnya sedikit maka data dengan missing value akan dihapus

In [14]:
anime_df = anime_df.dropna()

# Periksa kembali missing value
anime_df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

Sekarang dataset anime sudah bersih dari missing value

### 3.1.2. Rating Variabel

In [15]:
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

Tidak terdapat missing value pada dataset rating

## 3.2. Duplicated Data

In [63]:
print("Duplicated Data Count:", anime_df.duplicated().sum())

Duplicated Data Count: 0


# 4. Model Development

## 4.1. Content Based Filtering

In [21]:
data = anime_df

In [22]:
tfidf = TfidfVectorizer()
tfidf.fit(data['genre'])
tfidf.get_feature_names_out()

array(['action', 'adventure', 'ai', 'arts', 'cars', 'comedy', 'dementia',
       'demons', 'drama', 'ecchi', 'fantasy', 'fi', 'game', 'harem',
       'hentai', 'historical', 'horror', 'josei', 'kids', 'life', 'magic',
       'martial', 'mecha', 'military', 'music', 'mystery', 'of', 'parody',
       'police', 'power', 'psychological', 'romance', 'samurai', 'school',
       'sci', 'seinen', 'shoujo', 'shounen', 'slice', 'space', 'sports',
       'super', 'supernatural', 'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype=object)

In [23]:
matrix = tfidf.fit_transform(data['genre'])
matrix.shape

(12017, 47)

In [24]:
DataFrame(matrix.todense(), index=data['name'], columns=tfidf.get_feature_names_out())

Unnamed: 0_level_0,action,adventure,ai,arts,cars,comedy,dementia,demons,drama,ecchi,...,shounen,slice,space,sports,super,supernatural,thriller,vampire,yaoi,yuri
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kimi no Na wa.,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.439008,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.545474,0.000000,0.0,0.0,0.0
Fullmetal Alchemist: Brotherhood,0.294985,0.316287,0.0,0.0,0.0,0.000000,0.0,0.0,0.335194,0.0,...,0.350107,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
Gintama°,0.251618,0.000000,0.0,0.0,0.0,0.200402,0.0,0.0,0.000000,0.0,...,0.298636,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
Steins;Gate,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.833852,0.0,0.0,0.0
Gintama&#039;,0.251618,0.000000,0.0,0.0,0.0,0.200402,0.0,0.0,0.000000,0.0,...,0.298636,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Toushindai My Lover: Minami tai Mecha-Minami,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
Under World,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
Violence Gekiga David no Hoshi,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
Violence Gekiga Shin David no Hoshi: Inma Densetsu,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


### 4.1. Cosine Similarity

In [25]:
# dense output untuk menghemat penggunaan memory/RAM
similarity = cosine_similarity(matrix, dense_output=True)
print(similarity.shape)
similarity

(12017, 12017)


array([[1.        , 0.14715318, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.14715318, 1.        , 0.17877808, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.17877808, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ]])

In [27]:
similarity_df = DataFrame(similarity, index=data['name'], columns=data['name'])

In [59]:
def anime_recommendations(name, similarity_data=similarity_df, items=anime_df[['name', 'genre']], k=10):
    try:
        """
        Provides anime recommendations based on cosine similarity.

        Args:
            name (str): The name of the anime to find recommendations for.
            similarity_data (pd.DataFrame): The similarity matrix DataFrame.
            items (pd.DataFrame): The DataFrame containing anime names and genres.
            k (int): The number of recommendations to return.

        Returns:
            pd.DataFrame: A DataFrame containing the top k recommendations.
        """
        # Get the similarity scores for the given anime
        similarity_scores = similarity_data.loc[:, name]

        # Check if similarity_scores is a DataFrame and convert it to a Series if necessary
        if isinstance(similarity_scores, DataFrame):
            similarity_scores = similarity_scores.iloc[:, 0]

        # Sort the scores in descending order and get the top k indices
        # We add 1 to k to exclude the anime itself from the recommendations
        top_k_indices = similarity_scores.sort_values(ascending=False).index[1:k + 1]

        # Get the names of the recommended anime
        closest = top_k_indices

        # Return the recommendations as a DataFrame
        recommendation = DataFrame(closest).merge(items)

        return recommendation
    
    except KeyError as e:
        print("Tidak ada anime dengan judul serupa di dataset.")
        print("Input judul anime lain atau pastikan judul sudah benar")

anime_recommendations('Guilty Crown')

Unnamed: 0,name,genre
0,Guilty Crown: Lost Christmas,"Action, Drama, Sci-Fi, Super Power"
1,s.CRY.ed,"Action, Adventure, Drama, Sci-Fi, Super Power"
2,Persona: Trinity Soul,"Action, Sci-Fi, Super Power"
3,Kiddy GiRL-AND Pilot,"Action, Sci-Fi, Super Power"
4,Toaru Kagaku no Railgun,"Action, Sci-Fi, Super Power"
5,Ai City,"Action, Sci-Fi, Super Power"
6,Ougon Bat,"Action, Sci-Fi, Super Power"
7,Solty Rei,"Action, Sci-Fi, Super Power"
8,Toaru Kagaku no Railgun S,"Action, Sci-Fi, Super Power"
9,Choujin Locke: Mirror Ring,"Action, Sci-Fi, Super Power"


## 4.2. Collaborative Filtering

# 5. Evaluation