# Library

In [1]:
from pandas import read_csv
from pandas import DataFrame
from pandas import to_numeric

from numpy import sort
from numpy import unique
from numpy import delete

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Data Understanding

## 1.1. Downloading Data

In [2]:
# !kaggle datasets download CooperUnion/anime-recommendations-database

## 1.2. Extracting Data

In [3]:
# !unzip anime-recommendations-database.zip

## 1.3. Data Loading

In [None]:
# Read both CSV files from the current working directory.
anime_df = read_csv('./anime.csv')
rating_df = read_csv('./rating.csv')

# Report distinct-value counts; dropna=False counts a missing value as one
# distinct entry, matching len(Series.unique()).
print("Dataframe Anime")
print("Jumlah Data Anime  :", anime_df['anime_id'].nunique(dropna=False))

print("\nDataframe Rating")
print("Jumlah Data Rating :", rating_df['user_id'].nunique(dropna=False))
print("Jumlah Anime yang di rating", rating_df['anime_id'].nunique(dropna=False))

## 1.5. Anime Variabel

In [None]:
# Preview the first rows of the anime table.
anime_df.head()

In [None]:
# Column names, dtypes and non-null counts of the anime table.
anime_df.info()

In [None]:
# Distinct titles and distinct raw genre strings in the anime table.
# dropna=False counts a missing value as one distinct entry, like len(unique()).
print("Jumlah Anime :", anime_df['name'].nunique(dropna=False))
print("Judul Anime  :", anime_df['name'].unique())
print("\nJumlah Genre :", anime_df['genre'].nunique(dropna=False))
print("Genre Anime  :", anime_df['genre'].unique())

## 1.6. Rating Variabel

In [None]:
# Preview the first rows of the rating table.
rating_df.head()

In [None]:
# Column names, dtypes and non-null counts of the rating table.
rating_df.info()

In [None]:
# Distinct users, distinct rated anime, and the set of rating values in use.
# dropna=False counts a missing value as one distinct entry, like len(unique()).
print("Jumlah User     :", rating_df['user_id'].nunique(dropna=False))
print("Jumlah Anime    :", rating_df['anime_id'].nunique(dropna=False))
print("Panjang Rating  :", rating_df['rating'].nunique(dropna=False))
print("Rentang Rating  :", sort(rating_df['rating'].unique()))

In [None]:
# Summary statistics of the rating column.
rating_df['rating'].describe()

# 2. Data Preprocessing

# 3. Data Preparation

## 3.1. Missing Value

### 3.1.1. Anime Variabel

In [None]:
# Number of missing values per column in the anime table.
anime_df.isnull().sum()

Menampilkan baris yang memiliki missing value

In [None]:
# Rows of the anime table that have a missing value in at least one column.
anime_df[anime_df.isnull().any(axis=1)]

Terdapat 277 data yang memiliki missing value. Karena jumlahnya sedikit, data dengan missing value akan dihapus.

In [None]:
# Keep only the rows in which every column has a value.
anime_df = anime_df.dropna(axis=0, how='any')

# Re-check the per-column count of missing values after the drop.
anime_df.isna().sum()

Sekarang dataset anime sudah bersih dari missing value

### 3.1.2. Rating Variabel

In [None]:
# Number of missing values per column in the rating table.
rating_df.isnull().sum()

Tidak terdapat missing value pada dataset rating

## 3.2. Genre

In [None]:
# Build one row per (anime, genre) pair: split the comma-separated genre string
# into a list, explode the list into rows, then strip surrounding whitespace so
# that blanks around the commas do not turn one genre into two distinct labels
# (which would inflate the category set once the column is made categorical).
fix_anime_df = (
    anime_df
    .assign(genre=anime_df['genre'].str.split(','))
    .explode('genre')
    .assign(genre=lambda frame: frame['genre'].str.strip())
)

# Spot-check a single title to confirm the exploded layout.
fix_anime_df.loc[fix_anime_df['name'] == 'Mahoutsukai no Yome: Hoshi Matsu Hito']

In [None]:
# (rows, columns) of the exploded frame.
fix_anime_df.shape

In [18]:
# Store the text columns with pandas' categorical dtype, all in one call.
fix_anime_df = fix_anime_df.astype({
    'name': 'category',
    'episodes': 'category',
    'genre': 'category',
    'type': 'category',
})

In [19]:
# Downcast the numeric columns to the smallest dtype that can hold their values.
# `to_numeric` comes from the import cell at the top of the notebook, so every
# dependency is declared in one place.
fix_anime_df['anime_id'] = to_numeric(fix_anime_df['anime_id'], downcast='integer')
fix_anime_df['members'] = to_numeric(fix_anime_df['members'], downcast='integer')
fix_anime_df['rating'] = to_numeric(fix_anime_df['rating'], downcast='float')

In [None]:
# Per-column memory footprint in bytes; deep=True also measures object contents.
fix_anime_df.memory_usage(deep=True)

# 4. Model Development

## 4.1. Content Based Filtering

In [21]:
# `data` is a second name for the same frame object, not a copy.
data = fix_anime_df

In [None]:
# Learn the TF-IDF vocabulary from the genre column, then list the learned terms.
tfidf = TfidfVectorizer().fit(data['genre'])
tfidf.get_feature_names_out()

In [None]:
# Fit the vectorizer on the genre column again and encode each row as a TF-IDF vector.
matrix = tfidf.fit_transform(data['genre'])
# Dimensions of the encoded matrix.
matrix.shape

In [None]:
# Dense view of the TF-IDF matrix: rows labelled by anime name, columns by vocabulary term.
DataFrame(matrix.todense(), index=data['name'], columns=tfidf.get_feature_names_out())

### 4.1.1. Cosine Similarity

In [None]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
# NOTE(review): dense_output=True requests a dense array with one entry per pair
# of rows; it is not a memory-saving option.
similarity = cosine_similarity(matrix, dense_output=True)
print(similarity.shape)
similarity

In [23]:
# similarity_df = DataFrame(similarity.todense(), index=data['name'], columns=data['name'])

In [None]:
# Show the rows stored for one specific title.
data[data['name']=='Kimi no Na wa.']

In [27]:
def anime_recommendations(name, similarity_data=None, items=None, k=5):
    """Return the `k` titles most similar to `name`, joined with their item rows.

    Parameters
    ----------
    name : str
        Title to find recommendations for.
    similarity_data : DataFrame or 2-D array, optional
        Square similarity matrix. A DataFrame supplies its labels through its
        columns; an unlabelled array (what `cosine_similarity` returns) is
        labelled positionally with `items['name']`. When omitted, the
        notebook-level `similarity` is looked up at call time.
    items : DataFrame, optional
        Table with a `name` column that is merged onto the recommended titles.
        When omitted, `data[['name', 'genre']]` is looked up at call time.
    k : int, default 5
        Number of distinct titles to recommend.

    Returns
    -------
    DataFrame
        The rows of `items` that belong to the recommended titles, best match
        first. If `items` holds several rows per title (for example one row per
        genre), all rows of each recommended title are returned.

    Raises
    ------
    KeyError
        If `name` is not among the similarity labels.
    ValueError
        If the matrix is not square with one row and column per label.
    """
    # Resolve defaults at call time so re-running upstream cells is picked up.
    if similarity_data is None:
        similarity_data = similarity
    if items is None:
        items = data[['name', 'genre']]

    # Accept both a labelled DataFrame and a bare similarity array.
    if isinstance(similarity_data, DataFrame):
        labels = similarity_data.columns.to_numpy()
        values = similarity_data.to_numpy()
    else:
        labels = items['name'].to_numpy()
        values = similarity_data

    if tuple(values.shape) != (len(labels), len(labels)):
        raise ValueError(
            "similarity_data must be square with one row/column per label; "
            f"got shape {tuple(values.shape)} for {len(labels)} labels"
        )

    # A title can own several columns when the item table has repeated names.
    columns = (labels == name).nonzero()[0]
    if len(columns) == 0:
        raise KeyError(name)

    # Score every row by its best similarity to any column of the queried title.
    block = values[:, columns]
    if hasattr(block, 'toarray'):
        # Sparse input: densify only this narrow slice.
        block = block.toarray()
    scores = block.max(axis=1)

    # Rank all candidates, leave out the queried title itself, and keep the best
    # entry per title so that exactly k distinct titles are recommended.
    candidates = DataFrame({'name': labels, 'score': scores})
    candidates = candidates[candidates['name'] != name]
    top = (
        candidates
        .sort_values('score', ascending=False, kind='stable')
        .drop_duplicates('name')
        .head(k)
    )

    # Inner merge keeps the ranking order of the left-hand keys.
    return top[['name']].merge(items, on='name')

In [None]:
# `similarity` is the plain array returned by `cosine_similarity`, which has no
# `.loc` accessor; index it positionally to show the first row of scores.
similarity[0]

In [None]:
# Example query; only the title is passed, so the remaining parameters use their defaults.
anime_recommendations('Naruto')

## 4.2. Collaborative Filtering

# 5. Evaluation