# Content-Based Filtering Anime Movie Recommender System
Steps:
1. Collect anime movie data
2. Extract features
4. Cosine Similarity
5. Generate recommendation

# Load dataset into dataframe

In [2]:
#Pandas tries to determine what dtype to set by analyzing the data in each column.
import pandas as pd
import re
import string
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#to set the low memory to be false because guessing dtypes for each column is very memory demanding. 
#anime_movie_data = content based data
anime_movie_data = pd.read_csv('anime.csv', low_memory = False)
anime_movie_data = pd.DataFrame(anime_movie_data)
anime_movie_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
#user_rating_data = user rating data
user_rating_data = pd.read_csv('rating.csv', low_memory = False)
user_rating_data = user_rating_data[0:1000000]
user_rating_data = pd.DataFrame(user_rating_data)
user_rating_data

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
999995,9358,28999,-1
999996,9358,29067,-1
999997,9358,29093,-1
999998,9358,29095,-1


# Feature Extraction + Data Preprocessing
 Identifying and extracting relevant features from the movie data that can be used to generate recommendations.

# Change Data Type

In [5]:
anime_movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
#change data type
anime_movie_data['episodes'] = anime_movie_data['episodes'].replace('Unknown', -1)
anime_movie_data = anime_movie_data.astype({"name":"string","genre":"string","type":"string","episodes":"int64"})

In [7]:
anime_movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  string 
 2   genre     12232 non-null  string 
 3   type      12269 non-null  string 
 4   episodes  12294 non-null  int64  
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(3), string(3)
memory usage: 672.5 KB


In [8]:
user_rating_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   user_id   1000000 non-null  int64
 1   anime_id  1000000 non-null  int64
 2   rating    1000000 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB


# Put a meningful variable name

In [9]:
#Rename Variable
anime_movie_data = anime_movie_data.rename(columns={"rating": "avg_rating"})
anime_movie_data.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [10]:
#Rename Variable
user_rating_data = user_rating_data.rename(columns={"rating": "user_rating"})
user_rating_data.head(5)

Unnamed: 0,user_id,anime_id,user_rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


# Removing null value

In [11]:
#Check is missing value in each column of a Data Frame and count the total number of true null or NaN?
anime_movie_data.isna().sum()

anime_id        0
name            0
genre          62
type           25
episodes        0
avg_rating    230
members         0
dtype: int64

In [12]:
#Drop those missing value by removing the row (axis = 0) with missing value; remove column (axis = 1) 
#and inplace = True --> modify the original DataFrame in place.
anime_movie_data.dropna(axis=0, inplace= True)
anime_movie_data.isna().sum()

anime_id      0
name          0
genre         0
type          0
episodes      0
avg_rating    0
members       0
dtype: int64

In [13]:
user_rating_data.isna().sum()

user_id        0
anime_id       0
user_rating    0
dtype: int64

# Removing stop words

In [14]:
tfidf = TfidfVectorizer(stop_words = 'english')


# Removing unused label

In [15]:
# Drop the 'members' column from the DataFrame
#anime_movie_data.drop('members', axis=1, inplace=True)
#anime_movie_data

# Clean Text

In [16]:
anime_movie_data

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [17]:
# Define a function to clean the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z0-9\s]', ' ', text)
    return text

In [18]:
anime_movie_data['name']=anime_movie_data['name'].apply(clean_text)
anime_movie_data

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama 039,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover Minami tai Mecha Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi Inma Dens...,Hentai,OVA,1,4.98,175


# Remove duplicate records

In [19]:
# Show duplicate rows based on all columns
duplicate_anime = anime_movie_data[anime_movie_data.duplicated()].shape[0]
duplicate_anime

0

In [20]:
#duplicate_anime = anime_movie_data[anime_movie_data.duplicated(subset=['name'])].shape[0]
#duplicate_anime

In [21]:
#duplicate_name = anime_movie_data[anime_movie_data.duplicated(subset=['name'])]
#duplicate_name

In [22]:
# Remove duplicates based on a specific column
#anime_movie_data = anime_movie_data.drop_duplicates(subset=['name'], keep=False)

# Merging 2 dataset based on anime_id

In [23]:
merge_data = pd.merge(user_rating_data, anime_movie_data, on='anime_id', sort = True)
merge_data

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,avg_rating,members
0,13,1,-1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
1,19,1,10,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
2,21,1,9,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
3,23,1,9,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
4,32,1,10,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,8.82,486824
...,...,...,...,...,...,...,...,...,...
999981,6361,34283,7,Nobunaga no Shinobi Episode 0,"Comedy, Historical",Special,1,6.74,937
999982,7114,34283,6,Nobunaga no Shinobi Episode 0,"Comedy, Historical",Special,1,6.74,937
999983,2951,34324,-1,Chiryokumaru,Action,ONA,1,5.40,817
999984,3009,34324,6,Chiryokumaru,Action,ONA,1,5.40,817


In [24]:
#check each anime got how much rating
anime_rating_data = pd.DataFrame(merge_data.groupby('anime_id')['user_rating'].count())
anime_rating_data.head()

Unnamed: 0_level_0,user_rating
anime_id,Unnamed: 1_level_1
1,2115
5,994
6,1487
7,361
8,61


# Algorithm 1: Content-Based

# Construct the TF-IDF matrix

In [25]:
#Compute the TF-IDF matrix based on the genre
tfidf_matrix = tfidf.fit_transform(anime_movie_data['genre'])
print(type(tfidf_matrix))

<class 'scipy.sparse._csr.csr_matrix'>


In [26]:
tfidf.vocabulary_

{'drama': 8,
 'romance': 30,
 'school': 32,
 'supernatural': 41,
 'action': 0,
 'adventure': 1,
 'fantasy': 10,
 'magic': 20,
 'military': 23,
 'shounen': 36,
 'comedy': 5,
 'historical': 15,
 'parody': 26,
 'samurai': 31,
 'sci': 33,
 'fi': 11,
 'thriller': 42,
 'sports': 39,
 'super': 40,
 'power': 28,
 'space': 38,
 'slice': 37,
 'life': 19,
 'mecha': 22,
 'music': 24,
 'mystery': 25,
 'seinen': 34,
 'martial': 21,
 'arts': 3,
 'vampire': 43,
 'shoujo': 35,
 'horror': 16,
 'police': 27,
 'psychological': 29,
 'demons': 7,
 'ecchi': 9,
 'josei': 17,
 'ai': 2,
 'game': 12,
 'dementia': 6,
 'harem': 13,
 'cars': 4,
 'kids': 18,
 'hentai': 14,
 'yaoi': 44,
 'yuri': 45}

# Anime Movie's Genre Information

In [27]:
anime_movie_data.genre.head()

0                 Drama, Romance, School, Supernatural
1    Action, Adventure, Drama, Fantasy, Magic, Mili...
2    Action, Comedy, Historical, Parody, Samurai, S...
3                                     Sci-Fi, Thriller
4    Action, Comedy, Historical, Parody, Samurai, S...
Name: genre, dtype: string

In [28]:
#re = regular expression 
import re
genre_size = len(anime_movie_data.genre)
print(genre_size)

12017


# Movie with same genre via 'loc' property
 loc property is used to access a group of rows and columns by label(s)

In [29]:
# Search for the 'Romance' genre using loc property
anime_by_genre = anime_movie_data.copy().loc[anime_movie_data['genre'].str.contains('Romance')]
anime_by_genre.head()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
10,4181,Clannad After Story,"Drama, Fantasy, Romance, Slice of Life, Supern...",TV,24,9.06,456749
16,23273,Shigatsu wa Kimi no Uso,"Drama, Music, Romance, School, Shounen",TV,22,8.92,416397
21,44,Rurouni Kenshin Meiji Kenkaku Romantan Tsui...,"Action, Drama, Historical, Martial Arts, Roman...",OVA,4,8.83,129307
25,7311,Suzumiya Haruhi no Shoushitsu,"Comedy, Mystery, Romance, School, Sci-Fi, Supe...",Movie,1,8.81,240297


# Cosine Similarity

In [30]:
# Compute the Cosine Similarity in terms of pairwise similarities
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [31]:
# Record start time
start = time.time()

# Compute cosine similarity matrix
cosine_sim_cs = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Print time taken
print("Time taken: %s seconds" %(time.time() - start))

Time taken: 4.534275770187378 seconds


In [32]:
# Create a pandas series where indexes are values and titles are indexes
indices = pd.Series(anime_movie_data.index, index = anime_movie_data['name']).drop_duplicates()

# Check the first 10 indices
indices[:10]

name
Kimi no Na wa                                                0
Fullmetal Alchemist  Brotherhood                             1
Gintama                                                      2
Steins Gate                                                  3
Gintama  039                                                 4
Haikyuu    Karasuno Koukou VS Shiratorizawa Gakuen Koukou    5
Hunter x Hunter  2011                                        6
Ginga Eiyuu Densetsu                                         7
Gintama Movie  Kanketsu hen   Yorozuya yo Eien Nare          8
Gintama  039   Enchousen                                     9
dtype: int64

# Recommend top 10 anime movie function 

In [81]:
# Define a function to get recommended movies
def get_recommendations(name, cosine_sim):
    # Get the index of the movie that matches the name
    index = indices[name]

    # Get the pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[index]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 most similar movies
    sim_scores = sim_scores[:10]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return anime_movie_data['name'].iloc[movie_indices]

In [86]:
# Get recommendations for a given movie
name = 'Gintama'
content_based_recommendations = get_recommendations(name, cosine_sim)

# Print the top 10 recommended movies
content_based_recommendations=pd.DataFrame(content_based_recommendations)
content_based_recommendations = content_based_recommendations[:5]
content_based_recommendations

Unnamed: 0,name
2,Gintama
4,Gintama 039
8,Gintama Movie Kanketsu hen Yorozuya yo Eien...
9,Gintama 039 Enchousen
12,Gintama


# Algorithm 2: User-based 

# Find the user-item matrix

In [36]:
#matrix which the anime had been rating by the user
user_anime_matrix = merge_data.pivot_table(index = 'user_id', columns = ['anime_id'], values = 'user_rating')
user_anime_matrix.fillna(-1, inplace=True)
user_anime_matrix.head(10)

anime_id,1,5,6,7,8,15,16,17,18,19,...,34048,34085,34103,34107,34136,34173,34240,34283,34324,34325
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,-1.0,-1.0,8.0,-1.0,-1.0,6.0,-1.0,6.0,6.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
8,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
10,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [37]:
merge_data.user_rating.info()

<class 'pandas.core.series.Series'>
Int64Index: 999986 entries, 0 to 999985
Series name: user_rating
Non-Null Count   Dtype
--------------   -----
999986 non-null  int64
dtypes: int64(1)
memory usage: 15.3 MB


In [38]:
anime_user_matrix = user_anime_matrix.transpose()
anime_user_matrix.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,9349,9350,9351,9352,9353,9354,9355,9356,9357,9358
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6,-1.0,-1.0,-1.0,-1.0,8.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
8,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
15,-1.0,-1.0,-1.0,-1.0,6.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
16,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
17,-1.0,-1.0,-1.0,-1.0,6.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
18,-1.0,-1.0,-1.0,-1.0,6.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
19,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


# Select a user for target user to compare correlation with other users

In [48]:
#see the number of movie rating of user_id == 85 
#target_user = user_rating_data[user_rating_data['user_id'] == 85]
target_user = 85
anime_user_rating = anime_user_matrix[target_user]
pd.DataFrame(anime_user_rating.sort_values(ascending= False)).rename(columns={target_user: f"user with id {target_user}'s rating"})

Unnamed: 0_level_0,user with id 85's rating
anime_id,Unnamed: 1_level_1
223,10.0
11757,10.0
10087,10.0
777,10.0
481,10.0
...,...
3146,-1.0
3140,-1.0
3138,-1.0
3137,-1.0


# Find correlation with other users

In [49]:
# Find correlation between user_id = 85 based on the ratings of movies
similar_users = anime_user_matrix.corrwith(anime_user_rating).dropna()
similar_users

user_id
1       0.164536
2      -0.000380
3       0.132831
5       0.067134
7       0.069043
          ...   
9353    0.111778
9354   -0.000530
9355    0.074009
9356   -0.000849
9357    0.066150
Length: 8857, dtype: float64

# Transform it into Data Frame

In [50]:
# Create a dataframe with similar movies as the index column and correlation as another column
similar_users = pd.DataFrame(similar_users, columns = ['correlation'])
similar_users.head(10)

Unnamed: 0_level_0,correlation
user_id,Unnamed: 1_level_1
1,0.164536
2,-0.00038
3,0.132831
5,0.067134
7,0.069043
8,0.099718
9,-0.00038
10,0.18385
11,0.029876
12,0.198167


# Show user with highest correlation

In [65]:
#except the most_similar_users[0] = traget_user  
most_similar_users = similar_users.sort_values(by = 'correlation', ascending = False)
most_similar_users = most_similar_users[1:]
most_similar_users

Unnamed: 0_level_0,correlation
user_id,Unnamed: 1_level_1
635,0.380043
1755,0.376230
968,0.373842
1941,0.370221
2883,0.364643
...,...
5262,-0.006831
6866,-0.007156
8083,-0.007613
7853,-0.007922


In [66]:
#print user_id with highest correlation
most_similar_users = most_similar_users.index.values.tolist()
highest_correlation_user = most_similar_users[0]
highest_correlation_user

635

# 5 best recommend rated anime movie from the user that have highest correlation value

In [88]:
#sort highest rating to lowest rating
#target_user = merge_data.loc[merge_data['user_id'] == target_user].sort_values(by= 'user_rating', ascending= False)
#recommend anime movie based on similar user
user_based_recommendations = merge_data[merge_data['user_id'] == highest_correlation_user].sort_values('user_rating', ascending=False)
user_based_recommendations = user_based_recommendations[:5]
user_based_recommendations

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,avg_rating,members
281210,635,1535,10,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917
532767,635,6547,10,Angel Beats,"Action, Comedy, Drama, School, Supernatural",TV,13,8.39,717796
699130,635,11617,10,High School DxD,"Comedy, Demons, Ecchi, Harem, Romance, School",TV,12,7.7,398660
706629,635,11757,10,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
781015,635,15451,10,High School DxD New,"Action, Comedy, Demons, Ecchi, Harem, Romance,...",TV,12,7.87,266657


# Final recommended anime movie system: Combine content-based and user-based 

In [89]:
#make each of the result to a list 
content_based_recommendations_list = content_based_recommendations.index.tolist()
content_based_recommendations_list

[2, 4, 8, 9, 12]

In [90]:
user_based_recommendations_list = user_based_recommendations['anime_id'].tolist()
user_based_recommendations_list

[1535, 6547, 11617, 11757, 15451]

In [92]:
final_anime_recommend = content_based_recommendations_list.copy()
final_anime_recommend += user_based_recommendations_list.copy()
final_anime_recommend

[2, 4, 8, 9, 12, 1535, 6547, 11617, 11757, 15451]

In [93]:
pd.DataFrame(target_user)

Unnamed: 0,user_id,anime_id,user_rating,name,genre,type,episodes,avg_rating,members
86345,85,223,10,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",TV,153,8.16,316102
157814,85,481,10,Yu Gi Oh Duel Monsters,"Adventure, Game, Shounen",TV,224,7.57,132099
167025,85,527,10,Pokemon,"Action, Adventure, Comedy, Fantasy, Kids",TV,276,7.43,229157
197497,85,777,10,Hellsing Ultimate,"Action, Horror, Military, Seinen, Supernatural...",OVA,10,8.59,297454
370530,85,2762,10,Igano Kabamaru,"Adventure, Comedy, Romance, Shounen",TV,24,8.13,5063
647944,85,10087,10,Fate Zero,"Action, Fantasy, Supernatural",TV,13,8.51,453630
706323,85,11757,10,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
710192,85,11759,9,Accel World,"Action, Game, Romance, School, Sci-Fi",TV,24,7.62,324284
125266,85,356,8,Fate stay night,"Action, Fantasy, Magic, Romance, Supernatural",TV,24,7.58,374880
280901,85,1535,8,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917


In [None]:
target_anime = 