In [3]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
pd.set_option("display.precision", 1)
import warnings
warnings.filterwarnings('ignore')

2023-03-04 18:29:02.389450: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 18:29:02.573125: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-04 18:29:02.573167: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-04 18:29:03.811455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

### Loading the dataset

In [4]:
# The anime.csv dataset can also be called the movies dataset
anime = pd.read_csv('anime.csv')
anime.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.3,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.2,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.2,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.2,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.2,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.1,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.1,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.1,81109


In [5]:
# looking at the shape
anime.shape

(12294, 7)

In [6]:
# checking for nan values in the dataset

anime.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
# dropping all nan_values in the dataset
anime = anime.dropna()

In [8]:
# confirming it's dropped.

anime.isna().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [9]:
# since some columns are dropped, it's best to reset the index to avoid key errors later
anime = anime.reset_index(drop=True)
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.3,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.2,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.2,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.2,151266
...,...,...,...,...,...,...,...
12012,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.2,211
12013,5543,Under World,Hentai,OVA,1,4.3,183
12014,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.9,219
12015,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,5.0,175


In [10]:
# it is discovered that some episodes are missing and they are filled with unknown
anime[anime['episodes']=='Unknown']

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.6,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.2,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.9,533578
991,966,Crayon Shin-chan,"Comedy, Ecchi, Kids, School, Shounen, Slice of...",TV,Unknown,7.7,26267
1021,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,Unknown,7.7,5400
...,...,...,...,...,...,...,...
11381,32238,"Watashi wa, Kairaku Izonshou",Hentai,OVA,Unknown,6.3,1569
11485,28169,Buta no Gotoki Sanzoku ni Torawarete Shojo wo ...,Hentai,OVA,Unknown,6.1,1992
11757,30770,Maid-san to Boin Damashii,"Harem, Hentai",OVA,Unknown,5.7,1498
11778,33125,Shiiku x Kanojo: Tenshi no Kousoku-hen,Hentai,OVA,Unknown,5.7,1326


In [11]:
# A suggestion is to fill the episodes with random numbers
def change(x):
    value = np.random.randint(1, 1000)
    if x == 'Unknown':
        return value
    else:
        return x
anime['episodes'] = anime['episodes'].apply(lambda x: change(x))

In [12]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.3,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.2,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.2,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.2,151266
...,...,...,...,...,...,...,...
12012,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.2,211
12013,5543,Under World,Hentai,OVA,1,4.3,183
12014,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.9,219
12015,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,5.0,175


In [13]:
# Now the user rating dataset
rating = pd.read_csv('rating.csv')
rating.head(10)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
5,1,355,-1
6,1,356,-1
7,1,442,-1
8,1,487,-1
9,1,846,-1


In [14]:
# dataset source shows that rating out of 10 this user has assigned 
# (-1 if the user watched it but didn't assign a rating).
# dropping -1.

rating = rating[rating['rating'] > 0]
rating

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [15]:
# checking the rating size we have
# we have close to 8 million rating, this is a large one.
rating.shape

(6337241, 3)

In [16]:
# considering the computational power that will be needed to process 6 millions rows,
# it is suggested to shuffle the rating data and pick 1.25 million from it.

rating = rating.sample(frac=1)
rating = rating[:1250000]
rating

Unnamed: 0,user_id,anime_id,rating
2526492,23978,12531,9
5141105,49078,1562,10
4784952,45659,9675,7
4172147,39549,7311,10
2174345,21068,1302,3
...,...,...,...
5913403,55242,13331,10
2347326,22536,5162,8
3518240,32567,47,9
6364571,59054,5117,7


In [17]:
# checking the unique users in the dataset
rating['user_id'].nunique()

62233

In [18]:
rating['anime_id'].nunique()

8501

In [19]:
# This shows that not all anime are rated. 

### Creating training sets

In [20]:
# it is necessary to have 2 different sets in the dataset
# The first set is the movies (anime) set, the other is the user set.add

# User set contains user features, Movie set contains movie features.

# Movie features will include movie genres, movie type, number of episodes, members and ratings. 
# User features will contain user average rating per genre since it's the only feature we can derive from the dataset


In [21]:
# since we have the same anime_id, we can as well merge the rating and anime datasets on the anime_id
# using all the rating dataset because i need all user's data
rating_ = pd.read_csv('rating.csv')
data = anime.merge(rating_, on='anime_id')
data.head(20)

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,99,5
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,152,10
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,244,10
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,271,10
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,278,-1
5,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,322,10
6,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,398,10
7,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,462,8
8,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,490,10
9,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.4,200630,548,10


##### Feature engineering

In [22]:
# starting the the genres column, since it is common to all.
# getting all genres in the anime dataset with a for loop

genres = []
for genre in anime['genre']:
    sep = genre.split(',')
    for i in sep:
        if i not in genres:
            genres.append(i)

In [23]:
print(genres)

['Drama', ' Romance', ' School', ' Supernatural', 'Action', ' Adventure', ' Drama', ' Fantasy', ' Magic', ' Military', ' Shounen', ' Comedy', ' Historical', ' Parody', ' Samurai', ' Sci-Fi', 'Sci-Fi', ' Thriller', 'Comedy', ' Sports', ' Super Power', ' Space', ' Slice of Life', ' Mecha', 'Adventure', ' Music', ' Mystery', ' Seinen', 'Fantasy', ' Martial Arts', ' Vampire', ' Shoujo', ' Horror', ' Police', ' Psychological', 'Mystery', 'Psychological', ' Demons', 'Ecchi', ' Josei', 'Josei', 'Military', 'Romance', ' Shounen Ai', ' Game', 'Demons', ' Ecchi', 'Dementia', ' Harem', 'Music', 'Game', ' Cars', ' Dementia', 'Cars', 'Mecha', 'Horror', 'School', ' Kids', 'Historical', 'Kids', ' Shoujo Ai', 'Shounen', 'Shoujo', 'Magic', 'Harem', 'Martial Arts', 'Sports', 'Slice of Life', 'Seinen', 'Parody', 'Police', 'Thriller', 'Supernatural', 'Samurai', 'Super Power', 'Vampire', 'Space', 'Hentai', ' Yaoi', ' Hentai', ' Yuri', 'Yaoi']


In [24]:
# The split is having spaces in some cases
def remove(string):
    return "".join(string.split())
genres = [remove(i) for i in genres]
print(genres)

['Drama', 'Romance', 'School', 'Supernatural', 'Action', 'Adventure', 'Drama', 'Fantasy', 'Magic', 'Military', 'Shounen', 'Comedy', 'Historical', 'Parody', 'Samurai', 'Sci-Fi', 'Sci-Fi', 'Thriller', 'Comedy', 'Sports', 'SuperPower', 'Space', 'SliceofLife', 'Mecha', 'Adventure', 'Music', 'Mystery', 'Seinen', 'Fantasy', 'MartialArts', 'Vampire', 'Shoujo', 'Horror', 'Police', 'Psychological', 'Mystery', 'Psychological', 'Demons', 'Ecchi', 'Josei', 'Josei', 'Military', 'Romance', 'ShounenAi', 'Game', 'Demons', 'Ecchi', 'Dementia', 'Harem', 'Music', 'Game', 'Cars', 'Dementia', 'Cars', 'Mecha', 'Horror', 'School', 'Kids', 'Historical', 'Kids', 'ShoujoAi', 'Shounen', 'Shoujo', 'Magic', 'Harem', 'MartialArts', 'Sports', 'SliceofLife', 'Seinen', 'Parody', 'Police', 'Thriller', 'Supernatural', 'Samurai', 'SuperPower', 'Vampire', 'Space', 'Hentai', 'Yaoi', 'Hentai', 'Yuri', 'Yaoi']


In [25]:
# some of the genres are repeated. removing them all
res = []
[res.append(x) for x in genres if x not in res]
genres = res
print(genres)

['Drama', 'Romance', 'School', 'Supernatural', 'Action', 'Adventure', 'Fantasy', 'Magic', 'Military', 'Shounen', 'Comedy', 'Historical', 'Parody', 'Samurai', 'Sci-Fi', 'Thriller', 'Sports', 'SuperPower', 'Space', 'SliceofLife', 'Mecha', 'Music', 'Mystery', 'Seinen', 'MartialArts', 'Vampire', 'Shoujo', 'Horror', 'Police', 'Psychological', 'Demons', 'Ecchi', 'Josei', 'ShounenAi', 'Game', 'Dementia', 'Harem', 'Cars', 'Kids', 'ShoujoAi', 'Hentai', 'Yaoi', 'Yuri']


In [26]:
len(genres)

43

#### Anime Feature

In [27]:
# filling all genres 
# creating a dataset with the genres size and anime size
genre_df = pd.DataFrame(index=np.arange(anime['anime_id'].nunique()), columns=np.arange(len(genres)))
# setting the columns name to the genres
genre_df.columns = genres
# siince it will be created with nan values automatically, filling the nan values with 0.
genre_df = genre_df.fillna(0)
genre_df.head()

Unnamed: 0,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,Shounen,...,ShounenAi,Game,Dementia,Harem,Cars,Kids,ShoujoAi,Hentai,Yaoi,Yuri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# checking the shape
genre_df.shape

(12017, 43)

In [29]:
# The anime feature needs the genres of each anime one hot encoded, since this can't be achieved automatically.
# it will be done manually
# Before one Hot encoding, we should have the anime id
genre_df.insert(loc=0, column='anime_id', value=anime['anime_id'].values)
genre_df.head()


Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,ShounenAi,Game,Dementia,Harem,Cars,Kids,ShoujoAi,Hentai,Yaoi,Yuri
0,32281,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### One Hot Encoding

In [30]:
for i in range(len(genre_df)):
    anime_genre = anime['genre'][i].split(',')
    anime_genre = [remove(i) for i in anime_genre]
    for genre in anime_genre:
        genre_df[genre][i] = 1


In [31]:
genre_df

Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,ShounenAi,Game,Dementia,Harem,Cars,Kids,ShoujoAi,Hentai,Yaoi,Yuri
0,32281,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,1,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,28977,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,9316,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12013,5543,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12014,5621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12015,6133,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [32]:
# let's confirm the last anime_id
anime.tail()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
12012,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.2,211
12013,5543,Under World,Hentai,OVA,1,4.3,183
12014,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.9,219
12015,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,5.0,175
12016,26081,Yasuji no Pornorama: Yacchimae!!,Hentai,Movie,1,5.5,142


##### Other features

In [33]:
# Looking at the other features, they can be easily soeted out
others = anime[['type', 'episodes', 'rating', 'members']]
anime_feat = pd.concat([genre_df, others], axis=1)
anime_feat

Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,Cars,Kids,ShoujoAi,Hentai,Yaoi,Yuri,type,episodes,rating,members
0,32281,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,Movie,1,9.4,200630
1,5114,1,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,TV,64,9.3,793665
2,28977,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,TV,51,9.2,114262
3,9253,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,TV,24,9.2,673572
4,9969,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,TV,51,9.2,151266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,9316,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,OVA,1,4.2,211
12013,5543,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,OVA,1,4.3,183
12014,5621,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,OVA,4,4.9,219
12015,6133,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,OVA,1,5.0,175


In [34]:
# One hot encoding the type column using dummy variables
anime_feat = pd.get_dummies(anime_feat, columns=['type'])
anime_feat

Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,Yuri,episodes,rating,members,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,32281,1,1,1,1,0,0,0,0,0,...,0,1,9.4,200630,1,0,0,0,0,0
1,5114,1,0,0,0,1,1,1,1,1,...,0,64,9.3,793665,0,0,0,0,0,1
2,28977,0,0,0,0,1,0,0,0,0,...,0,51,9.2,114262,0,0,0,0,0,1
3,9253,0,0,0,0,0,0,0,0,0,...,0,24,9.2,673572,0,0,0,0,0,1
4,9969,0,0,0,0,1,0,0,0,0,...,0,51,9.2,151266,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,9316,0,0,0,0,0,0,0,0,0,...,0,1,4.2,211,0,0,0,1,0,0
12013,5543,0,0,0,0,0,0,0,0,0,...,0,1,4.3,183,0,0,0,1,0,0
12014,5621,0,0,0,0,0,0,0,0,0,...,0,4,4.9,219,0,0,0,1,0,0
12015,6133,0,0,0,0,0,0,0,0,0,...,0,1,5.0,175,0,0,0,1,0,0


In [35]:
anime_feat.rename(columns={'type_Movie':'Movie', 'type_Music':'Music', 'type_ONA':'ONA', 'type_OVA':'OVA',
'type_Special':'Special', 'type_TV':'TV'}, inplace=True)
anime_feat

Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,Yuri,episodes,rating,members,Movie,Music,ONA,OVA,Special,TV
0,32281,1,1,1,1,0,0,0,0,0,...,0,1,9.4,200630,1,0,0,0,0,0
1,5114,1,0,0,0,1,1,1,1,1,...,0,64,9.3,793665,0,0,0,0,0,1
2,28977,0,0,0,0,1,0,0,0,0,...,0,51,9.2,114262,0,0,0,0,0,1
3,9253,0,0,0,0,0,0,0,0,0,...,0,24,9.2,673572,0,0,0,0,0,1
4,9969,0,0,0,0,1,0,0,0,0,...,0,51,9.2,151266,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,9316,0,0,0,0,0,0,0,0,0,...,0,1,4.2,211,0,0,0,1,0,0
12013,5543,0,0,0,0,0,0,0,0,0,...,0,1,4.3,183,0,0,0,1,0,0
12014,5621,0,0,0,0,0,0,0,0,0,...,0,4,4.9,219,0,0,0,1,0,0
12015,6133,0,0,0,0,0,0,0,0,0,...,0,1,5.0,175,0,0,0,1,0,0


In [36]:
anime_feat.columns

Index(['anime_id', 'Drama', 'Romance', 'School', 'Supernatural', 'Action',
       'Adventure', 'Fantasy', 'Magic', 'Military', 'Shounen', 'Comedy',
       'Historical', 'Parody', 'Samurai', 'Sci-Fi', 'Thriller', 'Sports',
       'SuperPower', 'Space', 'SliceofLife', 'Mecha', 'Music', 'Mystery',
       'Seinen', 'MartialArts', 'Vampire', 'Shoujo', 'Horror', 'Police',
       'Psychological', 'Demons', 'Ecchi', 'Josei', 'ShounenAi', 'Game',
       'Dementia', 'Harem', 'Cars', 'Kids', 'ShoujoAi', 'Hentai', 'Yaoi',
       'Yuri', 'episodes', 'rating', 'members', 'Movie', 'Music', 'ONA', 'OVA',
       'Special', 'TV'],
      dtype='object')

In [37]:
# Now the anime feature is complete.

#### User Features

In [38]:
# # creating a new dataframe called user preference.

# user_pre = pd.DataFrame(index=np.arange(data.user_id.nunique()), columns=np.arange(len(genres)))
# user_pre.columns = genres
# user_pre

In [39]:
# user_pre.columns 

In [40]:
# user_pre.insert(loc=0, column='user_id', value=data['user_id'].unique())
# user_pre

In [41]:
# user_index = 0
# # starting with unique users
# for user in user_pre['user_id']:
#     user_info = data[data['user_id'] == user]
#     # obviously a user can't rate an anime twice
#     # selecting the genre
#     genre_agg = 0
#     for genre in genres:
#         # setting initial genre aggregate as 0
#         # converting the anime_genres to a list
#         anime_genre = user_info['genre'].tolist()
#         # scalling through each string in the list
#         for selected_genre in anime_genre:
#             # spliting to get each genre
#             sep = selected_genre.split(',')
#             # removing spaces that comes from spliting ,using a function defined earlier
#             sep = [remove(i) for i in sep]
#             # eliminating possible genre repetition
#             res = []
#             [res.append(x) for x in sep if x not in res]
#             sep = res
#             # checking if genre is in the list of genre for each anime watched
#             if genre in sep:
#                 # adding 1 if it's there
#                 genre_agg += 1
#         # calculating the average of each genre on the scale of 10
#         genre_avg = genre_agg * 10/len(user_info)
#         # adding to the user_pre data for the genre and user
#         user_pre.loc[user_index, genre] = genre_avg
#         genre_agg = 0

#     user_index += 1

In [42]:
# user_pre

In [43]:
# user_pre.to_csv('user_preference.csv', index=False)

In [44]:
# Loading a prepared dataset
user_perf = pd.read_csv('user_preference.csv')
user_perf

Unnamed: 0,user_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,ShounenAi,Game,Dementia,Harem,Cars,Kids,ShoujoAi,Hentai,Yaoi,Yuri
0,99,4.3,3.1,1.7,3.3,3.9,1.9,2.8,1.1,0.5,...,0.0e+00,0.2,3.8e-01,0.5,7.6e-02,0.5,0.0,0.0,0.0,0.0e+00
1,152,3.2,2.1,2.7,2.8,2.9,1.7,2.4,1.0,0.7,...,0.0e+00,0.3,1.4e-01,0.3,7.0e-02,0.0,0.2,0.3,0.2,0.0e+00
2,244,2.5,3.9,3.7,2.9,3.8,1.7,2.8,0.9,0.1,...,0.0e+00,0.5,0.0e+00,1.5,0.0e+00,0.0,0.0,0.0,0.0,3.6e-02
3,271,1.6,2.7,3.0,2.1,3.0,1.6,2.6,0.9,0.1,...,3.3e-02,0.2,1.7e-02,1.3,0.0e+00,0.9,0.2,0.3,0.0,5.0e-02
4,278,3.9,2.5,2.3,3.4,3.4,1.4,1.8,1.8,0.0,...,0.0e+00,0.5,0.0e+00,0.0,0.0e+00,0.0,0.0,0.0,0.0,0.0e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73510,14133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0e+00,0.0,0.0e+00,0.0,0.0e+00,0.0,0.0,10.0,10.0,0.0e+00
73511,40914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0e+00,0.0,0.0e+00,0.0,0.0e+00,0.0,0.0,10.0,10.0,0.0e+00
73512,40965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0e+00,0.0,0.0e+00,0.0,0.0e+00,0.0,0.0,10.0,10.0,0.0e+00
73513,55932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0e+00,0.0,0.0e+00,0.0,0.0e+00,0.0,0.0,10.0,10.0,0.0e+00


In [45]:
# Now we have user data
# merging the rating and user_pref dataframe to see user's rating per anime


user_rating = rating.merge(user_perf, on='user_id')
user_rating

Unnamed: 0,user_id,anime_id,rating,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,...,ShounenAi,Game,Dementia,Harem,Cars,Kids,ShoujoAi,Hentai,Yaoi,Yuri
0,23978,12531,9,3.2,1.4,0.7,1.1,5.6,6.4,4.6,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
1,23978,3927,8,3.2,1.4,0.7,1.1,5.6,6.4,4.6,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
2,23978,1535,9,3.2,1.4,0.7,1.1,5.6,6.4,4.6,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
3,23978,2889,5,3.2,1.4,0.7,1.1,5.6,6.4,4.6,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
4,23978,9314,10,3.2,1.4,0.7,1.1,5.6,6.4,4.6,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249995,72193,19,10,4.1,0.0,1.8,3.5,5.3,1.8,3.5,...,0.0,0.6,1.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1249996,20087,7791,10,2.5,2.5,1.7,1.7,1.7,2.5,0.8,...,1.7,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.8,0.0
1249997,72144,19163,6,2.5,5.0,3.3,3.3,5.8,0.0,0.8,...,0.0,0.0,0.0,3.3,0.0,0.0,0.0,0.0,0.0,0.8
1249998,2146,11741,3,4.3,0.0,0.0,5.7,7.1,0.0,4.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
rated_anime_id = rating['anime_id'].unique()
rated_anime_id

array([12531,  1562,  9675, ...,  5299, 30173,  6513])

In [47]:
# # saving this dataframe
# user_rating.to_csv('User_rating.csv', index=False)

In [48]:
rated_anime_id = rating['anime_id'].unique()
rated_anime_id

array([12531,  1562,  9675, ...,  5299, 30173,  6513])

In [49]:
rated_anime = anime_feat[anime_feat['anime_id'].isin(rated_anime_id)]
rated_anime

Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,Yuri,episodes,rating,members,Movie,Music,ONA,OVA,Special,TV
0,32281,1,1,1,1,0,0,0,0,0,...,0,1,9.4,200630,1,0,0,0,0,0
1,5114,1,0,0,0,1,1,1,1,1,...,0,64,9.3,793665,0,0,0,0,0,1
2,28977,0,0,0,0,1,0,0,0,0,...,0,51,9.2,114262,0,0,0,0,0,1
3,9253,0,0,0,0,0,0,0,0,0,...,0,24,9.2,673572,0,0,0,0,0,1
4,9969,0,0,0,0,1,0,0,0,0,...,0,51,9.2,151266,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11981,5569,0,0,0,0,0,0,0,0,0,...,0,1,3.0,934,0,0,0,1,0,0
11982,18483,0,0,0,0,0,0,0,0,0,...,0,1,3.2,214,0,0,0,1,0,0
12004,12397,0,0,0,0,0,0,0,0,0,...,0,2,4.7,176,0,0,0,1,0,0
12005,17833,0,0,0,0,0,0,0,0,0,...,0,1,3.6,138,0,0,0,1,0,0


In [50]:
# creating a new dataframe for rated animes.
anime_rating = rating.merge(rated_anime, on='anime_id')
anime_rating

Unnamed: 0,user_id,anime_id,rating_x,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,...,Yuri,episodes,rating_y,members,Movie,Music,ONA,OVA,Special,TV
0,23978,12531,9,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
1,23769,12531,9,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
2,68296,12531,9,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
3,10923,12531,10,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
4,13245,12531,10,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249979,58064,5192,5,0,0,0,0,0,1,0,...,0,1,6.3,142,0,0,0,0,1,0
1249980,68465,8999,9,0,0,0,0,1,1,0,...,0,52,7.2,143,0,0,0,0,0,1
1249981,53492,5299,5,0,0,0,0,0,0,0,...,0,1,5.1,413,1,0,0,0,0,0
1249982,62710,30173,8,0,0,0,0,1,0,0,...,0,13,5.5,134,0,0,0,0,0,1


In [51]:
# rating_x is the user rating, rating_y is the anime rating. It's best to raname to avoid issues 
anime_rating.rename(columns={'rating_x':'user_rating', 'rating_y':'anime_rating'}, inplace=True)
anime_rating

Unnamed: 0,user_id,anime_id,user_rating,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,...,Yuri,episodes,anime_rating,members,Movie,Music,ONA,OVA,Special,TV
0,23978,12531,9,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
1,23769,12531,9,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
2,68296,12531,9,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
3,10923,12531,10,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
4,13245,12531,10,1,1,1,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249979,58064,5192,5,0,0,0,0,0,1,0,...,0,1,6.3,142,0,0,0,0,1,0
1249980,68465,8999,9,0,0,0,0,1,1,0,...,0,52,7.2,143,0,0,0,0,0,1
1249981,53492,5299,5,0,0,0,0,0,0,0,...,0,1,5.1,413,1,0,0,0,0,0
1249982,62710,30173,8,0,0,0,0,1,0,0,...,0,13,5.5,134,0,0,0,0,0,1


In [52]:
# dropping thr user_id and user_rating column
anime_rating = anime_rating.drop(['user_id', 'user_rating'], axis=1)
anime_rating

Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,Yuri,episodes,anime_rating,members,Movie,Music,ONA,OVA,Special,TV
0,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
1,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
2,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
3,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
4,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249979,5192,0,0,0,0,0,1,0,0,0,...,0,1,6.3,142,0,0,0,0,1,0
1249980,8999,0,0,0,0,1,1,0,0,0,...,0,52,7.2,143,0,0,0,0,0,1
1249981,5299,0,0,0,0,0,0,0,0,0,...,0,1,5.1,413,1,0,0,0,0,0
1249982,30173,0,0,0,0,1,0,0,0,0,...,0,13,5.5,134,0,0,0,0,0,1


In [53]:
# # saving this dataframe
# anime_rating.to_csv('anime_rating.csv', index=False)

In [54]:
# A necessary condition is that the final user feature and anime feature have the same size
# Hence setting both to 1.5m

In [55]:
user_final = user_rating[:1200000]
anime_final = anime_rating[:1200000]

In [56]:
# setting rating column as the target variable
target = user_final['rating']
# dropping the rating column
user_final = user_final.drop(['rating', 'anime_id'], axis=1)
user_final

Unnamed: 0,user_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,ShounenAi,Game,Dementia,Harem,Cars,Kids,ShoujoAi,Hentai,Yaoi,Yuri
0,23978,3.2,1.4,0.7,1.1,5.6,6.4,4.6,0.7,1.4,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
1,23978,3.2,1.4,0.7,1.1,5.6,6.4,4.6,0.7,1.4,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
2,23978,3.2,1.4,0.7,1.1,5.6,6.4,4.6,0.7,1.4,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
3,23978,3.2,1.4,0.7,1.1,5.6,6.4,4.6,0.7,1.4,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
4,23978,3.2,1.4,0.7,1.1,5.6,6.4,4.6,0.7,1.4,...,0.0,0.4,0.1,0.0,0.0,0.9,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,8102,3.6,4.0,2.1,4.0,5.5,3.0,4.3,1.3,1.5,...,0.0,0.0,0.0,1.7,0.0,0.4,0.0,0.0,0.0,0.0
1199996,8102,3.6,4.0,2.1,4.0,5.5,3.0,4.3,1.3,1.5,...,0.0,0.0,0.0,1.7,0.0,0.4,0.0,0.0,0.0,0.0
1199997,8102,3.6,4.0,2.1,4.0,5.5,3.0,4.3,1.3,1.5,...,0.0,0.0,0.0,1.7,0.0,0.4,0.0,0.0,0.0,0.0
1199998,8102,3.6,4.0,2.1,4.0,5.5,3.0,4.3,1.3,1.5,...,0.0,0.0,0.0,1.7,0.0,0.4,0.0,0.0,0.0,0.0


In [57]:
anime_final 

Unnamed: 0,anime_id,Drama,Romance,School,Supernatural,Action,Adventure,Fantasy,Magic,Military,...,Yuri,episodes,anime_rating,members,Movie,Music,ONA,OVA,Special,TV
0,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
1,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
2,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
3,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
4,12531,1,1,1,0,0,0,0,0,0,...,0,12,8.5,146592,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,17020,1,1,1,0,0,0,0,0,0,...,0,1,6.5,3008,0,0,0,0,1,0
1199996,17020,1,1,1,0,0,0,0,0,0,...,0,1,6.5,3008,0,0,0,0,1,0
1199997,17020,1,1,1,0,0,0,0,0,0,...,0,1,6.5,3008,0,0,0,0,1,0
1199998,17020,1,1,1,0,0,0,0,0,0,...,0,1,6.5,3008,0,0,0,0,1,0


### Preprocessing

In [58]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scaling the data

user_scaler = StandardScaler()

user_scaler = user_scaler.fit(user_final)
scaled_user_data = user_scaler.transform(user_final)

anime_scaler = StandardScaler()
anime_scaler = anime_scaler.fit(anime_final)
scaled_anime_data = anime_scaler.transform(anime_final)

minmax = MinMaxScaler((-1,1))
minmax.fit(target.values.reshape(-1, 1))
y_train = minmax.transform(target.values.reshape(-1, 1))

In [59]:
scaled_user_data

array([[-0.6093042 ,  0.53105489, -1.54675482, ..., -0.25667414,
        -0.2211498 , -0.30039361],
       [-0.6093042 ,  0.53105489, -1.54675482, ..., -0.25667414,
        -0.2211498 , -0.30039361],
       [-0.6093042 ,  0.53105489, -1.54675482, ..., -0.25667414,
        -0.2211498 , -0.30039361],
       ...,
       [-1.3652639 ,  1.04449086,  0.73344895, ..., -0.25667414,
        -0.2211498 , -0.30039361],
       [-1.3652639 ,  1.04449086,  0.73344895, ..., -0.25667414,
        -0.2211498 , -0.30039361],
       [-1.3652639 ,  1.04449086,  0.73344895, ..., -0.25667414,
        -0.2211498 , -0.30039361]])

In [60]:
scaled_user_data.shape

(1200000, 44)

In [61]:
scaled_anime_data

array([[ 0.40994252,  1.61598641,  1.43324963, ..., -0.30928186,
        -0.25797481,  0.64764813],
       [ 0.40994252,  1.61598641,  1.43324963, ..., -0.30928186,
        -0.25797481,  0.64764813],
       [ 0.40994252,  1.61598641,  1.43324963, ..., -0.30928186,
        -0.25797481,  0.64764813],
       ...,
       [ 0.91664653,  1.61598641,  1.43324963, ..., -0.30928186,
         3.87634739, -1.54404832],
       [ 0.91664653,  1.61598641,  1.43324963, ..., -0.30928186,
         3.87634739, -1.54404832],
       [ 0.91664653,  1.61598641,  1.43324963, ..., -0.30928186,
         3.87634739, -1.54404832]])

In [62]:
y_train

array([[ 0.77777778],
       [ 0.55555556],
       [ 0.77777778],
       ...,
       [ 0.77777778],
       [-0.33333333],
       [-0.11111111]])

In [63]:
# splitting the data into train and test set
from sklearn.model_selection import train_test_split

user_train, user_test = train_test_split(scaled_user_data, test_size=0.20, random_state=42)
anime_train, anime_test = train_test_split(scaled_anime_data, test_size=0.20, random_state=42)
target_train, target_test = train_test_split(y_train, test_size=0.20, random_state=42)

user_train.shape, anime_train.shape, target_train.shape, user_test.shape, anime_test.shape, target_test.shape

((960000, 44),
 (960000, 53),
 (960000, 1),
 (240000, 44),
 (240000, 53),
 (240000, 1))

### Content Based Filtering with Neural Network

In [64]:
# num_outputs = 32
# tf.random.set_seed(1)
# user_NN = tf.keras.models.Sequential([   
#     tf.keras.layers.Dense(256, activation='relu'),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dense(num_outputs)
# ])

# anime_NN = tf.keras.models.Sequential([   
#     tf.keras.layers.Dense(256, activation='relu'),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dense(num_outputs)
# ])

# # create the user input and point to the base network
# input_user = tf.keras.layers.Input(shape=(user_train.shape[1]))
# vu = user_NN(input_user)
# print(vu.shape)
# vu = tf.linalg.l2_normalize(vu, axis=1)


# # create the anime input and point to the base network
# input_anime = tf.keras.layers.Input(shape=(anime_train.shape[1]))
# vm = anime_NN(input_anime)
# print(vm.shape)
# vm = tf.linalg.l2_normalize(vm, axis=1)


# # compute the dot product of the two vectors vu and vm
# output = tf.keras.layers.Dot(axes=1)([vu, vm])

# # specify the inputs and output of the model
# model_1 = tf.keras.Model([input_user, input_anime], output)

# model_1.summary()

In [65]:
# # compiling the model
# tf.random.set_seed(1)
# cost_fn = tf.keras.losses.MeanSquaredError()
# opt = keras.optimizers.Adam(learning_rate=0.03)
# model_1.compile(optimizer=opt,
#               loss=cost_fn)

In [66]:
# # training the model
# tf.random.set_seed(1)
# model_1.fit([user_train, anime_train], target_train, epochs=10)

In [67]:
# model_1.evaluate([user_test, anime_test], target_test)

In [68]:
# model_1.save('New_model1')

In [69]:
model_1 = keras.models.load_model('New_model1')
model_1

2023-03-04 18:29:55.242167: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-04 18:29:55.242552: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-04 18:29:55.245400: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-04 18:29:55.245574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2023-03-04 18:29:55.245679: W tensorflow/c

<keras.engine.functional.Functional at 0x7fe7d23d1dc0>

#### New User Prediction

In [70]:
# The imagination is that, the new user will select anime genres prefrences. 
# Considering that the user chose those genres, the average rating given automatically to the genres will be 10
# Two datasets will be formed, the first will be that of user preference, the secod will be for the anime


In [91]:
def New_User(interest, anime_type):
    likes = interest + list(anime_type)
    sep = likes
    # removing spaces that comes from spliting ,using a function defined earlier
    sep = [remove(i) for i in sep]
    # eliminating possible genre repetition
    res = []
    [res.append(x) for x in sep if x not in res]
    liked = res
    # just in case there's a wrong input ,it's best to filter things out.
    likes = [i for i in liked if i in anime_feat.columns[1:]]
    types = ['Movie', 'Music','ONA', 'OVA', 'Special', 'TV']
    new_user = pd.DataFrame(index=np.arange(1), columns=np.arange(len(genres)))
    # setting the columns name to the genres
    new_user.columns = genres
    # siince it will be created with nan values automatically, filling the nan values with 0.
    new_user = new_user.fillna(0)
    newid = max(rating['user_id'].values) + 1
    new_user.insert(loc=0, column='user_id', value=newid)
    for k in likes:
        # auto capitalising
        cap = k.capitalize()
        if cap in new_user.columns:
            # setting it to 10 since user preferred it
            new_user[cap] = 10


    # -------------------------------------------------------------------------------------------

    # Anime feature
    new_user_anime = pd.DataFrame(index=np.arange(1), columns=np.arange(len(anime_feat.columns)))
    new_user_anime.columns = anime_feat.columns
    new_user_anime = new_user_anime.fillna(0)
    # Filling with values
    for i in likes:
        # auto capitalising
        cap = i.capitalize()
        if cap in new_user_anime.columns:
            new_user_anime[cap] = 1
        elif i in types:
            new_user_anime[i] = 1

    # keeping anime IDs
    ids = anime_feat['anime_id']
    # removing other features from the anime feat dataset
    search_anime_feat = anime_feat.drop(['anime_id', 'episodes', 'rating', 'members'], axis=1)
    # doing the same here.
    new_user_anime = new_user_anime.drop(['anime_id', 'episodes', 'rating', 'members'], axis=1)
    # converting them to arrays
    new_user_anime_arr = new_user_anime.values
    search_anime_feat = search_anime_feat.values

    # a list for Euclidean distance
    ED = []
    for feat in search_anime_feat:
        dist = np.linalg.norm(new_user_anime_arr - feat)
        ED.append(dist)

    # creating a dataframe for the Ids and distance
    distance = pd.DataFrame()
    distance['anime_id'] = ids
    distance['euc_dist'] = ED

    # sorting the result based on ascending order of the euc distance
    distance = distance.sort_values(by='euc_dist', ascending=True)
    # fetching the top 30 anime ids
    selected_animes = distance['anime_id'][:30]
    # extracting selected animes from anime feat
    selection = anime_feat[anime_feat['anime_id'].isin(selected_animes)]

    # -----------------------------------
    # now creating the user and anime features. starting with getting their arrays
    new_user_feat = new_user.values
    new_user_anime_feat = selection.values
    # since the selected anime size is 30, to avoid creating a loop when using the model it is suggested to use np.repeat to make the user shape like that of the anime
    new_user_feat = np.repeat(new_user_feat, len(selection), axis=0)


    # ----------------------------------------
    # preprocessing the data we have 
    prep_user_feat = user_scaler.transform(new_user_feat)
    prep_anime_feat = anime_scaler.transform(new_user_anime_feat)

    # -------------------------------------------
    # now passing it to our model
    model_prediction = model_1.predict([prep_user_feat, prep_anime_feat])
    # this result is in the scaled format, it needs to be reversed
    prediction = minmax.inverse_transform(model_prediction)
    # now we have our prediction, it should be presented in a dataframe
    # considering the fact that it was created according to it's proportion in the anime order, we can retrieve that first
    result = anime[anime['anime_id'].isin(selection['anime_id'].tolist())]
    # renaming rating column as anime rating
    result.rename(columns={'rating':'anime_rating'}, inplace=True)
    result['predicted_rating'] = prediction
    result = result.sort_values(by='predicted_rating', ascending=False)
    return result


In [92]:
print(genres)

['Drama', 'Romance', 'School', 'Supernatural', 'Action', 'Adventure', 'Fantasy', 'Magic', 'Military', 'Shounen', 'Comedy', 'Historical', 'Parody', 'Samurai', 'Sci-Fi', 'Thriller', 'Sports', 'SuperPower', 'Space', 'SliceofLife', 'Mecha', 'Music', 'Mystery', 'Seinen', 'MartialArts', 'Vampire', 'Shoujo', 'Horror', 'Police', 'Psychological', 'Demons', 'Ecchi', 'Josei', 'ShounenAi', 'Game', 'Dementia', 'Harem', 'Cars', 'Kids', 'ShoujoAi', 'Hentai', 'Yaoi', 'Yuri']


In [93]:
New_User(['Drama'],'TV')



Unnamed: 0,anime_id,name,genre,type,episodes,anime_rating,members,predicted_rating
8309,33862,Chichi Kaeru,Drama,ONA,1,4.6,41,8.5
7273,32588,Meow no Hoshi,Drama,OVA,1,5.6,212,8.3
7193,7374,100%,Drama,OVA,1,5.6,363,8.3
4009,7229,Black Jack ONA,Drama,ONA,12,6.8,1297,8.3
8336,17689,Chironup no Kitsune,Drama,Movie,1,7.5,196,8.3
2014,10448,Rain Town,Drama,ONA,1,7.4,12489,8.3
8463,30935,Douwa Mondai to Jinken: Anata wa Dou Kangaemasuka,Drama,OVA,1,8.0,42,8.3
10096,29675,Sarasoujuu no Hana no Iro,Drama,Movie,1,3.8,97,8.2
10187,30045,Shin Saru Kani Gassen,Drama,Movie,1,4.4,86,8.2
9714,29900,Nobara,Drama,Movie,1,4.6,70,8.2


In [77]:
v  = ['Drama']
v.append('TV')

In [78]:
v

['Drama', 'TV']