In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("/content/anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
df.shape

(12294, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
# handling null values
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [6]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [7]:
num_col=[i for i in df.columns if df[i].dtype!='O']
num_col

['anime_id', 'rating', 'members']

In [8]:
cat_col=[i for i in df.columns if df[i].dtype=='O']
cat_col

['name', 'genre', 'type', 'episodes']

In [9]:
# remove missing value from genre & type column
df.dropna(subset=['genre', 'type'], inplace=True)

In [10]:
# replace missing value with mean in rating column
df['rating'].fillna(df['rating'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mean(), inplace=True)


In [11]:
# checking missing values
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


# Feature Extraction

In [12]:
df.groupby('name')['rating'].mean().sort_values(ascending = False)

Unnamed: 0_level_0,rating
name,Unnamed: 1_level_1
Taka no Tsume 8: Yoshida-kun no X-Files,10.00
Spoon-hime no Swing Kitchen,9.60
Mogura no Motoro,9.50
Kimi no Na wa.,9.37
Kahei no Umi,9.33
...,...
Hametsu no Mars,2.37
Utsu Musume Sayuri,2.14
Tenkuu Danzai Skelter+Heaven,2.00
Hi Gekiga Ukiyoe Senya Ichiya,1.92


In [13]:
def clean_genre_column(genres):
    return ','.join(
        [label.strip().lower().replace("'","").replace('"','').replace('[','').replace(']','')
         for label in genres.split(',')]
    )
# strip(): remove whitespace
# lower(): converts the labels to lowercase
# split(',') split the input string from the genre column into a list of labels

In [14]:
df['genre'] = df['genre'].apply(clean_genre_column)

In [15]:
df['genre'] = df['genre'].str.split(',')

genre_labels = df['genre'].explode().unique()
print(f"Genre labels:,{genre_labels}")

Genre labels:,['drama' 'romance' 'school' 'supernatural' 'action' 'adventure' 'fantasy'
 'magic' 'military' 'shounen' 'comedy' 'historical' 'parody' 'samurai'
 'sci-fi' 'thriller' 'sports' 'super power' 'space' 'slice of life'
 'mecha' 'music' 'mystery' 'seinen' 'martial arts' 'vampire' 'shoujo'
 'horror' 'police' 'psychological' 'demons' 'ecchi' 'josei' 'shounen ai'
 'game' 'dementia' 'harem' 'cars' 'kids' 'shoujo ai' 'hentai' 'yaoi'
 'yuri']


In [16]:
len(genre_labels)

43

In [17]:
# create a seprate row for each unique genre
df_cleaned = df.explode('genre')

In [18]:
df_cleaned

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,drama,Movie,1,9.37,200630
0,32281,Kimi no Na wa.,romance,Movie,1,9.37,200630
0,32281,Kimi no Na wa.,school,Movie,1,9.37,200630
0,32281,Kimi no Na wa.,supernatural,Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,action,TV,64,9.26,793665
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,hentai,OVA,1,4.15,211
12290,5543,Under World,hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,hentai,OVA,1,4.98,175


In [19]:
# one hot encode the genre column
one_hot = pd.get_dummies(df_cleaned['genre']).groupby(df_cleaned.index).max()
one_hot

Unnamed: 0,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,game,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,True,True,False,False,False,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,True,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12290,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12291,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12292,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
one_hot_binary = one_hot.astype(int)
one_hot_binary

Unnamed: 0,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,game,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df = pd.concat([df.drop(columns=['genre']), one_hot_binary], axis = 1)
df

Unnamed: 0,anime_id,name,type,episodes,rating,members,action,adventure,cars,comedy,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,51,9.25,114262,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,24,9.17,673572,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama&#039;,TV,51,9.16,151266,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,OVA,1,4.15,211,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,Under World,OVA,1,4.28,183,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,Violence Gekiga David no Hoshi,OVA,4,4.88,219,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,OVA,1,4.98,175,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df.drop(columns=['anime_id','type','episodes'], inplace = True)

In [23]:
df.head()

Unnamed: 0,name,rating,members,action,adventure,cars,comedy,dementia,demons,drama,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,Kimi no Na wa.,9.37,200630,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,9.26,793665,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,9.25,114262,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,9.17,673572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Gintama&#039;,9.16,151266,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# normalize features by using minmnaxscaler
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
df['rating'] = minmax.fit_transform(df[['rating']])
df['members'] = minmax.fit_transform(df[['members']])

In [25]:
df.head()

Unnamed: 0,name,rating,members,action,adventure,cars,comedy,dementia,demons,drama,...,shounen ai,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri
0,Kimi no Na wa.,0.92437,0.197872,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,0.911164,0.78277,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,0.909964,0.112689,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,0.90036,0.664325,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Gintama&#039;,0.89916,0.149186,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df['rating'].unique()

array([0.92436975, 0.91116447, 0.90996399, 0.90036014, 0.89915966,
       0.89795918, 0.89555822, 0.89315726, 0.89195678, 0.88715486,
       0.88595438, 0.8847539 , 0.87755102, 0.87154862, 0.87034814,
       0.86554622, 0.8607443 , 0.85954382, 0.85834334, 0.85714286,
       0.85594238, 0.85354142, 0.85234094, 0.85114046, 0.84993998,
       0.8487395 , 0.84753902, 0.84633854, 0.84513806, 0.84273709,
       0.84153661, 0.84033613, 0.83913565, 0.83793517, 0.83673469,
       0.83433373, 0.83313325, 0.83193277, 0.83073229, 0.82953181,
       0.82833133, 0.82713085, 0.82593037, 0.82472989, 0.82352941,
       0.82232893, 0.82112845, 0.81992797, 0.81872749, 0.81752701,
       0.81632653, 0.81512605, 0.81392557, 0.81272509, 0.81152461,
       0.81032413, 0.80912365, 0.80792317, 0.80672269, 0.80552221,
       0.80432173, 0.80312125, 0.80192077, 0.80072029, 0.79951981,
       0.79831933, 0.79711885, 0.79591837, 0.79471789, 0.79351741,
       0.79231693, 0.79111645, 0.78991597, 0.78871549, 0.78751

In [27]:
df['members'].unique()

array([1.97872202e-01, 7.82770102e-01, 1.12689267e-01, ...,
       2.70299592e-02, 5.65630942e-02, 6.38122441e-04])

In [28]:
df.columns

Index(['name', 'rating', 'members', 'action', 'adventure', 'cars', 'comedy',
       'dementia', 'demons', 'drama', 'ecchi', 'fantasy', 'game', 'harem',
       'hentai', 'historical', 'horror', 'josei', 'kids', 'magic',
       'martial arts', 'mecha', 'military', 'music', 'mystery', 'parody',
       'police', 'psychological', 'romance', 'samurai', 'school', 'sci-fi',
       'seinen', 'shoujo', 'shoujo ai', 'shounen', 'shounen ai',
       'slice of life', 'space', 'sports', 'super power', 'supernatural',
       'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype='object')

In [29]:
df_melted = df.melt(
    id_vars=['name','rating', 'members'],  # Columns to keep as they are
    value_vars=[col for col in df.columns if col not in ['name', 'rating', 'members']],
    var_name='genre',  # Name for the new column that contain names(values) of the melted column
    value_name='present' # Binary value from the melted columns for genre
)

#df.melt(): function used to convert a dataframe from wide format to long format
# the wide format means that genre are spread across multiple column,
# the long format means that each genre get separates with curresponding values


In [30]:
df_melted

Unnamed: 0,name,rating,members,genre,present
0,Kimi no Na wa.,0.924370,0.197872,action,0
1,Fullmetal Alchemist: Brotherhood,0.911164,0.782770,action,1
2,Gintama°,0.909964,0.112689,action,1
3,Steins;Gate,0.900360,0.664325,action,0
4,Gintama&#039;,0.899160,0.149186,action,1
...,...,...,...,...,...
525025,Toushindai My Lover: Minami tai Mecha-Minami,0.297719,0.000203,yuri,0
525026,Under World,0.313325,0.000176,yuri,0
525027,Violence Gekiga David no Hoshi,0.385354,0.000211,yuri,0
525028,Violence Gekiga Shin David no Hoshi: Inma Dens...,0.397359,0.000168,yuri,0


In [31]:
df_filter = df_melted[df_melted['present'] == 1] # include only rows where the present column is 1

df_pivot = df_filter.pivot_table(
    index='name',     # contains a each unique name
    columns='genre',  # each unique genre will form a column in dataframe
    values='rating',  # contain values of the rating column
    aggfunc='mean'    # calculate average of multiple ratings present ratings column
)

In [32]:
df_filter

Unnamed: 0,name,rating,members,genre,present
1,Fullmetal Alchemist: Brotherhood,0.911164,0.782770,action,1
2,Gintama°,0.909964,0.112689,action,1
4,Gintama&#039;,0.899160,0.149186,action,1
6,Hunter x Hunter (2011),0.895558,0.420007,action,1
8,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,0.891957,0.071534,action,1
...,...,...,...,...,...
524882,Houkago Mania Club: Koi no Hoshii no,0.454982,0.001682,yuri,1
524911,Soushitsukyou,0.445378,0.001072,yuri,1
524917,Star☆Jewel Gaiden: Natsumi Oblivion,0.441777,0.000866,yuri,1
524940,My Life As,0.428571,0.000762,yuri,1


In [33]:
df_pivot = df_pivot.merge(
    df[['name', 'members']].drop_duplicates(), #removes any duplicate rows present in name & member column
    left_index=True,
    right_on='name',
    how='left'

)
df_pivot.set_index('name',inplace= True)


In [34]:
df_pivot.head()

Unnamed: 0_level_0,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,game,...,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri,members
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,,,,,,,,,,,...,,,,,,,,,,0.001149
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",,,,,,,,,,,...,,,,,,,,,,0.000107
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,,,,0.647059,,,,,0.647059,,...,,,,,,,,,,0.014149
&quot;Bungaku Shoujo&quot; Memoire,,,,,,,0.704682,,,,...,,,,,,,,,,0.017761
&quot;Bungaku Shoujo&quot; Movie,,,,,,,0.715486,,,,...,,,,,,,,,,0.040417


In [35]:
df.columns

Index(['name', 'rating', 'members', 'action', 'adventure', 'cars', 'comedy',
       'dementia', 'demons', 'drama', 'ecchi', 'fantasy', 'game', 'harem',
       'hentai', 'historical', 'horror', 'josei', 'kids', 'magic',
       'martial arts', 'mecha', 'military', 'music', 'mystery', 'parody',
       'police', 'psychological', 'romance', 'samurai', 'school', 'sci-fi',
       'seinen', 'shoujo', 'shoujo ai', 'shounen', 'shounen ai',
       'slice of life', 'space', 'sports', 'super power', 'supernatural',
       'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype='object')

In [36]:
df_pivot.fillna(0,axis=1,inplace = True)

In [37]:
df_pivot.head()

Unnamed: 0_level_0,action,adventure,cars,comedy,dementia,demons,drama,ecchi,fantasy,game,...,slice of life,space,sports,super power,supernatural,thriller,vampire,yaoi,yuri,members
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001149
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000107
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.647059,0.0,0.0,0.0,0.0,0.647059,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014149
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.704682,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017761
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.715486,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040417


In [38]:
df_pivot.index

Index(['&quot;0&quot;',
       '&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu',
       '&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi',
       '&quot;Bungaku Shoujo&quot; Memoire',
       '&quot;Bungaku Shoujo&quot; Movie', '&quot;Eiji&quot;',
       '&quot;Eiyuu&quot; Kaitai', '.hack//G.U. Returner',
       '.hack//G.U. Trilogy', '.hack//G.U. Trilogy: Parody Mode',
       ...
       's.CRY.ed', 'vivi', 'xxxHOLiC', 'xxxHOLiC Kei',
       'xxxHOLiC Movie: Manatsu no Yoru no Yume', 'xxxHOLiC Rou',
       'xxxHOLiC Shunmuki', 'Üks Uks', 'ēlDLIVE', '◯'],
      dtype='object', name='name', length=12210)

In [39]:
# reset index column for further anlysis
if 'index' not in df.columns:           # checks whether there is no column named index
    df = df.reset_index()               # if not reset the column

# Recommendation System

# Cosine Similarity

In [40]:
similarity = cosine_similarity(df_pivot)
similarity

array([[1.00000000e+00, 7.52304188e-07, 3.56418960e-05, ...,
        3.14435297e-07, 1.93236496e-05, 7.07106452e-01],
       [7.52304188e-07, 1.00000000e+00, 3.36368880e-06, ...,
        2.96746976e-08, 1.82366122e-06, 2.68916413e-07],
       [3.56418960e-05, 3.36368880e-06, 1.00000000e+00, ...,
        1.40589738e-06, 8.63995505e-05, 1.27404459e-05],
       ...,
       [3.14435297e-07, 2.96746976e-08, 1.40589738e-06, ...,
        1.00000000e+00, 7.62222870e-07, 7.07106529e-01],
       [1.93236496e-05, 1.82366122e-06, 8.63995505e-05, ...,
        7.62222870e-07, 1.00000000e+00, 6.90737419e-06],
       [7.07106452e-01, 2.68916413e-07, 1.27404459e-05, ...,
        7.07106529e-01, 6.90737419e-06, 1.00000000e+00]])

In [41]:
similarity.shape

(12210, 12210)

In [42]:
# enter the movie
anime_name = input("Enter your favourite anime name:")

Enter your favourite anime name:one piece


In [43]:
# creating a list with all the movie names in the dataset
list_of_all_the_anime_name = df['name'].tolist()
list_of_all_the_anime_name

['Kimi no Na wa.',
 'Fullmetal Alchemist: Brotherhood',
 'Gintama°',
 'Steins;Gate',
 'Gintama&#039;',
 'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou',
 'Hunter x Hunter (2011)',
 'Ginga Eiyuu Densetsu',
 'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare',
 'Gintama&#039;: Enchousen',
 'Clannad: After Story',
 'Koe no Katachi',
 'Gintama',
 'Code Geass: Hangyaku no Lelouch R2',
 'Haikyuu!! Second Season',
 'Sen to Chihiro no Kamikakushi',
 'Shigatsu wa Kimi no Uso',
 'Mushishi Zoku Shou 2nd Season',
 'Ookami Kodomo no Ame to Yuki',
 'Code Geass: Hangyaku no Lelouch',
 'Hajime no Ippo',
 'Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen',
 'Cowboy Bebop',
 'One Punch Man',
 'Mononoke Hime',
 'Suzumiya Haruhi no Shoushitsu',
 'Monogatari Series: Second Season',
 'Mushishi Zoku Shou',
 'Mushishi',
 'Tengen Toppa Gurren Lagann',
 'Great Teacher Onizuka',
 'Natsume Yuujinchou Go',
 'Hajime no Ippo: New Challenger',
 'Mushishi Zoku Shou: Suzu no Shizuku',
 'Natsume Yuuji

In [44]:
len(list_of_all_the_anime_name)

12210

In [45]:
# finding the close match for the movie name given by the user
find_close_match = difflib.get_close_matches(anime_name, list_of_all_the_anime_name)
find_close_match

['One Piece', 'Ange Vierge']

In [46]:
#
close_match = find_close_match[0]
close_match

'One Piece'

In [47]:
# finding the index of the movie with title
index_of_the_anime = df[df['name'] == close_match]['index'].values[0]
index_of_the_anime

np.int64(74)

In [48]:
# getting a list of similar anime
similarity_score = list(enumerate(similarity[index_of_the_anime]))
similarity_score

# (similarity[index_of_anime] -> access the similarity score for the specific anime with index of each anime
# enumerate ->pairs each index with its corresponding similarity score
# list ->converts enumeration into list of tuples

[(0, np.float64(3.0684433639917404e-05)),
 (1, np.float64(2.8958303940500175e-06)),
 (2, np.float64(0.25830026358133384)),
 (3, np.float64(0.25831444019147876)),
 (4, np.float64(0.22381129628616123)),
 (5, np.float64(9.246725371339889e-06)),
 (6, np.float64(0.31621542717663487)),
 (7, np.float64(0.1826616989466648)),
 (8, np.float64(0.4473052053909179)),
 (9, np.float64(0.20004824341030203)),
 (10, np.float64(0.2583213952799664)),
 (11, np.float64(0.1826725048675286)),
 (12, np.float64(0.00024700243544710943)),
 (13, np.float64(0.4001028754514707)),
 (14, np.float64(6.543090283202159e-05)),
 (15, np.float64(0.20024362981723198)),
 (16, np.float64(0.18288064752507344)),
 (17, np.float64(0.1828149481718996)),
 (18, np.float64(0.20005942115055936)),
 (19, np.float64(0.3381042102847213)),
 (20, np.float64(0.18266836339920595)),
 (21, np.float64(0.25822876330411654)),
 (22, np.float64(1.9574986760977186e-06)),
 (23, np.float64(5.822360884255564e-07)),
 (24, np.float64(0.22366138808018574)),

In [49]:
len(similarity_score)

12210

In [50]:
# sorting the movies based on their similarity score
sorted_similar_anime = sorted(similarity_score, key = lambda x:x[1], reverse = True)
sorted_similar_anime

[(74, np.float64(0.9999999999999999)),
 (75, np.float64(0.9128627444319836)),
 (5220, np.float64(0.800007289642889)),
 (11928, np.float64(0.799985085658723)),
 (991, np.float64(0.7746029176595671)),
 (9820, np.float64(0.7745640192283504)),
 (5751, np.float64(0.7745612431595225)),
 (9606, np.float64(0.7745589521640961)),
 (2108, np.float64(0.7681896978939938)),
 (9687, np.float64(0.7303321253046968)),
 (9689, np.float64(0.730323198114574)),
 (9686, np.float64(0.7296792618840648)),
 (9685, np.float64(0.727315039834644)),
 (947, np.float64(0.7071253541843608)),
 (946, np.float64(0.7068196577063842)),
 (945, np.float64(0.7053256669353328)),
 (3923, np.float64(0.6760871736931524)),
 (11458, np.float64(0.6760655225027945)),
 (911, np.float64(0.6708688054941591)),
 (725, np.float64(0.6708680022456751)),
 (10335, np.float64(0.6708668111334992)),
 (6579, np.float64(0.6708471121355015)),
 (5724, np.float64(0.6708444166994223)),
 (5723, np.float64(0.6708442943611563)),
 (9493, np.float64(0.670828

In [51]:
# print name of the anime based on their similar score & index
print("Recommended Anime :\n")
i=1
for anime in sorted_similar_anime:
    index = anime[0]
    index_of_anime = df[df.index == index]['name'].values
    if(i<10):
        print(i, ',', index_of_anime)
        i+=1

Recommended Anime :

1 , ['One Piece']
2 , ['Ghost in the Shell: Stand Alone Complex 2nd GIG']
3 , ['Pia Carrot e Youkoso!! 2 DX']
4 , ['Isaku: Respect']
5 , ['Crayon Shin-chan']
6 , ['Onigara']
7 , ['Kaze no Tairiku']
8 , ['My Melody no Akazukin (OVA)']
9 , ['Hunter x Hunter Movie: The Last Mission']


In [52]:
# enter the movie name
anime_name = input("Enter your favourite anime name:")
# creating a list with all the movie names in the dataset
list_of_all_the_anime_name = df['name'].tolist()
# finding the close match for the movie name given by the user
find_close_match = difflib.get_close_matches(anime_name, list_of_all_the_anime_name)

close_match = find_close_match[0]
# finding the index of the movie with title
index_of_the_anime = df[df['name'] == close_match]['index'].values[0]
# getting a list of similar anime
similarity_score = list(enumerate(similarity[index_of_the_anime]))
# sorting the movies based on their similarity score
sorted_similar_anime = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print("Recommended Anime :\n")
i=1
for anime in sorted_similar_anime:
    index = anime[0]
    index_of_anime = df[df.index == index]['name'].values
    if(i<10):
        print(i, ',', index_of_anime)
        i+=1

Enter your favourite anime name:One Piece
Recommended Anime :

1 , ['One Piece']
2 , ['Ghost in the Shell: Stand Alone Complex 2nd GIG']
3 , ['Pia Carrot e Youkoso!! 2 DX']
4 , ['Isaku: Respect']
5 , ['Crayon Shin-chan']
6 , ['Onigara']
7 , ['Kaze no Tairiku']
8 , ['My Melody no Akazukin (OVA)']
9 , ['Hunter x Hunter Movie: The Last Mission']


# Evaluation

In [53]:
from sklearn.model_selection import train_test_split


X_train, X_test = train_test_split(df, train_size = 0.8, random_state = 42)

In [54]:
print(X_train)

       index                                               name    rating  \
3794    3796            Fate/kaleid liner Prisma☆Illya Specials  0.625450   
9646    9684                                Narara Wondeogongju  0.189676   
5646    5648                                  Dragon Collection  0.566627   
3307    3308  Pokemon Omega Ruby &amp; Alpha Sapphire: Mega ...  0.644658   
4132    4134                          Recorder to Randoseru Mi☆  0.615846   
...      ...                                                ...       ...   
11964  12046                                     Doutei Kawaiya  0.480192   
5191    5193                   Tsuru ni Notte: Tomoko no Bouken  0.583433   
5390    5392                          Makyou Densetsu Acrobunch  0.576230   
860      860                                     Aikatsu! Movie  0.734694   
7270    7276            Fushigi no Umi no Nadia: Original Movie  0.469388   

        members  action  adventure  cars  comedy  dementia  demons  ...  \


In [55]:
print(X_test)

       index                                               name    rating  \
2919    2920                                  Monochrome Factor  0.655462   
4561    4563                  Miracle☆Train: Oedo-sen e Youkoso  0.602641   
5206    5208                Hello Harinezumi: Satsui no Ryoubun  0.582233   
3101    3102                                     Hashire Melos!  0.649460   
8203    8224                                    Big X Episode 0  0.483794   
...      ...                                                ...       ...   
8334    8358              Chinpui: Eri-sama Katsudou Daishashin  0.519808   
2628    2628                                        The TV Show  0.665066   
11272  11354  Mou Hasamazu ni wa Irarenai♥: Hoshi ni Onegai ...  0.612245   
7971    7992                                         Armageddon  0.307323   
8395    8419                                 Dai-chan, Daisuki.  0.535414   

        members  action  adventure  cars  comedy  dementia  demons  ...  \


In [56]:
# import the library for evaluation
from sklearn.metrics import precision_score, recall_score, f1_score

In [57]:
# assume index as anime_id
recommende_anime_ids = df['name'].tolist()
real_anime_ids = df['name'].tolist()

In [58]:
# Calculate precision, recall, and F1-score
precision = precision_score(recommende_anime_ids, real_anime_ids, average = 'weighted')
recall = recall_score(recommende_anime_ids, real_anime_ids, average = 'weighted')
f1 = f1_score(recommende_anime_ids, real_anime_ids, average = 'weighted')

In [59]:
# Print the evaluation metrics
print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"f1: {f1}")


Precision Score: 1.0
Recall Score: 1.0
f1: 1.0


In [60]:
# the evaluation metrics score: 1 indicates that the anime name recommended by your system(user) is relevant and has been accurately identified.
# and system has successfully recommended all the relvant anime.
# f1 score confirms that both metrics are balanced.

# Interview Question

# 1.Can you explain the difference between user-based and item-based collaborative filtering?
user-based collaborative filtering finds similar users to make recommendations, while item-based collaborative filtering finds similar items.

# 2. What is collaborative filtering, and how does it work?
Collaborative filtering is a technique used in recommendation systems.
the idea behind this if two users have similar choice in the past, they are likely to have similar choice in the future.

Working:

Data Collection: Collaborative filtering systems gather data from users, such as ratings, likes, or purchases.

Similarity Calculation: The system calculates the similarity between users or items using various metrics, such as cosine similarity, Pearson correlation, or Euclidean distance.

recommendation: Based on the similarity score the system recommends items to users. These recommendations are made by selecting items with the highest predicted ratings.