The goal here is to preprocess our anime data and build the content-based recommendation model.

In [1]:
import pandas as pd

In [2]:
anime_filtered_df = pd.read_csv("data/anime_filtered.csv")

In [3]:
anime_filtered_df.head()

Unnamed: 0,anime_id,name,score,rank,genres,synopsis,type,episodes,popularity,members,studios,source,favorites,rating,year
0,1,cowboy bebop,8.75,41.0,"action, award winning, sci-fi","crime is timeless. by the year 2071, humanity ...",tv,26.0,43,1771505,sunrise,original,78525,rated 17,1998
1,5,cowboy bebop: tengoku no tobira,8.38,189.0,"action, sci-fi","another day, another bounty—such is the life o...",movie,1.0,602,360978,bones,original,1448,rated 17,2001
2,6,trigun,8.22,328.0,"action, adventure, sci-fi","vash the stampede is the man with a $$60,000,0...",tv,26.0,246,727252,madhouse,manga,15035,parental guidance 13,1998
3,7,witch hunter robin,7.25,2764.0,"action, drama, mystery, supernatural",robin sena is a powerful craft user drafted in...,tv,26.0,1795,111931,sunrise,original,613,parental guidance 13,2002
4,8,bouken ou beet,6.94,4240.0,"adventure, fantasy, supernatural",it is the dark century and the people are suff...,tv,52.0,5126,15001,toei animation,manga,14,parental guidance,2004


In [4]:
# Work on an independent copy under a shorter name, so the columns added below
# do not also appear on anime_filtered_df (plain assignment would only alias it).
anime = anime_filtered_df.copy()

Originally we wanted to use all textual data for our model, but it turns out keeping 'name' and 'source' introduces unwanted noise and degrades our model, so in this run we will drop them.

In [5]:
# Combine textual features
# Start from the genres and append each remaining descriptive column,
# separated by a single space; missing values count as empty strings.
combined = anime['genres'].fillna('')
for column in ('synopsis', 'type', 'studios', 'rating'):
    combined = combined + " " + anime[column].fillna('')
anime['combined_text'] = combined

In [6]:
anime.head()

Unnamed: 0,anime_id,name,score,rank,genres,synopsis,type,episodes,popularity,members,studios,source,favorites,rating,year,combined_text
0,1,cowboy bebop,8.75,41.0,"action, award winning, sci-fi","crime is timeless. by the year 2071, humanity ...",tv,26.0,43,1771505,sunrise,original,78525,rated 17,1998,"action, award winning, sci-fi crime is timeles..."
1,5,cowboy bebop: tengoku no tobira,8.38,189.0,"action, sci-fi","another day, another bounty—such is the life o...",movie,1.0,602,360978,bones,original,1448,rated 17,2001,"action, sci-fi another day, another bounty—suc..."
2,6,trigun,8.22,328.0,"action, adventure, sci-fi","vash the stampede is the man with a $$60,000,0...",tv,26.0,246,727252,madhouse,manga,15035,parental guidance 13,1998,"action, adventure, sci-fi vash the stampede is..."
3,7,witch hunter robin,7.25,2764.0,"action, drama, mystery, supernatural",robin sena is a powerful craft user drafted in...,tv,26.0,1795,111931,sunrise,original,613,parental guidance 13,2002,"action, drama, mystery, supernatural robin sen..."
4,8,bouken ou beet,6.94,4240.0,"adventure, fantasy, supernatural",it is the dark century and the people are suff...,tv,52.0,5126,15001,toei animation,manga,14,parental guidance,2004,"adventure, fantasy, supernatural it is the dar..."


Now we will preprocess our data by lowercasing the text, stripping punctuation, tokenizing, and removing stopwords.

In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

In [11]:
# Download stopwords and punkt for tokenization if not already done
# 'stopwords' backs stopwords.words('english'); the punkt packages are fetched
# for word_tokenize. Packages already present are reported as up-to-date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahmu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\mahmu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mahmu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
# Preprocessing function
# Both objects below are loop-invariant; preprocess_text runs once per row, so
# they are built a single time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_BY_LANGUAGE = {}


def _english_stop_words():
    """Return NLTK's English stopword set, reading the corpus only on first use."""
    if 'english' not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_BY_LANGUAGE['english']


def preprocess_text(text):
    """Normalize one text field for vectorization.

    Steps: lowercase, delete ASCII punctuation, tokenize with NLTK's
    ``word_tokenize``, drop English stopwords, and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Must be a string; fill missing values before calling.

    Returns
    -------
    str
        Space-separated tokens that survived stopword removal.

    Notes
    -----
    * Punctuation is deleted, not replaced by a space, so hyphenated words are
      merged ("sci-fi" becomes "scifi").
    * Only ``string.punctuation`` (ASCII) is removed; characters such as the
      em dash are left in place.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and join back into a single string
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

In [13]:
# Clean every row of 'combined_text' with preprocess_text.
# The column is overwritten with its own cleaned version, so this cell's input
# is the output of the "Combine textual features" cell above.
anime['combined_text'] = anime['combined_text'].map(preprocess_text)

In [14]:
anime.head()

Unnamed: 0,anime_id,name,score,rank,genres,synopsis,type,episodes,popularity,members,studios,source,favorites,rating,year,combined_text
0,1,cowboy bebop,8.75,41.0,"action, award winning, sci-fi","crime is timeless. by the year 2071, humanity ...",tv,26.0,43,1771505,sunrise,original,78525,rated 17,1998,action award winning scifi crime timeless year...
1,5,cowboy bebop: tengoku no tobira,8.38,189.0,"action, sci-fi","another day, another bounty—such is the life o...",movie,1.0,602,360978,bones,original,1448,rated 17,2001,action scifi another day another bounty—such l...
2,6,trigun,8.22,328.0,"action, adventure, sci-fi","vash the stampede is the man with a $$60,000,0...",tv,26.0,246,727252,madhouse,manga,15035,parental guidance 13,1998,action adventure scifi vash stampede man 60000...
3,7,witch hunter robin,7.25,2764.0,"action, drama, mystery, supernatural",robin sena is a powerful craft user drafted in...,tv,26.0,1795,111931,sunrise,original,613,parental guidance 13,2002,action drama mystery supernatural robin sena p...
4,8,bouken ou beet,6.94,4240.0,"adventure, fantasy, supernatural",it is the dark century and the people are suff...,tv,52.0,5126,15001,toei animation,manga,14,parental guidance,2004,adventure fantasy supernatural dark century pe...


We will use a TF-IDF vectorizer limited to 5000 features to capture word relevance while keeping noise out; we tried 1000 features, but that did not yield good results.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the combined_text
# stop_words='english' applies scikit-learn's built-in English list in addition
# to the NLTK stopwords that preprocess_text has already removed.
# fit_transform both learns the vocabulary and returns the document-term matrix
# (one row per anime, one column per retained term).
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')  # Limiting to 5000 features
tfidf_matrix = tfidf.fit_transform(anime['combined_text'])

In [16]:
# Display the vocabulary
print(tfidf.get_feature_names_out())

['10' '100' '1000' ... 'zoid' 'zombie' 'zone']


In [17]:
# Check shape
print(tfidf_matrix.shape)

(10048, 5000)


We kept both 'episodes' and 'year' in a previous run, but 'episodes' proved to be unnecessary noise in our model, so we will keep only 'year'.

In [18]:
from sklearn.preprocessing import StandardScaler

# Select numerical features to scale
# Missing years are filled with the median year rather than 0: a literal 0 lies
# roughly 2000 units from every real value, so it would act as an extreme
# outlier and distort the mean/variance that StandardScaler learns.
year_column = anime[['year']]
numerical_features = year_column.fillna(year_column.median())
scaler = StandardScaler()
normalized_numerical = scaler.fit_transform(numerical_features)

In [19]:
numerical_features

Unnamed: 0,year
0,1998
1,2001
2,1998
3,2002
4,2004
...,...
10043,2021
10044,2023
10045,2023
10046,2023


Using hstack to combine our 'year' with 'text' data

In [20]:
from scipy.sparse import hstack

# Combine TF-IDF matrix with numerical features
# The scaled year becomes one extra column to the right of the TF-IDF columns.
# NOTE(review): the year column is a z-score and is not bounded like the TF-IDF
# weights, so it may dominate the cosine similarity computed later — the
# recommendations shown for anime_id 1 look consistent with that. Consider
# down-weighting or rescaling this column; TODO confirm.
final_features = hstack([tfidf_matrix, normalized_numerical])

Now we can create our content based engine using cosine similarity.

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows in final_features.
# With a single argument the matrix is compared against itself, giving a
# square result with one row and one column per anime.
cosine_sim = cosine_similarity(final_features)

Now we can test out a function to get anime recommendation based on anime_id, each anime_id directs to a series or movie.

In [22]:
anime[['anime_id', 'name']]

Unnamed: 0,anime_id,name
0,1,cowboy bebop
1,5,cowboy bebop: tengoku no tobira
2,6,trigun
3,7,witch hunter robin
4,8,bouken ou beet
...,...,...
10043,55032,"kanojo, okarishimasu petit special"
10044,55093,li shi zhentan shiwusuo: a day in lungmen - li...
10045,55289,one piece: dai gekisen tokushuu! zoro vs. ooka...
10046,55339,mashle: mash burnedead to fushigi na tegami


In [23]:
# Function to get recommendations based on anime_id
def get_recommendations(anime_id, cosine_sim=cosine_sim, df=anime, top_n=10):
    """Return the ``top_n`` titles most similar to ``anime_id``.

    Parameters
    ----------
    anime_id : value comparable with ``df['anime_id']``
        Identifier of the query title.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column order matches the row
        order of ``df``. The default is bound when this cell is executed.
    df : pandas.DataFrame
        Catalogue with at least 'anime_id', 'name' and 'genres' columns.
        The default is bound when this cell is executed.
    top_n : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'anime_id', 'name', 'genres', most similar first.

    Raises
    ------
    IndexError
        If ``anime_id`` does not occur in ``df['anime_id']``.
    """
    # Row *position* of the query. cosine_sim and df.iloc are both positional,
    # so an index label would be wrong whenever df does not have a 0..n-1 index.
    matches = (df['anime_id'] == anime_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in df['anime_id']")
    idx = int(matches[0])

    # Drop the query row explicitly instead of assuming it sorts first: a title
    # with an identical feature vector would tie with it at the top.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]

    # Sort by similarity score, highest first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get top similar anime
    top_anime_indices = [pos for pos, _ in sim_scores[:top_n]]

    # Combine results into a DataFrame
    return df.iloc[top_anime_indices][['anime_id', 'name', 'genres']].copy()

In [24]:
# Get top 10 recommendations for anime_id = 1 (cowboy bebop)
recommendations = get_recommendations(1)
recommendations

Unnamed: 0,anime_id,name,genres
2152,2686,tetsujin 28-gou,"adventure, sci-fi"
2784,3990,kumo to tulip,adventure
3719,7523,fuku-chan no sensuikan,comedy
2828,4088,wonder 3,"action, adventure, comedy, sci-fi"
4106,9228,wan wan chuushingura,"action, adventure, drama, fantasy"
2215,2765,momotarou: umi no shinpei,action
1308,1547,obake no q-tarou,"comedy, slice of life, supernatural"
3090,5052,eightman,"action, drama, sci-fi"
4389,10342,tetsuwan atom: uchuu no yuusha,"action, adventure, drama, sci-fi"
3780,7786,arabian night: sindbad no bouken,"action, adventure, fantasy"


We can also add similarity score to our results

In [25]:
def get_recommendations_with_scores(anime_id, cosine_sim=cosine_sim, df=anime, top_n=10):
    """Return the ``top_n`` most similar titles together with their similarity.

    Parameters
    ----------
    anime_id : value comparable with ``df['anime_id']``
        Identifier of the query title.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix aligned with the row order of ``df``.
        The default is bound when this cell is executed.
    df : pandas.DataFrame
        Catalogue with at least 'anime_id', 'name' and 'genres' columns.
        The default is bound when this cell is executed.
    top_n : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'anime_id', 'name', 'genres', 'similarity_score', most
        similar first.

    Raises
    ------
    IndexError
        If ``anime_id`` does not occur in ``df['anime_id']``.
    """
    # Row position (not index label): cosine_sim and df.iloc are positional.
    matches = (df['anime_id'] == anime_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in df['anime_id']")
    idx = int(matches[0])

    # Exclude the query itself explicitly; it is not guaranteed to sort first
    # when another row ties with it.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    best = sim_scores[:top_n]
    top_anime_indices = [pos for pos, _ in best]
    top_sim_scores = [score for _, score in best]

    recommendations = df.iloc[top_anime_indices][['anime_id', 'name', 'genres']].copy()
    recommendations['similarity_score'] = top_sim_scores

    return recommendations

In [26]:
# Top 10 recommendations for anime_id = 1 with scores
recommendations = get_recommendations_with_scores(1)
recommendations

Unnamed: 0,anime_id,name,genres,similarity_score
2152,2686,tetsujin 28-gou,"adventure, sci-fi",0.638312
2784,3990,kumo to tulip,adventure,0.635917
3719,7523,fuku-chan no sensuikan,comedy,0.634796
2828,4088,wonder 3,"action, adventure, comedy, sci-fi",0.632741
4106,9228,wan wan chuushingura,"action, adventure, drama, fantasy",0.632715
2215,2765,momotarou: umi no shinpei,action,0.632225
1308,1547,obake no q-tarou,"comedy, slice of life, supernatural",0.631121
3090,5052,eightman,"action, drama, sci-fi",0.630694
4389,10342,tetsuwan atom: uchuu no yuusha,"action, adventure, drama, sci-fi",0.630341
3780,7786,arabian night: sindbad no bouken,"action, adventure, fantasy",0.630117


We can further finetune with sorting and fetching more anime results.

In [27]:
def get_recommendations_with_scores_and_sort(anime_id, cosine_sim=cosine_sim, df=anime, top_n=10):
    """Return ``top_n`` similar titles, breaking similarity ties by popularity.

    The ``2 * top_n`` most similar titles are collected, ordered by similarity
    (descending) and then by 'popularity' (ascending), and the first ``top_n``
    rows are returned.

    Parameters
    ----------
    anime_id : value comparable with ``df['anime_id']``
        Identifier of the query title.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix aligned with the row order of ``df``.
        The default is bound when this cell is executed.
    df : pandas.DataFrame
        Catalogue with at least 'anime_id', 'name', 'genres' and 'popularity'.
        The default is bound when this cell is executed.
    top_n : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'anime_id', 'name', 'genres', 'popularity',
        'similarity_score'.

    Raises
    ------
    IndexError
        If ``anime_id`` does not occur in ``df['anime_id']``.
    """
    # Row position (not index label): cosine_sim and df.iloc are positional.
    matches = (df['anime_id'] == anime_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in df['anime_id']")
    idx = int(matches[0])

    # Exclude the query itself explicitly rather than skipping the first
    # sorted entry, which is only the query when nothing ties with it.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Over-fetch twice the requested amount so the popularity tie-break has
    # candidates to choose from.
    candidates = sim_scores[:top_n * 2]
    top_anime_indices = [pos for pos, _ in candidates]
    top_sim_scores = [score for _, score in candidates]

    recommendations = df.iloc[top_anime_indices][['anime_id', 'name', 'genres', 'popularity']].copy()
    recommendations['similarity_score'] = top_sim_scores

    recommendations = recommendations.sort_values(by=['similarity_score', 'popularity'], ascending=[False, True])

    return recommendations.head(top_n)

In [28]:
recommendations = get_recommendations_with_scores_and_sort(1)
recommendations

Unnamed: 0,anime_id,name,genres,popularity,similarity_score
2152,2686,tetsujin 28-gou,"adventure, sci-fi",9158,0.638312
2784,3990,kumo to tulip,adventure,10287,0.635917
3719,7523,fuku-chan no sensuikan,comedy,13234,0.634796
2828,4088,wonder 3,"action, adventure, comedy, sci-fi",12211,0.632741
4106,9228,wan wan chuushingura,"action, adventure, drama, fantasy",11998,0.632715
2215,2765,momotarou: umi no shinpei,action,9862,0.632225
1308,1547,obake no q-tarou,"comedy, slice of life, supernatural",13167,0.631121
3090,5052,eightman,"action, drama, sci-fi",10884,0.630694
4389,10342,tetsuwan atom: uchuu no yuusha,"action, adventure, drama, sci-fi",11049,0.630341
3780,7786,arabian night: sindbad no bouken,"action, adventure, fantasy",11033,0.630117


In [29]:
def get_recommendations_with_filter_and_weight(anime_id, cosine_sim=cosine_sim, df=anime, top_n=10, popularity_threshold=10000,
                                               similarity_weight=0.7, popularity_weight=0.3):
    """Recommend similar titles, filtered by popularity and ranked by a blended score.

    Every other title is scored, rows whose 'popularity' value exceeds
    ``popularity_threshold`` are discarded, and the remainder is ranked by::

        similarity_weight * similarity_score
        + popularity_weight * 1 / (popularity + 1)

    Parameters
    ----------
    anime_id : value comparable with ``df['anime_id']``
        Identifier of the query title.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix aligned with the row order of ``df``.
        The default is bound when this cell is executed.
    df : pandas.DataFrame
        Catalogue with at least 'anime_id', 'name', 'genres' and 'popularity'.
        The default is bound when this cell is executed.
    top_n : int
        Number of recommendations to return.
    popularity_threshold : number
        Largest 'popularity' value that is still kept.
    similarity_weight, popularity_weight : float
        Weights of the two terms of the blended score.

    Returns
    -------
    pandas.DataFrame
        Columns 'anime_id', 'name', 'genres', 'popularity',
        'similarity_score', 'weighted_score', best first.

    Raises
    ------
    IndexError
        If ``anime_id`` does not occur in ``df['anime_id']``.
    """
    # Row position (not index label): cosine_sim and df.iloc are positional.
    matches = (df['anime_id'] == anime_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in df['anime_id']")
    idx = int(matches[0])

    # Exclude the query itself explicitly; it is not guaranteed to sort first
    # when another row ties with it.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    top_anime_indices = [pos for pos, _ in sim_scores]
    top_sim_scores = [score for _, score in sim_scores]

    recommendations = df.iloc[top_anime_indices][['anime_id', 'name', 'genres', 'popularity']].copy()
    recommendations['similarity_score'] = top_sim_scores

    # Copy after filtering so the new column is written to an independent frame.
    recommendations = recommendations[recommendations['popularity'] <= popularity_threshold].copy()

    # NOTE(review): for popularity values in the thousands, 1 / (popularity + 1)
    # is on the order of 1e-4, so this term barely moves the ranking compared
    # with the similarity term — confirm that is intended.
    recommendations['weighted_score'] = (
        similarity_weight * recommendations['similarity_score'] +
        popularity_weight * (1 / (recommendations['popularity'] + 1))
    )

    recommendations = recommendations.sort_values(by='weighted_score', ascending=False)

    return recommendations.head(top_n)

In [30]:
recommendations = get_recommendations_with_filter_and_weight(1)
recommendations

Unnamed: 0,anime_id,name,genres,popularity,similarity_score,weighted_score
2152,2686,tetsujin 28-gou,"adventure, sci-fi",9158,0.638312,0.446851
2215,2765,momotarou: umi no shinpei,action,9862,0.632225,0.442588
1268,1497,aru machi kado no monogatari,"award winning, drama, romance",8362,0.629539,0.440713
2756,3905,wanpaku ouji no orochi taiji,"adventure, award winning, fantasy",9744,0.628415,0.439921
2930,4481,saiyuuki,adventure,9284,0.627415,0.439223
1662,1982,osu,slice of life,8968,0.626857,0.438833
2938,4513,hakujaden,"adventure, fantasy, romance",7642,0.625535,0.437913
3582,6872,cyborg 009,"action, adventure, drama, sci-fi",9034,0.625374,0.437795
1392,1650,uchuu senkan yamato,"action, adventure, award winning, drama, sci-fi",3976,0.624909,0.437512
2200,2747,tetsuwan atom,"action, adventure, drama, sci-fi",5617,0.624615,0.437284


We can add a collaborative-style balancing mechanism to further fine-tune this model: by tinkering with the popularity, score, and rank weights, we can get better results from this model alone.

In [None]:
def get_recommendations_with_score_and_rank(anime_id, cosine_sim, df, top_n=10, popularity_threshold=10000,
                                            similarity_weight=0.4, popularity_weight=0.2,
                                            score_weight=0.2, rank_weight=0.2):
    """Recommend similar titles ranked by similarity, popularity, score and rank.

    Every other title is scored, rows whose 'popularity' value exceeds
    ``popularity_threshold`` are discarded, and the remainder is ranked by::

        similarity_weight * similarity_score
        + popularity_weight * 1 / (popularity + 1)
        + score_weight      * score / 10
        + rank_weight       * 1 / (rank + 1)

    Parameters
    ----------
    anime_id : value comparable with ``df['anime_id']``
        Identifier of the query title.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix aligned with the row order of ``df``.
    df : pandas.DataFrame
        Catalogue with 'anime_id', 'name', 'genres', 'popularity', 'score'
        and 'rank' columns.
    top_n : int
        Number of recommendations to return.
    popularity_threshold : number
        Largest 'popularity' value that is still kept.
    similarity_weight, popularity_weight, score_weight, rank_weight : float
        Weights of the four terms of the blended score.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus 'similarity_score', 'popularity_norm',
        'rank_norm', 'score_norm' and 'weighted_score', best first.

    Raises
    ------
    IndexError
        If ``anime_id`` does not occur in ``df['anime_id']``.
    """
    # Row position (not index label): cosine_sim and df.iloc are positional.
    matches = (df['anime_id'] == anime_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in df['anime_id']")
    idx = int(matches[0])

    # Exclude the query itself explicitly; it is not guaranteed to sort first
    # when another row ties with it.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    top_anime_indices = [pos for pos, _ in sim_scores]
    top_sim_scores = [score for _, score in sim_scores]

    recommendations = df.iloc[top_anime_indices][['anime_id', 'name', 'genres', 'popularity', 'score', 'rank']].copy()
    recommendations['similarity_score'] = top_sim_scores

    # Copy after filtering so the new columns are written to an independent frame.
    recommendations = recommendations[recommendations['popularity'] <= popularity_threshold].copy()

    recommendations['popularity_norm'] = 1 / (recommendations['popularity'] + 1)
    # A missing rank or score contributes nothing instead of turning the whole
    # weighted score into NaN (which would push the row to the very end).
    recommendations['rank_norm'] = (1 / (recommendations['rank'] + 1)).fillna(0)
    recommendations['score_norm'] = (recommendations['score'] / 10).fillna(0)

    recommendations['weighted_score'] = (
        similarity_weight * recommendations['similarity_score'] +
        popularity_weight * recommendations['popularity_norm'] +
        score_weight * recommendations['score_norm'] +
        rank_weight * recommendations['rank_norm']
    )

    recommendations = recommendations.sort_values(by='weighted_score', ascending=False)

    return recommendations.head(top_n)

In [32]:
recommendations = get_recommendations_with_score_and_rank(1, cosine_sim, anime)
recommendations[['anime_id', 'name', 'genres', 'popularity', 'score', 'rank', 'weighted_score', 'similarity_score']]

Unnamed: 0,anime_id,name,genres,popularity,score,rank,weighted_score,similarity_score
708,820,ginga eiyuu densetsu,"drama, sci-fi",728,9.02,12.0,0.424537,0.571195
2300,2921,ashita no joe 2,"drama, sports",2971,8.71,50.0,0.416573,0.595959
1963,2402,ashita no joe,"drama, sports",2138,8.29,251.0,0.414399,0.61928
302,338,versailles no bara,"drama, romance",2036,8.33,220.0,0.407281,0.599194
268,302,mirai shounen conan,"adventure, drama, sci-fi",2978,8.1,463.0,0.406233,0.609338
1213,1430,lupin iii: cagliostro no shiro,"action, adventure, award winning, comedy, mystery",1807,8.15,409.0,0.405299,0.604252
521,572,kaze no tani no nausicaä,"adventure, award winning, fantasy",611,8.36,197.0,0.404707,0.590426
1392,1650,uchuu senkan yamato,"action, adventure, award winning, drama, sci-fi",3976,7.59,1413.0,0.401955,0.624909
1198,1412,lupin iii,"action, adventure, comedy, mystery",1948,7.63,1296.0,0.401662,0.622013
466,513,tenkuu no shiro laputa,"adventure, award winning, fantasy, romance, sc...",451,8.26,286.0,0.401032,0.586732


Saving items:

In [32]:
from scipy.sparse import save_npz
import joblib

In [33]:
# Persist the sparse TF-IDF matrix (text features only, without the year column).
save_npz("data/tfidf_matrix.npz", tfidf_matrix)

In [34]:
# Persist the full pairwise similarity matrix. With the 10048 rows reported
# above this is a 10048 x 10048 array, so expect a large file.
joblib.dump(cosine_sim, "data/cosine_sim.pkl")

['data/cosine_sim.pkl']

In [35]:
# Save the catalogue including the cleaned 'combined_text' column; index=False
# keeps the row index out of the file.
anime.to_csv("data/anime_filtered_processed.csv", index=False)

In [36]:
# Persist the fitted vectorizer so the same vocabulary can be reused later.
joblib.dump(tfidf, "data/tfidf_vectorizer.pkl")

['data/tfidf_vectorizer.pkl']

In [37]:
# Persist the combined feature matrix (TF-IDF columns plus the scaled year).
save_npz("data/final_features.npz", final_features)