First we will load files from our content based and collaborative models. Then we will define our hybrid functions.

In [1]:
from scipy.sparse import load_npz
import joblib
import pandas as pd
import numpy as np

In [2]:
# Load saved files from content based model
tfidf_matrix = load_npz("data/tfidf_matrix.npz")
cosine_sim = joblib.load("data/cosine_sim.pkl")
anime_filtered_df = pd.read_csv("data/anime_filtered_processed.csv")
tfidf_vectorizer = joblib.load("data/tfidf_vectorizer.pkl")
final_features = load_npz("data/final_features.npz")

In [3]:
anime_filtered_df.head()

Unnamed: 0,anime_id,name,score,rank,genres,synopsis,type,episodes,popularity,members,studios,source,favorites,rating,year,combined_text
0,1,cowboy bebop,8.75,41.0,"action, award winning, sci-fi","crime is timeless. by the year 2071, humanity ...",tv,26.0,43,1771505,sunrise,original,78525,rated 17,1998,action award winning scifi crime timeless year...
1,5,cowboy bebop: tengoku no tobira,8.38,189.0,"action, sci-fi","another day, another bounty—such is the life o...",movie,1.0,602,360978,bones,original,1448,rated 17,2001,action scifi another day another bounty—such l...
2,6,trigun,8.22,328.0,"action, adventure, sci-fi","vash the stampede is the man with a $$60,000,0...",tv,26.0,246,727252,madhouse,manga,15035,parental guidance 13,1998,action adventure scifi vash stampede man 60000...
3,7,witch hunter robin,7.25,2764.0,"action, drama, mystery, supernatural",robin sena is a powerful craft user drafted in...,tv,26.0,1795,111931,sunrise,original,613,parental guidance 13,2002,action drama mystery supernatural robin sena p...
4,8,bouken ou beet,6.94,4240.0,"adventure, fantasy, supernatural",it is the dark century and the people are suff...,tv,52.0,5126,15001,toei animation,manga,14,parental guidance,2004,adventure fantasy supernatural dark century pe...


In [4]:
# Load saved files from collaborative based model
svd = joblib.load("data/svd_model_3.pkl")
user_clean = pd.read_csv("data/user_clean_processed_2.csv")

Now we will create a new virtual user with  3 new anime ratings.

In [None]:
# New user ratings
new_user_ratings = [
    {'user_id': 0, 'anime_id': 11111, 'rating': 8},   # Anime ID 1 with rating 8
    {'user_id': 0, 'anime_id': 5042, 'rating': 9},  # Anime ID 50 with rating 9
    {'user_id': 0, 'anime_id': 11617, 'rating': 9}, # Anime ID 200 with rating 7
]

# Convert to df and append to user_clean
new_user_df = pd.DataFrame(new_user_ratings)
updated_user_clean = pd.concat([user_clean, new_user_df], ignore_index=True)

Now we will create our hybrid recommendation functions by combining both content + collaborative data.

In [None]:
def get_recommendations_with_score_and_rank(anime_id, cosine_sim, df, top_n=10, popularity_threshold=10000):
    idx = df.index[df['anime_id'] == anime_id][0]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_anime_indices = [i[0] for i in sim_scores[1:]]
    top_sim_scores = [i[1] for i in sim_scores[1:]]
    
    recommendations = df.iloc[top_anime_indices][['anime_id', 'name', 'genres', 'popularity', 'score', 'rank']].copy()
    recommendations['similarity_score'] = top_sim_scores
    
    recommendations = recommendations[recommendations['popularity'] <= popularity_threshold]
    
    recommendations['popularity_norm'] = 1 / (recommendations['popularity'] + 1)
    recommendations['rank_norm'] = 1 / (recommendations['rank'] + 1)
    recommendations['score_norm'] = recommendations['score'] / 10  # Assuming max score is 10
    
    recommendations['weighted_score'] = (
        0.9 * recommendations['similarity_score'] +
        0.1 * recommendations['popularity_norm'] +
        0 * recommendations['score_norm'] +
        0 * recommendations['rank_norm']
    )
    
    recommendations = recommendations.sort_values(by='weighted_score', ascending=False)
    
    return recommendations.head(top_n)

_(We have tuned the model weight to get the best results)_

In [None]:
def hybrid_recommendations_for_new_user(new_user_ratings, svd_model, cosine_sim, anime_df, top_n=10, cf_weight=0.6, content_weight=0.4):
    collaborative_scores = []
    all_anime_ids = anime_df['anime_id'].unique()

    for anime_id in all_anime_ids:
        try:
            pred = svd_model.predict(uid=0, iid=anime_id)
            collaborative_scores.append((anime_id, pred.est))
        except Exception:
            collaborative_scores.append((anime_id, 0))

    collaborative_df = pd.DataFrame(collaborative_scores, columns=['anime_id', 'cf_score'])

    watched_anime_ids = [rating['anime_id'] for rating in new_user_ratings]
    content_scores = []

    for anime_id in watched_anime_ids:
        print(f"Fetching recommendations for Anime ID: {anime_id}")
        try:
            similar_anime = get_recommendations_with_score_and_rank(anime_id, cosine_sim, anime_df, top_n=10)
            content_scores.append(similar_anime[['anime_id', 'weighted_score']])
        except Exception as e:
            print(f"Error processing Anime ID {anime_id}: {e}")

    if content_scores:
        content_scores = pd.concat(content_scores).groupby('anime_id', as_index=False).mean()
    else:
        print("No content-based recommendations found. Defaulting to collaborative filtering only.")
        content_scores = pd.DataFrame(columns=['anime_id', 'weighted_score'])

    hybrid_df = pd.merge(collaborative_df, content_scores, on='anime_id', how='outer').fillna(0)
    hybrid_df['hybrid_score'] = (cf_weight * hybrid_df['cf_score']) + (content_weight * hybrid_df['weighted_score'])
    hybrid_df = pd.merge(hybrid_df, anime_df[['anime_id', 'name', 'genres', 'year', 'studios', 'rank']], on='anime_id')
    hybrid_df = hybrid_df.sort_values(by='hybrid_score', ascending=False).head(top_n)

    return hybrid_df

In [None]:
# Generate hybrid recommendations for the new user
hybrid_recs_new_user = hybrid_recommendations_for_new_user(
    new_user_ratings=new_user_ratings,
    svd_model=svd,
    cosine_sim=cosine_sim,
    anime_df=anime_filtered_df,
    top_n=30,
    cf_weight=0.1,
    content_weight=0.9
)

hybrid_recs_new_user[['name', 'genres', 'year', 'studios', 'rank', 'hybrid_score']]

Fetching recommendations for Anime ID: 11111
Fetching recommendations for Anime ID: 5042
Fetching recommendations for Anime ID: 11617


Unnamed: 0,name,genres,year,studios,rank,hybrid_score
5074,high school dxd new,"action, comedy, romance, ecchi",2013,tnk,1789.0,1.176087
7223,high school dxd hero,"action, comedy, romance, ecchi",2018,passione,2783.0,1.141929
5943,high school dxd born,"action, comedy, romance, ecchi",2015,tnk,2016.0,1.102432
4804,high school dxd ova,"comedy, romance, ecchi",2012,tnk,2700.0,1.080895
3736,kiss x sis (tv),"comedy, romance, ecchi",2010,feel.,5761.0,1.074891
9237,chainsaw man,"action, fantasy",2022,mappa,85.0,1.061099
9150,godzilla: s.p,"action, award winning, mystery, sci-fi",2021,"bones, orange",5588.0,1.012803
9455,lupin iii: part 6,"action, adventure, comedy, mystery",2021,tms entertainment,3195.0,1.011367
9989,jijou wo shiranai tenkousei ga guigui kuru.,comedy,2023,studio signpost,1812.0,1.008618
9368,do it yourself!!,slice of life,2022,pine jam,1797.0,1.008171


A prior run's result are also kept for comparison.

| Name                                  | Genres                                   | Year | Studios                 | Rank   | Hybrid Score |
|---------------------------------------|------------------------------------------|------|-------------------------|--------|--------------|
| High School DxD Hero                  | Action, Comedy, Romance, Ecchi           | 2018 | Passione                | 2783.0 | 1.850334     |
| Chainsaw Man                          | Action, Fantasy                          | 2022 | MAPPA                   | 85.0   | 1.819940     |
| High School DxD New                   | Action, Comedy, Romance, Ecchi           | 2013 | TNK                     | 1789.0 | 1.805281     |
| Fullmetal Alchemist: Brotherhood      | Action, Adventure, Drama, Fantasy        | 2009 | Bones                   | 1.0    | 1.762574     |
| Steins;Gate                           | Drama, Sci-Fi, Suspense                  | 2011 | White Fox               | 3.0    | 1.754490     |
| Kimi no Na wa.                        | Award Winning, Drama, Supernatural       | 2016 | CoMix Wave Films        | 27.0   | 1.743024     |
| Gintama°                              | Action, Comedy, Sci-Fi                   | 2015 | Bandai Namco Pictures   | 4.0    | 1.739494     |
| Gintama Movie 2: Kanketsu-hen         | Action, Comedy, Sci-Fi                   | 2013 | Sunrise                 | 21.0   | 1.736633     |
| Godzilla: S.P                         | Action, Award Winning, Mystery, Sci-Fi   | 2021 | Bones, Orange           | 5588.0 | 1.735556     |
| Lupin III: Part 6                     | Action, Adventure, Comedy, Mystery       | 2021 | TMS Entertainment       | 3195.0 | 1.734280     |
| Jijou wo Shiranai Tenkousei ga Guigui | Comedy                                  | 2023 | Studio Signpost         | 1812.0 | 1.731836     |
| Do It Yourself!!                      | Slice of Life                            | 2022 | Pine Jam                | 1797.0 | 1.731438     |
| Isekai Meikyuu de Harem wo            | Action, Adventure, Fantasy, Romance, Ecchi | 2022 | Passione                | 5979.0 | 1.729776     |
| Hunter x Hunter (2011)                | Action, Adventure, Fantasy               | 2011 | Madhouse                | 10.0   | 1.728369     |
| Youkoso Jitsuryoku Shijou Shugi no... | Drama, Suspense                          | 2022 | Lerche                  | 431.0  | 1.728283     |
| Citrus                                | Drama, Girls Love, Romance               | 2018 | Passione                | 6600.0 | 1.728229     |
| Skip to Loafer                        | Drama                                   | 2023 | P.A. Works              | 291.0  | 1.727186     |
| Code Geass: Hangyaku no Lelouch R2    | Action, Award Winning, Drama, Sci-Fi     | 2008 | Sunrise                 | 20.0   | 1.724871     |
| Lupin III: Part 5                     | Action, Adventure, Comedy, Mystery       | 2018 | Telecom Animation Film  | 429.0  | 1.723882     |
| Gintama'                              | Action, Comedy, Sci-Fi                   | 2011 | Sunrise                 | 8.0    | 1.723712     |


It is evident that we have to diversify our recommendation so that it doesn't repeat the same series over and over again.

In [None]:
def diversify_recommendations_by_keyword(df, column='name', max_per_keyword=1):
    keyword_counts = {}
    diversified = []
    
    for _, row in df.iterrows():
        name = row[column]
        keywords = [word.lower() for word in name.split() if len(word) > 3]  
        
        if all(keyword_counts.get(keyword, 0) < max_per_keyword for keyword in keywords):
            diversified.append(row)
            for keyword in keywords:
                keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

    return pd.DataFrame(diversified)

In [None]:
# Diversify recommendations by limiting repeating keywords
diversified_recs = diversify_recommendations_by_keyword(hybrid_recs_new_user, column='name', max_per_keyword=1)

diversified_recs[['anime_id', 'name', 'genres', 'hybrid_score']]

Unnamed: 0,anime_id,name,genres,hybrid_score
5074,15451,high school dxd new,"action, comedy, romance, ecchi",1.176087
3736,7593,kiss x sis (tv),"comedy, romance, ecchi",1.074891
9237,44511,chainsaw man,"action, fantasy",1.061099
9150,43229,godzilla: s.p,"action, award winning, mystery, sci-fi",1.012803
9455,49040,lupin iii: part 6,"action, adventure, comedy, mystery",1.011367
9989,53621,jijou wo shiranai tenkousei ga guigui kuru.,comedy,1.008618
9368,48542,do it yourself!!,slice of life,1.008171
9240,44524,isekai meikyuu de harem wo,"action, adventure, fantasy, romance, ecchi",1.006301
9770,51096,youkoso jitsuryoku shijou shugi no kyoushitsu ...,"drama, suspense",1.004621
9669,50416,skip to loafer,drama,1.003387


Our keyword limit based diversification works really great and we can keep this model as it is now.

We also tried a series keyword diversification but it doesn't work as intended many times.

In [None]:
def diversify_recommendations_by_series_keyword(df, column='name', keywords=None, max_per_keyword=1):
    
    keyword_counts = {}
    diversified = []

    for _, row in df.iterrows():
        name = row[column].lower()
        keyword_found = None

        if keywords:
            for keyword in keywords:
                if keyword.lower() in name:
                    keyword_found = keyword.lower()
                    break

        if not keyword_found:
            diversified.append(row)
        else:
            if keyword_counts.get(keyword_found, 0) < max_per_keyword:
                diversified.append(row)
                keyword_counts[keyword_found] = keyword_counts.get(keyword_found, 0) + 1

    return pd.DataFrame(diversified)


We also made a keyword extractor function to find all series keywords easily.

In [None]:
import pandas as pd
from collections import Counter

anime_titles = anime_filtered_df['name']

keywords = []
for title in anime_titles:
    keywords.extend(title.lower().split())

keyword_counts = Counter(keywords)

series_keywords = [keyword for keyword, count in keyword_counts.items() if count > 3]

print("Extracted Series Keywords:", series_keywords)

Extracted Series Keywords: ['cowboy', 'no', 'tobira', 'witch', 'hunter', 'bouken', 'ou', '21', 'hachimitsu', 'to', 'clover', 'heart:', 'wild', 'initial', 'd', 'stage', 'monster', 'naruto', 'tennis', 'ouji-sama', 'ring', 'ni', 'kakero', '1', 'school', 'rumble', 'trinity', 'blood', 'neon', 'genesis', 'evangelion', 'evangelion:', 'death', '&', 'rebirth', 'the', 'end', 'of', 'berserk', 'koukaku', 'kidoutai', 'rurouni', 'kenshin:', 'meiji', 'kenkaku', 'romantan', '-', 'e', 'aa!', 'megami-sama!', '(tv)', 'tenshi', 'kidou', 'ai', 'yori', 'appleseed', '(movie)', 'blue', 'da', 'capo', 'rozen', 'animation', 'chou', 'black', 'cat', 'full', 'metal', 'panic!', 'second', 'raid', 'gakuen', 'alice', 'soukyuu', 'fafner:', 'dead', 'aggressor', 'mahou', 'shoujo', 'lyrical', 'nanoha', 'shuffle!', 'senshi', 'gundam', 'gundam:', 'dai', 'ms', 'shoutai', 'naka', 'sensou', 'report', '0083:', 'stardust', 'memory', 'gyakushuu', 'victory', 'shin', 'senki', 'wing:', 'endless', 'x', 'seed', 'destiny', 'turn', 'a', 

In [None]:
# Diversify recommendations based on series keywords
diversified_recs = diversify_recommendations_by_series_keyword(
    df=hybrid_recs_new_user,
    column='name',
    keywords=series_keywords,
    max_per_keyword=2 
)

diversified_recs[['anime_id', 'name', 'genres', 'hybrid_score']]

Unnamed: 0,anime_id,name,genres,hybrid_score
5074,15451,high school dxd new,"action, comedy, romance, ecchi",1.176087
7223,34281,high school dxd hero,"action, comedy, romance, ecchi",1.141929
3736,7593,kiss x sis (tv),"comedy, romance, ecchi",1.074891
9237,44511,chainsaw man,"action, fantasy",1.061099
9455,49040,lupin iii: part 6,"action, adventure, comedy, mystery",1.011367
9989,53621,jijou wo shiranai tenkousei ga guigui kuru.,comedy,1.008618
9368,48542,do it yourself!!,slice of life,1.008171
9770,51096,youkoso jitsuryoku shijou shugi no kyoushitsu ...,"drama, suspense",1.004621
9669,50416,skip to loafer,drama,1.003387
4970,14289,suki tte ii na yo.,"drama, romance",0.999995


The result from this approach is questionable, thus we can stay with our first diversification method.