In [None]:
import ast

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Step 1: Load and preprocess the data
# Read the three CSV tables from the working directory in a single pass and
# bind them to `animes`, `profiles` and `reviews` respectively.
animes, profiles, reviews = (
    pd.read_csv(csv_path)
    for csv_path in ('animes.csv', 'profiles.csv', 'reviews.csv')
)

In [None]:
# Inspect the column labels of the animes table as loaded.
animes.columns

In [None]:
# Inspect the column labels of the profiles table.
profiles.columns

In [None]:
# Inspect the column labels of the reviews table.
reviews.columns

In [None]:
# Display the reviews table.
reviews

In [None]:
# Show only the first rows, using the notebook's rich table rendering.
# (Printing the whole frame repeated the previous cell as plain text.)
reviews.head()

In [None]:
# Display the animes table.
animes

In [None]:
# Keep the first row for each title, then renumber the rows 0..n-1.
# Later cells mix label-based lookups (`.index`) with positional ones (`.iloc`,
# rows of the similarity matrix) and concatenate a freshly indexed frame
# column-wise; a gap-free index keeps all of these in agreement.
animes = animes.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)

In [None]:
# Column labels of the animes table after de-duplicating titles.
animes.columns

In [None]:
# NOTE(review): repeats the inspection of the previous cell; candidate for removal.
animes.columns

In [None]:
import re

# Step 2: Feature engineering (simplified for this example)
def extract_genres(genre_str):
    """Split a raw genre string into a list of genre tokens.

    Anything that is not a ``str`` (for example a missing value) yields an
    empty list. A token is a run of word characters, optionally followed by a
    single whitespace character and a second run, so "Martial Arts" is kept
    together while a hyphenated name such as "Sci-Fi" comes back as two
    tokens ("Sci", "Fi").
    """
    if not isinstance(genre_str, str):
        return []
    return re.findall(r"\w+\s?\w*", genre_str)

# Parse each genre string into a list and one-hot encode the lists.
animes['genres'] = animes['genre'].apply(extract_genres)
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(animes['genres'])
# Give the indicator frame the same index as `animes`: a column-wise concat
# aligns on index labels, and `animes` may have gaps after duplicates were
# dropped, which would otherwise pair genres with the wrong rows and add
# all-NaN rows.
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=animes.index)
animes = pd.concat([animes.drop(['genres'], axis=1), genres_df], axis=1)

In [None]:
# Column labels after the genre indicator columns were appended.
animes.columns

In [None]:
# Preview the first rows of the encoded table.
animes.head()

In [None]:
# Min-max scale the rank, popularity and score columns in place so that they
# share a common range before being compared with the 0/1 genre indicators.
numeric_cols = ['ranked', 'popularity', 'score']
scaler = MinMaxScaler()
animes[numeric_cols] = scaler.fit_transform(animes[numeric_cols])

In [None]:
# Check the dtype of every column before building the numeric feature table.
animes.dtypes

In [None]:
# Build the feature table for the similarity computation: drop the free-text
# and URL columns, then fill NaN values with zeros.
animes_cleaned = animes.drop(['title', 'synopsis', 'genre', 'aired', 'img_url', 'link'], axis=1)
# 'uid' is a row identifier, not a property of the anime; leaving it in would
# let arbitrary id values influence the cosine similarity.
animes_cleaned = animes_cleaned.drop(columns=['uid'], errors='ignore').fillna(0)

# Check if there are any infinity values
print("Contains infinity values:", np.any(np.isinf(animes_cleaned)))

In [None]:
# Display the feature table that feeds the similarity computation.
animes_cleaned

In [None]:
# Pairwise cosine similarity between every pair of rows of the feature table;
# entry [i, j] compares the i-th and j-th row of `animes_cleaned`.
similarity_matrix = cosine_similarity(animes_cleaned)

In [None]:
# Similarities of the first row to every row (including itself).
similarity_matrix[0]

In [None]:
def recommend_animes_by_genres(user_favorite_genres, k=10):
    """Recommend the animes that match the most of the given genres.

    Parameters
    ----------
    user_favorite_genres : collection of str
        Genre names. Names that are not columns of ``animes`` are ignored.
    k : int, default 10
        Maximum number of titles to return; ``k <= 0`` yields an empty list.

    Returns
    -------
    list
        Titles ordered by descending genre score. Ties keep table order.
    """
    genre_columns = [col for col in animes.columns if col in user_favorite_genres]
    genre_scores = animes[genre_columns].sum(axis=1).to_numpy(dtype=float)

    # A stable sort of the negated scores ranks the best rows first and breaks
    # ties by row position, so results are reproducible. Slicing with [:k]
    # (instead of [-k:]) also makes k == 0 return nothing rather than every row.
    top_k_indices = np.argsort(-genre_scores, kind='stable')[:max(k, 0)]

    return [get_anime_title(idx) for idx in top_k_indices]

The function works by creating a subset of the animes DataFrame that only includes the columns corresponding to the user's favorite genres. The values in these columns are either 0 or 1, indicating whether the anime belongs to that genre or not. The function then sums up these values along the rows (axis=1) for each anime, creating a total "genre score" for each anime. This score represents how many of the user's favorite genres are present in each anime.

For example, if the user's favorite genres are ['Vampire', 'Horror', 'Demons'], the "genre score" for each anime would be the sum of the values in these three columns. An anime with all three genres would get a score of 3, while an anime with only one of these genres would get a score of 1.

The function then sorts the animes based on their genre scores in descending order and selects the top-k animes with the highest scores. These animes are recommended to the user since they align most closely with the user's favorite genres.

In [None]:
def get_anime_id(title):
    """Return the row position of the first anime whose title equals ``title``.

    The result is a 0-based position into ``animes`` (and therefore into the
    rows of the similarity matrix built from it), not an index label, because
    callers use it with positional indexing. Returns ``None`` when no row
    carries that title.
    """
    matches = np.flatnonzero((animes['title'] == title).to_numpy())
    if matches.size == 0:
        return None
    return int(matches[0])

def get_anime_title(anime_id):
    """Return the title stored at row position ``anime_id`` of ``animes``."""
    title_column = animes['title']
    return title_column.iloc[anime_id]

def recommend_animes_by_titles(user_favorite_titles, k=10):
    """Recommend animes most similar to the user's favorite titles.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    k : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``k`` titles, ordered by descending summed similarity to the
        favorites. The favorites themselves are never returned.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)
    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)

    # Exclude the favorites *before* taking the top k. Taking k + 1 candidates
    # and filtering afterwards returned fewer than k titles whenever more than
    # one favorite ranked near the top.
    eligible = np.ones(len(user_similarities), dtype=bool)
    eligible[favorite_positions] = False
    candidate_positions = np.flatnonzero(eligible)

    order = np.argsort(-user_similarities[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

The recommend_animes_by_titles() function is designed to recommend new animes to a user based on their favorite anime titles. Here's a simple explanation of how it works:

1. First, the function finds the IDs of the user's favorite animes by looking them up in the dataset; titles that cannot be found are ignored.
2. Next, it adds up the similarity scores between each favorite and every anime in the dataset. The more similar two animes are, the higher their similarity score.
3. The function then identifies the top-k animes with the highest summed similarity to the user's favorites.
4. To avoid recommending animes the user has already seen, it removes the user's favorite animes from that list.
5. Finally, the function returns the titles of the recommended animes.

In simple terms, the recommend_animes_by_titles() function recommends new animes to the user by finding animes that are most similar to their favorites based on their features.

In [None]:
def recommend_animes_by_genres_and_similarity(user_favorite_titles, user_favorite_genres, k=10):
    """Recommend animes using both favorite titles and favorite genres.

    Each anime's score is its genre score (number of favorite genres it has)
    multiplied by its summed similarity to the favorite titles, so an anime
    with none of the favorite genres scores zero.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    user_favorite_genres : collection of str
        Genre names; names that are not columns of ``animes`` are ignored.
    k : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``k`` titles by descending combined score, favorites excluded.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)

    genre_columns = [col for col in animes.columns if col in user_favorite_genres]
    genre_scores = animes[genre_columns].sum(axis=1).to_numpy(dtype=float)

    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)
    combined_scores = genre_scores * user_similarities

    # Remove the favorites before ranking so that k results survive even when
    # several favorites would otherwise occupy the top slots.
    eligible = np.ones(len(combined_scores), dtype=bool)
    eligible[favorite_positions] = False
    candidate_positions = np.flatnonzero(eligible)

    order = np.argsort(-combined_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]


The recommend_animes_by_genres_and_similarity() function is designed to recommend new animes to a user based on both their favorite anime titles and favorite genres. Here's a simple explanation of how it works:

1. First, the function finds the IDs of the user's favorite animes by looking them up in the dataset.
2. Next, it identifies the genre columns in the dataset that match the user's favorite genres and calculates a genre score for each anime by summing up their values in those columns.
3. The function then calculates the similarity between the user's favorite animes and all other animes in the dataset. The more similar two animes are, the higher their similarity score.
4. To combine the genre preferences with the similarity information, the function multiplies the genre scores with the similarity scores, creating a combined score for each anime.
5. The function identifies the top-k animes with the highest combined scores.
6. To avoid recommending animes the user has already seen, it removes the user's favorite animes from the list of top-k animes with the highest combined scores.
7. Finally, the function returns the titles of the top-k recommended animes.

In simple terms, the recommend_animes_by_genres_and_similarity() function recommends new animes to the user by finding animes that have similar features to their favorite titles and also match their favorite genres, resulting in a more personalized recommendation.

In [None]:
# Try the three recommenders on one example user.
user_favorites_titles = ['Naruto', 'Death Note', 'Attack on Titan']  # Example list of favorite animes

recommendations = recommend_animes_by_titles(user_favorites_titles)
print("User's favorite animes titles:", user_favorites_titles)
print("Recommended animes:", recommendations)

user_favorites_genres = ['Vampire', 'Horror', 'Demons']  # Example list of favorite genres
print()
recommendations = recommend_animes_by_genres(user_favorites_genres)
print("User's favorite animes genres:", user_favorites_genres)
print("Recommended animes:", recommendations)
print()
print()
recommendations = recommend_animes_by_genres_and_similarity(user_favorites_titles,user_favorites_genres)
print("Recommended animes combined:", recommendations)

In [None]:
# 1. Aggregate the scores for each anime from the reviews dataframe
aggregated_scores = reviews.groupby('anime_uid')['score'].agg(['mean', 'count']).reset_index()

# 2. Merge the aggregated scores with the animes dataframe.
# Columns left behind by an earlier run of this cell are dropped first;
# otherwise the merge would emit suffixed duplicates ('mean_x', 'mean_y', ...)
# and the scaling step below would fail with a KeyError.
animes = animes.drop(columns=['anime_uid', 'mean', 'count'], errors='ignore')
animes = pd.merge(animes, aggregated_scores, left_on='uid', right_on='anime_uid', how='left')

# Normalize the mean score and the number of reviews
animes[['mean', 'count']] = scaler.fit_transform(animes[['mean', 'count']])
# Animes without any review come out of the left merge as NaN; give them the
# lowest scaled value so later arithmetic and sorting never encounter NaN.
animes[['mean', 'count']] = animes[['mean', 'count']].fillna(0)

Calculating the count (number of reviews) along with the mean score is important because it provides context to the mean score and can help improve the quality of recommendations.

Consider two animes, A and B:

Anime A has a mean score of 9 based on 1,000 reviews.
Anime B has a mean score of 9.5 based on only 10 reviews.
While anime B has a higher mean score, it is based on a small number of reviews, which may not provide a reliable assessment of its quality. On the other hand, the mean score of anime A is based on a larger number of reviews, which indicates that it's more likely to be a better representation of the general audience's opinion.

By including the count (number of reviews) as a factor in the recommendation function, you give more importance to animes with a larger number of reviews. This can help ensure that the recommendations are not only based on high mean scores but also on the reliability of those scores.

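The review_count_weight parameter of the recommendation function below controls how much the normalized number of reviews contributes to the combined score, while score_weight does the same for the normalized mean review score.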

In [None]:
# Rebuild the feature table now that the review statistics were merged in.
animes_cleaned = animes.drop(['title', 'synopsis', 'genre', 'aired', 'img_url', 'link'], axis=1)
# 'uid' / 'anime_uid' are identifiers, not features; keep them out of the
# similarity computation.
animes_cleaned = animes_cleaned.drop(columns=['uid', 'anime_uid'], errors='ignore').fillna(0)
animes_cleaned

In [None]:
# Recompute the pairwise cosine similarity from the rebuilt feature table;
# this replaces the earlier `similarity_matrix`.
similarity_matrix = cosine_similarity(animes_cleaned)

In [None]:
def recommend_animes_by_titles_and_score(user_favorite_titles, k=10, score_weight=0.5, review_count_weight=0.5):
    """Recommend animes by similarity to favorites, boosted by review statistics.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    k : int, default 10
        Maximum number of titles to return.
    score_weight : float, default 0.5
        Weight applied to the ``'mean'`` column of ``animes``.
    review_count_weight : float, default 0.5
        Weight applied to the ``'count'`` column of ``animes``.

    Returns
    -------
    list
        Up to ``k`` titles by descending combined score, favorites excluded.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)
    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)

    # Missing review statistics count as 0: a NaN in the combined score would
    # otherwise corrupt the ranking.
    mean_scores = animes['mean'].fillna(0).to_numpy(dtype=float)
    review_counts = animes['count'].fillna(0).to_numpy(dtype=float)

    # Weighted sum of user_similarities, mean score, and review count
    combined_scores = user_similarities + score_weight * mean_scores + review_count_weight * review_counts

    # Drop the favorites before ranking so that k results remain.
    eligible = np.ones(len(combined_scores), dtype=bool)
    eligible[favorite_positions] = False
    candidate_positions = np.flatnonzero(eligible)

    order = np.argsort(-combined_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

# Test the new recommendation function with its default weights, reusing the
# favorite titles defined in an earlier cell.
recommendations = recommend_animes_by_titles_and_score(user_favorites_titles)
print("User's favorite animes titles:", user_favorites_titles)
print("Recommended animes based on titles and scores:", recommendations)

Increase the weight of the user's favorite genres: When calculating the combined_scores in the recommend_animes_by_genres_and_similarity() function, you can increase the weight given to the user's favorite genres to emphasize their importance.

In [None]:
def get_genre_score(user_favorite_genres):
    """Count, per anime, how many of the given genres it is tagged with.

    Selects the columns of ``animes`` whose name appears in
    ``user_favorite_genres`` and returns their row-wise sum as a Series
    indexed like ``animes``.
    """
    wanted = list(filter(lambda name: name in user_favorite_genres, animes.columns))
    return animes[wanted].sum(axis=1)

In [None]:
# Display the animes table in its current state.
animes

In [None]:
# Example: per-anime genre score for three sample genres.
user_favorites_genres = ['Supernatural', 'Vampire', 'Demons']
genre_scores = get_genre_score(user_favorites_genres)
genre_scores

In [None]:
def recommend_animes_by_genres_and_similarity(user_favorite_titles, user_favorite_genres, k=10, genre_weight=2.0):
    """Recommend animes from favorite titles plus a weighted genre bonus.

    The combined score of an anime is its summed similarity to the favorite
    titles plus ``genre_weight`` times its genre score.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    user_favorite_genres : collection of str
        Genre names passed to ``get_genre_score``.
    k : int, default 10
        Maximum number of titles to return.
    genre_weight : float, default 2.0
        Multiplier applied to the genre score before it is added.

    Returns
    -------
    list
        Up to ``k`` titles by descending combined score, favorites excluded.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)

    genre_scores = np.asarray(get_genre_score(user_favorite_genres), dtype=float)

    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)
    combined_scores = user_similarities + genre_weight * genre_scores

    # Drop the favorites before ranking; filtering a k + 1 shortlist afterwards
    # returned fewer than k titles when several favorites ranked at the top.
    eligible = np.ones(len(combined_scores), dtype=bool)
    eligible[favorite_positions] = False
    candidate_positions = np.flatnonzero(eligible)

    order = np.argsort(-combined_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

# Test the updated recommendation function using its default genre weight.
user_favorites_titles = ['Naruto', 'Death Note', 'Attack on Titan']
user_favorites_genres = ['Vampire', 'Horror', 'Demons']
recommendations = recommend_animes_by_genres_and_similarity(user_favorites_titles, user_favorites_genres)
print("Recommended animes:", recommendations)

In [None]:
def recommend_animes_by_genres_and_similarity(user_favorite_titles, user_favorite_genres, k=10, genre_weight=3.0):
    """Recommend animes from favorite titles plus a weighted genre bonus.

    Same scoring as before (summed similarity to the favorites plus
    ``genre_weight`` times the genre score) with a stronger default weight.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    user_favorite_genres : collection of str
        Genre names passed to ``get_genre_score``.
    k : int, default 10
        Maximum number of titles to return.
    genre_weight : float, default 3.0
        Multiplier applied to the genre score before it is added.

    Returns
    -------
    list
        Up to ``k`` titles by descending combined score, favorites excluded.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)

    genre_scores = np.asarray(get_genre_score(user_favorite_genres), dtype=float)

    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)
    combined_scores = user_similarities + genre_weight * genre_scores

    # Exclude the favorites up front so the caller really gets k titles.
    eligible = np.ones(len(combined_scores), dtype=bool)
    eligible[favorite_positions] = False
    candidate_positions = np.flatnonzero(eligible)

    order = np.argsort(-combined_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

# Test the updated recommendation function (default genre weight is now 3.0).
user_favorites_titles = ['Naruto', 'Death Note', 'Attack on Titan']
user_favorites_genres = ['Vampire', 'Horror', 'Demons']
recommendations = recommend_animes_by_genres_and_similarity(user_favorites_titles, user_favorites_genres)
print("Recommended animes:", recommendations)

In [None]:
def recommend_animes_by_genres_only(user_favorite_genres, k=10):
    """Recommend animes purely by how many favorite genres they carry.

    Parameters
    ----------
    user_favorite_genres : collection of str
        Genre names passed to ``get_genre_score``.
    k : int, default 10
        Maximum number of titles to return; ``k <= 0`` yields an empty list.

    Returns
    -------
    list
        Titles ordered by descending genre score. Ties keep table order.
    """
    genre_scores = np.asarray(get_genre_score(user_favorite_genres), dtype=float)

    # Stable descending order; [:k] (not [-k:]) so that k == 0 selects nothing.
    top_k_indices = np.argsort(-genre_scores, kind='stable')[:max(k, 0)]

    return [get_anime_title(idx) for idx in top_k_indices]

# Test the recommendation function with genres only (default k).
user_favorites_genres = ['Vampire', 'Horror', 'Demons']
recommendations = recommend_animes_by_genres_only(user_favorites_genres)
print("Recommended animes:", recommendations)


In [None]:
# Genres treated as adults-only content.
genres_18_above = ['Hentai', 'Ecchi', 'Harem', 'Yuri', 'Yaoi']

# Function to check if any genre from the array is in the anime genres
def is_18_above(genre_str):
    """Flag an anime as adults-only based on its raw genre string.

    Returns 1 when any entry of ``genres_18_above`` occurs as a substring of
    ``genre_str``, 0 when none does, and ``None`` when ``genre_str`` is not a
    string (e.g. a missing value), so unknown genres stay distinguishable.
    """
    if not isinstance(genre_str, str):
        return None
    # Every listed genre must be checked; returning 0 from inside the loop
    # would stop after testing only the first one.
    return int(any(genre in genre_str for genre in genres_18_above))

# Add the '18_above' column to the animes DataFrame by applying the flag
# function to each raw genre string.
animes['18_above'] = animes['genre'].apply(is_18_above)

# Print how many animes fall under each flag value.
print(animes['18_above'].value_counts())

In [None]:
def recommend_animes_by_genres_only(user_favorite_genres, k=10, age=None):
    """Recommend animes by genre score, optionally hiding adults-only titles.

    Parameters
    ----------
    user_favorite_genres : collection of str
        Genre names passed to ``get_genre_score``.
    k : int, default 10
        Maximum number of titles to return.
    age : int or None, default None
        When given and below 18, only animes whose ``'18_above'`` flag equals
        0 are considered.

    Returns
    -------
    list
        Titles ordered by descending genre score. Ties keep table order.
    """
    genre_scores = np.asarray(get_genre_score(user_favorite_genres), dtype=float)

    # Track candidates by their position in `animes`. Filtering the scores
    # first and sorting afterwards yields positions inside the *filtered*
    # data, which point at the wrong rows of the full table.
    candidate_positions = np.arange(len(genre_scores))
    if age is not None and age < 18:
        allowed = (animes['18_above'] == 0).to_numpy()
        candidate_positions = candidate_positions[allowed]

    order = np.argsort(-genre_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

def recommend_animes_by_titles(user_favorite_titles, k=10, age=None):
    """Recommend animes similar to the favorites, optionally age-filtered.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    k : int, default 10
        Maximum number of titles to return.
    age : int or None, default None
        When given and below 18, only animes whose ``'18_above'`` flag equals
        0 are considered.

    Returns
    -------
    list
        Up to ``k`` titles by descending summed similarity, favorites excluded.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)
    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)

    # Build one eligibility mask over the full table so the positions that are
    # ranked below still refer to rows of `animes` (ranking a filtered copy
    # of the scores would shift them).
    eligible = np.ones(len(user_similarities), dtype=bool)
    eligible[favorite_positions] = False
    if age is not None and age < 18:
        eligible &= (animes['18_above'] == 0).to_numpy()
    candidate_positions = np.flatnonzero(eligible)

    order = np.argsort(-user_similarities[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

def recommend_animes_by_genres_and_similarity(user_favorite_titles, user_favorite_genres, k=10, age=None):
    """Recommend by genre score times similarity, optionally age-filtered.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    user_favorite_genres : collection of str
        Genre names; names that are not columns of ``animes`` are ignored.
    k : int, default 10
        Maximum number of titles to return.
    age : int or None, default None
        When given and below 18, only animes whose ``'18_above'`` flag equals
        0 are considered.

    Returns
    -------
    list
        Up to ``k`` titles by descending combined score, favorites excluded.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)

    genre_columns = [col for col in animes.columns if col in user_favorite_genres]
    genre_scores = animes[genre_columns].sum(axis=1).to_numpy(dtype=float)

    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)
    combined_scores = genre_scores * user_similarities

    # One mask over the full table keeps ranked positions aligned with the
    # rows of `animes`, and removes favorites before the top k is taken.
    eligible = np.ones(len(combined_scores), dtype=bool)
    eligible[favorite_positions] = False
    if age is not None and age < 18:
        eligible &= (animes['18_above'] == 0).to_numpy()
    candidate_positions = np.flatnonzero(eligible)

    order = np.argsort(-combined_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]


In [None]:
# Exercise the three age-aware recommenders for an under-18 example user.
user_favorites_titles = ['Naruto', 'Death Note', 'Attack on Titan']
user_favorites_genres = ['Vampire', 'Horror', 'Demons']
user_age = 16

recommendations = recommend_animes_by_titles(user_favorites_titles, age=user_age)
print("Recommended animes by titles:", recommendations)

recommendations = recommend_animes_by_genres_only(user_favorites_genres, age=user_age)
print("Recommended animes by genres:", recommendations)

recommendations = recommend_animes_by_genres_and_similarity(user_favorites_titles, user_favorites_genres, age=user_age)
print("Recommended animes combined:", recommendations)

In [None]:
def recommend_animes_by_genres_only(user_favorite_genres, k=10, age=None, type=None):
    """Recommend animes by genre score with optional age and format filters.

    Parameters
    ----------
    user_favorite_genres : collection of str
        Genre names passed to ``get_genre_score``.
    k : int, default 10
        Maximum number of titles to return.
    age : int or None, default None
        When given and below 18, only animes whose ``'18_above'`` flag equals
        0 are considered.
    type : {'movie', 'series', None}, default None
        ``'movie'`` keeps animes with exactly one episode, ``'series'`` keeps
        animes with more than one; any other value applies no format filter.

    Returns
    -------
    list
        Titles ordered by descending genre score. Ties keep table order.
    """
    genre_scores = np.asarray(get_genre_score(user_favorite_genres), dtype=float)

    # Work with row positions throughout: titles are looked up by position, so
    # passing index labels would break whenever the index is not 0..n-1. A
    # boolean mask also avoids copying the whole table just to filter it.
    eligible = np.ones(len(genre_scores), dtype=bool)

    if age is not None and age < 18:
        eligible &= (animes['18_above'] == 0).to_numpy()

    if type == 'movie':
        eligible &= (animes['episodes'] == 1).to_numpy()
    elif type == 'series':
        eligible &= (animes['episodes'] > 1).to_numpy()

    candidate_positions = np.flatnonzero(eligible)
    order = np.argsort(-genre_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

def recommend_animes_by_titles(user_favorite_titles, k=10, age=None, type=None):
    """Recommend animes similar to the favorites with age and format filters.

    ``age`` below 18 keeps only rows whose ``'18_above'`` flag equals 0.
    ``type='movie'`` keeps rows with exactly one episode and ``type='series'``
    rows with more than one. The ``k + len(favorites)`` best remaining rows
    are shortlisted, the favorites are removed, and up to ``k`` titles are
    returned in descending order of summed similarity.
    """
    looked_up = (get_anime_id(title) for title in user_favorite_titles)
    user_favorites_ids = [found for found in looked_up if found is not None]
    summed = np.sum(similarity_matrix[user_favorites_ids], axis=0)

    # Start with every row eligible and narrow the mask down filter by filter.
    eligible = pd.Series(True, index=animes.index)
    is_minor = age is not None and age < 18
    if is_minor:
        eligible &= animes['18_above'] == 0
    if type == 'movie':
        eligible &= animes['episodes'] == 1
    elif type == 'series':
        eligible &= animes['episodes'] > 1

    scores = pd.Series(summed, index=animes.index)[eligible]
    shortlist = scores.nlargest(k + len(user_favorites_ids)).index
    picks = [label for label in shortlist if label not in user_favorites_ids][:k]

    return list(map(get_anime_title, picks))

def recommend_animes_by_genres_and_similarity(user_favorite_titles, user_favorite_genres, k=10, age=None, type=None):
    """Recommend by genre score times similarity, with age and format filters.

    Parameters
    ----------
    user_favorite_titles : iterable of str
        Titles the user likes; titles not found in ``animes`` are skipped.
    user_favorite_genres : collection of str
        Genre names; names that are not columns of ``animes`` are ignored.
    k : int, default 10
        Maximum number of titles to return.
    age : int or None, default None
        When given and below 18, only animes whose ``'18_above'`` flag equals
        0 are considered.
    type : {'movie', 'series', None}, default None
        ``'movie'`` keeps animes with exactly one episode, ``'series'`` keeps
        animes with more than one; any other value applies no format filter.

    Returns
    -------
    list
        Up to ``k`` titles by descending combined score, favorites excluded.
    """
    user_favorites_ids = [get_anime_id(title) for title in user_favorite_titles]
    user_favorites_ids = [x for x in user_favorites_ids if x is not None]  # Remove any None values
    favorite_positions = np.asarray(user_favorites_ids, dtype=int)

    genre_columns = [col for col in animes.columns if col in user_favorite_genres]
    genre_scores = animes[genre_columns].sum(axis=1).to_numpy(dtype=float)

    user_similarities = np.sum(similarity_matrix[favorite_positions], axis=0)
    combined_scores = genre_scores * user_similarities

    # The filters must actually restrict the candidates that get ranked;
    # computing a filtered table and then ranking the unfiltered scores lets
    # excluded titles through.
    eligible = np.ones(len(combined_scores), dtype=bool)
    eligible[favorite_positions] = False

    if age is not None and age < 18:
        eligible &= (animes['18_above'] == 0).to_numpy()

    if type == 'movie':
        eligible &= (animes['episodes'] == 1).to_numpy()
    elif type == 'series':
        eligible &= (animes['episodes'] > 1).to_numpy()

    candidate_positions = np.flatnonzero(eligible)
    order = np.argsort(-combined_scores[candidate_positions], kind='stable')[:max(k, 0)]
    top_k_indices = candidate_positions[order]

    return [get_anime_title(idx) for idx in top_k_indices]

In [None]:
# Exercise the three recommenders with both the age and the format filter set.
user_favorites_titles = ['Naruto', 'Death Note', 'Attack on Titan']
user_favorites_genres = ['Vampire', 'Horror', 'Demons']
user_age = 16
type_anime='movie'

recommendations = recommend_animes_by_titles(user_favorites_titles, age=user_age,type=type_anime)
print("Recommended animes by titles:", recommendations)

recommendations = recommend_animes_by_genres_only(user_favorites_genres, age=user_age,type=type_anime)
print("Recommended animes by genres:", recommendations)

recommendations = recommend_animes_by_genres_and_similarity(user_favorites_titles, user_favorites_genres, age=user_age,type=type_anime)
print("Recommended animes combined:", recommendations)

# Cold Start Recommendation

In [None]:
def recommend_animes_by_genres_only(user_favorite_genres, k=10, age=None, type=None):
    """Cold-start recommender: rank animes by genre score alone.

    ``age`` below 18 keeps only rows whose ``'18_above'`` flag equals 0.
    ``type='movie'`` keeps rows with exactly one episode and ``type='series'``
    rows with more than one. The titles of the ``k`` highest-scoring remaining
    rows are returned, best first.
    """
    # Narrow an all-True mask instead of filtering a copy of the table.
    eligible = pd.Series(True, index=animes.index)
    is_minor = age is not None and age < 18
    if is_minor:
        eligible &= animes['18_above'] == 0
    if type == 'movie':
        eligible &= animes['episodes'] == 1
    elif type == 'series':
        eligible &= animes['episodes'] > 1

    all_scores = pd.Series(get_genre_score(user_favorite_genres), index=animes.index)
    best = all_scores[eligible].nlargest(k).index

    return list(map(get_anime_title, best))

In [None]:
# Cold-start example: only genres, age and format are used for the call below.
user_favorites_titles = ['Naruto', 'Death Note', 'Attack on Titan']
user_favorites_genres = ['Vampire', 'Horror', 'Demons']
user_age = 16
type_anime='movie'


recommendations = recommend_animes_by_genres_only(user_favorites_genres, age=user_age,type=type_anime)
print("Recommended animes by genres:", recommendations)

In [None]:
# NOTE(review): user_feature_matrix and favorites_anime_matrix are first
# assigned in a later cell of this notebook, so on a fresh kernel run in order
# this cell raises NameError — TODO move it below that cell or remove it.
# Reset the index for user_feature_matrix and favorites_anime_matrix
user_feature_matrix = user_feature_matrix.reset_index()
favorites_anime_matrix = favorites_anime_matrix.reset_index()

# Combine the gender column with the user-anime preference matrix
user_feature_matrix = pd.concat([user_feature_matrix, favorites_anime_matrix], axis=1)

In [None]:
# Inspect the 'profile' column of the profiles table.
profiles['profile']

In [None]:
# Pivot the reviews into a table with one row per profile and one column per
# anime_uid, holding the review score (NaN where a profile has no review).
user_item_matrix = reviews.pivot_table(index='profile', columns='anime_uid', values='score')
user_item_matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the user-item matrix, with
# missing scores treated as 0. Row/column i corresponds to
# user_item_matrix.index[i].
user_similarity_matrix = cosine_similarity(user_item_matrix.fillna(0))


In [None]:
# Display the user-user similarity matrix.
user_similarity_matrix

In [None]:
def find_k_similar_users(user, k=5):
    """Find the ``k`` users most similar to ``user``.

    Parameters
    ----------
    user : hashable
        A label of ``user_item_matrix.index``; an unknown label raises the
        ``KeyError`` produced by ``Index.get_loc``.
    k : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    tuple
        ``(top_k_users, top_k_scores)``: a list of index labels and a numpy
        array with the matching similarity scores, most similar first. The
        user is never part of the result.
    """
    user_index = user_item_matrix.index.get_loc(user)
    user_similarities = user_similarity_matrix[user_index]

    # Rank everyone and then drop the user's own position. Writing a sentinel
    # into `user_similarities` would modify the shared similarity matrix,
    # because a row taken from a numpy array is a view, not a copy.
    ranking = np.argsort(-user_similarities, kind='stable')
    top_k_indices = ranking[ranking != user_index][:max(k, 0)]

    top_k_users = [user_item_matrix.index[idx] for idx in top_k_indices]
    top_k_scores = user_similarities[top_k_indices]

    return top_k_users, top_k_scores


In [None]:
# Check which columns the animes table currently has.
animes.columns

In [None]:
def recommend_animes_for_user(user, k=5):
    """Recommend animes to ``user`` from the ratings of similar users.

    The scores of the ``k`` most similar users are averaged, weighted by
    similarity, and the best-scoring animes the user has not rated are
    returned.

    Parameters
    ----------
    user : hashable
        A label of ``user_item_matrix.index``.
    k : int, default 5
        Used both as the number of similar users consulted and as the
        maximum number of titles returned.

    Returns
    -------
    list
        Up to ``k`` titles, best predicted preference first. Animes whose uid
        has no row in ``animes`` are skipped.
    """
    similar_users, similar_users_scores = find_k_similar_users(user, k)
    similar_users_preferences = user_item_matrix.loc[similar_users].fillna(0)
    weighted_preferences = similar_users_preferences.mul(similar_users_scores, axis=0)
    user_preferences = weighted_preferences.sum(axis=0)

    # Normalise by the total similarity, unless it is zero (nothing to weight).
    total_similarity = similar_users_scores.sum()
    if total_similarity > 0:
        user_preferences = user_preferences / total_similarity

    # Ignore animes the user has already rated
    user_rated_animes = user_item_matrix.loc[user].dropna().index
    user_preferences = user_preferences.drop(user_rated_animes)

    # The preference Series is labelled by anime uid, so rank its *labels*;
    # positional argsort results are column positions, not uids.
    ranked_uids = user_preferences.sort_values(ascending=False, kind='mergesort').index

    # First title per uid, mirroring a "first matching row" lookup.
    title_by_uid = animes.drop_duplicates(subset='uid').set_index('uid')['title']
    known_uids = [uid for uid in ranked_uids if uid in title_by_uid.index]
    top_k_animes = [title_by_uid[uid] for uid in known_uids[:max(k, 0)]]

    return top_k_animes


In [None]:
# Example: user-based recommendations for one profile name from the reviews.
user = 'DesolatePsyche'
recommendations = recommend_animes_for_user(user)
print("Recommended animes for user:", recommendations)


In [None]:
# Number of rows in the animes table.
len(animes)

In [None]:
# Encode gender as a signed indicator: 'Male' -> 1, 'Female' -> -1, anything
# else (including missing) -> 0. Already-encoded 1 / -1 values map to
# themselves, so re-running this cell does not wipe the column to zeros.
gender_codes = {'Male': 1, 'Female': -1, 1: 1, -1: -1}
profiles['gender'] = profiles['gender'].map(gender_codes).fillna(0).astype(int)
user_feature_matrix = profiles[['profile', 'gender']].set_index('profile')


def _parse_favorites(value):
    """Turn a list that was read from CSV as text back into a real list.

    Values that are not strings, or that do not parse as a list/tuple/set
    literal, are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value
    return list(parsed) if isinstance(parsed, (list, tuple, set)) else value


# Create user-anime preference matrix from the favorites_anime column.
# A column loaded with read_csv holds text, which explode() leaves untouched,
# so the cells are parsed into lists first.
# NOTE(review): presumably each cell is a Python-style list literal such as
# "['1', '2']" — verify against the CSV; unparseable cells are kept as-is.
favorites_anime_matrix = profiles[['profile', 'favorites_anime']].copy()
favorites_anime_matrix['favorites_anime'] = favorites_anime_matrix['favorites_anime'].map(_parse_favorites)
favorites_anime_matrix = favorites_anime_matrix.explode('favorites_anime')
favorites_anime_matrix['preference'] = 1
favorites_anime_matrix = favorites_anime_matrix.pivot_table(index='profile', columns='favorites_anime', values='preference').fillna(0)

# Combine the gender column with the user-anime preference matrix
user_feature_matrix = pd.concat([user_feature_matrix, favorites_anime_matrix], axis=1)

In [None]:
# Attach each reviewer's encoded gender to their reviews, joining on 'profile'.
reviews_with_gender = reviews.merge(profiles[['profile', 'gender']], on='profile')

In [None]:
# Pivot the merged reviews into a profile x anime_uid score table and append
# its columns to the user feature matrix.
user_anime_rating_matrix = reviews_with_gender.pivot_table(index='profile', columns='anime_uid', values='score')
user_feature_matrix = pd.concat([user_feature_matrix, user_anime_rating_matrix], axis=1)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Recompute user-user cosine similarity from the combined feature matrix
# (missing values treated as 0); this replaces the earlier matrix.
# NOTE(review): find_k_similar_users maps users to rows through
# user_item_matrix.index, while these rows follow user_feature_matrix —
# confirm the two row orders match before reusing that helper.
user_similarity_matrix = cosine_similarity(user_feature_matrix.fillna(0))
