In [1]:
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# Location of the anime CSV. Set the ANIME_DATA_PATH environment variable to
# run the notebook on another machine; the original absolute path is kept as
# the fallback so existing behaviour is unchanged when the variable is unset.
DATA_PATH = os.environ.get(
    'ANIME_DATA_PATH',
    '/Users/tejaswinitippanaboina/Desktop/Anime project/anime 2 data.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Missing-value imputation.
#
# Each filled Series is assigned back to its column instead of calling
# df[column].fillna(..., inplace=True). The in-place form acts on an
# intermediate object obtained through chained indexing: pandas 2.x emits a
# FutureWarning for it, and with Copy-on-Write (the pandas 3.0 default) it no
# longer updates df at all.

# For numeric columns, fill NaNs with the column median
numerical_columns = ['eps', 'startYr', 'finishYr', 'rating', 'votes']
for column in numerical_columns:
    df[column] = df[column].fillna(df[column].median())

# For categorical columns with a relatively small number of NaNs, fill NaNs with the mode
categorical_columns = ['mediaType']
for column in categorical_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

# For text columns, fill NaNs with an empty string
text_columns = ['description', 'sznOfRelease']  # 'sznOfRelease' might have a textual value like "Spring"
for column in text_columns:
    df[column] = df[column].fillna('')

# Impute 'duration' and 'watched' with their medians as well
# ('watched' is assumed to be a numeric column).
for column in ('duration', 'watched'):
    df[column] = df[column].fillna(df[column].median())


In [4]:
# Concatenate the text-like columns into one document per row for TF-IDF.
# Every component is NaN-filled here because string concatenation with a NaN
# yields NaN for the whole row, and TfidfVectorizer rejects NaN documents;
# 'tags' and 'studios' are not covered by the imputation cell.
df['combined_features'] = (
    df['description'].fillna('')
    + ' ' + df['tags'].fillna('')
    + ' ' + df['mediaType'].fillna('')
    + ' ' + df['studios'].fillna('')
)


In [5]:
# TF-IDF vectorizer over the combined text; stop_words='english' drops
# scikit-learn's built-in English stop-word list before weighting.
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the 'combined_features' column and transform it in the
# same call; the result has one row per row of df.
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [6]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# With the second argument omitted, cosine_similarity compares X with itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [7]:
# Lookup table: title -> row position in df (and therefore in cosine_sim).
# Positions are stored explicitly so the lookup matches the positional
# indexing used on cosine_sim and with .iloc.
indices = pd.Series(range(len(df)), index=df['title'])

# Keep only the first row for each title. Series.drop_duplicates() compares
# the *values* (the row numbers, which are already unique), so it would leave
# repeated titles in place and indices[title] could return several rows.
indices = indices[~indices.index.duplicated(keep='first')]

In [11]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarities of row
        ``i`` of ``df`` to every row.
    df : pandas.DataFrame
        Frame with a ``'title'`` column, positionally aligned with
        ``cosine_sim``.
    indices : mapping or pandas.Series
        Maps a title to its row position.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, ordered from most to least similar. The
        queried row itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the row position of the anime that matches the title
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"Title not found in dataset: {title!r}") from None

    # A title that occurs more than once in a Series index yields a Series of
    # positions; use the first occurrence so idx is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other row with its similarity to the query row. The query row
    # is excluded by position instead of by dropping the first sorted entry:
    # a row with identical features (tie at the top) or a query with an empty
    # feature vector (self-similarity 0) would otherwise leave the query in
    # the results and discard a genuine match.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar anime
    anime_indices = [position for position, _ in sim_scores[:top_n]]

    return df['title'].iloc[anime_indices]

# Demo: look up the ten closest matches for 'Naruto' and show them
recommended_anime = get_recommendations(title='Naruto')
print(recommended_anime)

561                                      Naruto Shippuden
968                              Boruto: Naruto the Movie
4965    Naruto Special 2: Battle at Hidden Falls. I am...
3533                      Boruto: Naruto Next Generations
5226                       Naruto: Konoha Sports Festival
1718             Naruto Shippuden Movie 4: The Lost Tower
2081                      Naruto Shippuden Movie 2: Bonds
1856           Naruto Shippuden Movie 3: The Will of Fire
3890    Naruto Movie 3: Guardians of the Crescent Moon...
814               Naruto Shippuden Movie 6: Road to Ninja
Name: title, dtype: object
