In [6]:
import pandas as pd

Keyword = pd.read_csv("../NewData/NewKeyword.csv")
User = pd.read_csv("../NewData/NewUser.csv")

merge = Keyword.merge(User,on="title",how="inner")
merge = merge.drop('id', axis=1)

merge.head()

Unnamed: 0,title,genres,overview,keywords,vote_average,year,user_id,movie_id,rating
0,Titanic,Drama Romance Thriller,"84 years later, a 101-year-old woman named Ros...",shipwreck iceberg ship panic titanic ocean lin...,7.5,1997,240,313,5
1,Titanic,Drama Romance Thriller,"84 years later, a 101-year-old woman named Ros...",shipwreck iceberg ship panic titanic ocean lin...,7.5,1997,134,313,5
2,Titanic,Drama Romance Thriller,"84 years later, a 101-year-old woman named Ros...",shipwreck iceberg ship panic titanic ocean lin...,7.5,1997,319,313,5
3,Titanic,Drama Romance Thriller,"84 years later, a 101-year-old woman named Ros...",shipwreck iceberg ship panic titanic ocean lin...,7.5,1997,111,313,4
4,Titanic,Drama Romance Thriller,"84 years later, a 101-year-old woman named Ros...",shipwreck iceberg ship panic titanic ocean lin...,7.5,1997,236,313,4


In [7]:
# content-based model 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from scipy.sparse.linalg import svds

movies_unique = merge[['movie_id', 'title', 'genres', 'keywords', 'overview']].drop_duplicates(subset='movie_id')

movies_unique['genres'] = movies_unique['genres'].fillna('')
movies_unique['keywords'] = movies_unique['keywords'].fillna('')
movies_unique['overview'] = movies_unique['overview'].fillna('')

movies_unique['content'] = movies_unique['genres'] + ' ' + movies_unique['keywords'] + ' ' + movies_unique['overview']

tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies_unique['content'])

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

movies_unique = movies_unique.reset_index(drop=True)
indices = pd.Series(movies_unique.index, index=movies_unique['title'])

def content_based(title, cosine_sim):
    if title not in indices:
        return []
    idx = indices[title]
    scores = cosine_sim[idx]
    sim_scores = []
    for i in range(len(scores)):
        sim_scores.append((i, scores[i]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = []
    for i in sim_scores:
        movie_indices.append(i[0])
    return movies_unique['title'].iloc[movie_indices].tolist()

# Collaborative Filtering

merge_clean = merge.drop_duplicates(subset=['user_id', 'movie_id'], keep='last')
user_matrix = merge_clean.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
matrix_values = user_matrix.values

U, sigma, Vt = svds(matrix_values, k=50)
sigma = np.diag(sigma)

predicted_ratings = np.dot(np.dot(U, sigma), Vt)

predictions_df = pd.DataFrame(predicted_ratings, 
                               index=user_matrix.index,
                               columns=user_matrix.columns)

def collaborative_recommender(user_id):
    if user_id not in user_matrix.index:
        return []
    user_idx = user_matrix.index.get_loc(user_id)
    user_ratings = user_matrix.iloc[user_idx]
    user_predictions = predictions_df.iloc[user_idx]
    unwatched_movies = user_ratings[user_ratings == 0].index
    recommendations = user_predictions[unwatched_movies].sort_values(ascending=False)
    top_movies = recommendations.head(10).index
    result = movies_unique[movies_unique['movie_id'].isin(top_movies)]['title'].tolist()
    return result

# hybrid
def hybrid(title, user_id, cosine_sim):
    content_recs = content_based(title, cosine_sim)
    collab_recs = collaborative_recommender(user_id)
    
    unique_collab = []
    for movie in collab_recs:
        if movie not in content_recs:
            unique_collab.append(movie)
    
    final_recs = content_recs[:5] + unique_collab[:5]
    
    return final_recs

print(hybrid('Toy Story', 196, cosine_sim))

['Pinocchio', 'Little Women', 'Vampire in Brooklyn', 'Bogus', 'Raging Bull', "My Best Friend's Wedding", 'Glory', 'Back to the Future', 'When Harry Met Sally...', 'Pretty Woman']


In [8]:
import pickle

models = {
    'cosine_sim': cosine_sim,
    'indices': indices,
    'movies_unique': movies_unique,
    'predictions_df': predictions_df,
    'user_matrix': user_matrix
}

with open('../Api/recommender_models.pkl', 'wb') as f:
    pickle.dump(models, f)

print("Models saved successfully!")

Models saved successfully!
