In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack,csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
RESOURCE_PATH = os.path.join('../','resources')
FILM_PATH = '../resources/cleaned_data/cleaned_data.csv'
RATINGS_PATH = os.path.join(RESOURCE_PATH, 'data', 'train_val_test')

rating_files = ['ratings_train.csv','ratings_val.csv','ratings_test.csv']
ratings_dataframes = []

for file in rating_files:
    file_path = os.path.join(RATINGS_PATH, file)
    try:
        df = pd.read_csv(file_path, delimiter=',', header=None, names=['user_id','fid', 'rating'])
        ratings_dataframes.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")
user_rating_train = ratings_dataframes[0]
user_rating_val = ratings_dataframes[1]
user_rating_test = ratings_dataframes[2]

ratings_data = pd.concat(ratings_dataframes[0:2], ignore_index=True)

data = pd.read_csv(FILM_PATH)


In [3]:
columns_to_keep = ['fid', 'contentRating', 
                   'genre', 'keywords', 'duration', 'actor', 'director']

film_data = data[columns_to_keep]
print(film_data.info())
film_data.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9814 entries, 0 to 9813
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fid            9814 non-null   int64  
 1   contentRating  9814 non-null   object 
 2   genre          9814 non-null   object 
 3   keywords       9814 non-null   object 
 4   duration       9814 non-null   float64
 5   actor          9814 non-null   object 
 6   director       9814 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 536.8+ KB
None


Unnamed: 0,fid,contentRating,genre,keywords,duration,actor,director
0,468569,PG,"Action,Crime,Drama","psychopath,superhero,moral dilemma,clown,crimi...",9120.0,"Christian Bale,Heath Ledger,Aaron Eckhart",Christopher Nolan
1,1375666,PG,"Action,Adventure,Sci-Fi","dream,ambiguous ending,subconscious,mindbender...",8880.0,"Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot ...",Christopher Nolan


In [4]:
combined_features = film_data['genre']+ " " + film_data['director'] + " " + film_data['actor'] + " " + film_data['keywords']
combined_features

0       Action,Crime,Drama Christopher Nolan Christian...
1       Action,Adventure,Sci-Fi Christopher Nolan Leon...
2       Drama Frank Darabont Tim Robbins,Morgan Freema...
3       Action,Sci-Fi Lana Wachowski,Lilly Wachowski K...
4       Crime,Drama Quentin Tarantino John Travolta,Um...
                              ...                        
9809    Drama,War Claire Denis Denis Lavant,Michel Sub...
9810    Action,Adventure,Comedy Michel Hazanavicius Fr...
9811    Documentary,Crime,Drama Werner Herzog Werner H...
9812    Drama Hirokazu Koreeda Hiroshi Abe,Yui Natsuka...
9813    Crime,Drama,Mystery Benjamin Caron Benedict Cu...
Length: 9814, dtype: object

In [5]:
from sklearn.preprocessing import MinMaxScaler


vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)
print(f"Vecotrs shape: {feature_vectors.shape}")

one_hot_encoder = OneHotEncoder(sparse_output=True)
content_ratings_encoded = one_hot_encoder.fit_transform(film_data[['contentRating']])

scaler = MinMaxScaler()
duration_normalized = scaler.fit_transform(film_data[['duration']])  # Normalize to range [0, 1]
# Convert to sparse matrix
duration_sparse = csr_matrix(duration_normalized)
# Step 4: Combine TF-IDF features and one-hot encoded features
combined_features_matrix = hstack([feature_vectors, content_ratings_encoded,duration_normalized])
combined_features_matrix = csr_matrix(combined_features_matrix)

print("Combined features shape:", combined_features_matrix.shape)

Vecotrs shape: (9814, 21472)
Combined features shape: (9814, 21481)


In [6]:
# Normalize all columns
# getting the similarity scores using cosine similarity
similarity = cosine_similarity(combined_features_matrix, combined_features_matrix)
print(f"Similarity shape: {similarity.shape}")
print(similarity)

Similarity shape: (9814, 9814)
[[1.         0.5720574  0.0483243  ... 0.04068326 0.03980399 0.03517427]
 [0.5720574  1.         0.04554822 ... 0.03457253 0.03710856 0.02881486]
 [0.0483243  0.04554822 1.         ... 0.03457137 0.03721218 0.04892185]
 ...
 [0.04068326 0.03457253 0.03457137 ... 1.         0.02853128 0.02575137]
 [0.03980399 0.03710856 0.03721218 ... 0.02853128 1.         0.51248339]
 [0.03517427 0.02881486 0.04892185 ... 0.02575137 0.51248339 1.        ]]


In [7]:
all_film_id = film_data['fid'].tolist()
film_id_to_index = {fid: idx for idx, fid in enumerate(all_film_id)}
# features_df = pd.DataFrame(feature_vectors.toarray(), index=list_of_all_titles)


In [8]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(combined_features_matrix)


In [9]:
def all_watched_fid(user_id, ratings_data):
    user_data = ratings_data[ratings_data['user_id'] == user_id]
    return user_data['fid'].tolist()

In [10]:
def predict_film_unwatch(user_id, ratings_data, knn,combined_features_matrix,all_film_id,film_id_to_index, n_recommendations = 20):
    recommendations = []
    user_watched_fids = all_watched_fid(user_id, ratings_data)
    user_watched_indices = [film_id_to_index[fid] for fid in user_watched_fids if fid in film_id_to_index]
    
    for idx in user_watched_indices:
        film_vector = combined_features_matrix[idx]

        distances, indices = knn.kneighbors(film_vector, n_neighbors=n_recommendations + 1)
        
        recommended_fids = [all_film_id[i] for i in indices.flatten()]
        
        recommendations.extend([rec for rec in recommended_fids if rec != all_film_id[idx]])
    unique_recommendations = list(set(recommendations) - set(user_watched_fids))
    
    return unique_recommendations[:n_recommendations]
    

In [11]:
recommendations = predict_film_unwatch(
    user_id=4120034,
    ratings_data=ratings_data,
    combined_features_matrix=combined_features_matrix,
    all_film_id=all_film_id,
    film_id_to_index=film_id_to_index,
    knn=knn,
    n_recommendations=10
)

In [12]:
print(f"Recommended films for user: {recommendations}")

for i in range(len(recommendations)):
    recommended_film_name = data[data['fid']==recommendations[i]]['name']
    print(recommended_film_name)

Recommended films for user: [1277953, 8503298, 98309, 262150, 1851397, 114694, 1212428, 2375692, 98320, 49169]
1421    Madagascar 3: Europe's Most Wanted
Name: name, dtype: object
4250    Striking Vipers
Name: name, dtype: object
8386    She-Devil
Name: name, dtype: object
6029    Black Books
Name: name, dtype: object
5649    Fire and Blood
Name: name, dtype: object
2815    Tommy Boy
Name: name, dtype: object
2611    The Lost City of Z
Name: name, dtype: object
4108    Black Sails
Name: name, dtype: object
7251    Shocker
Name: name, dtype: object
8612    Earth vs. the Flying Saucers
Name: name, dtype: object
