In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [41]:
# Load the dataset and keep only the columns this notebook works with
df = pd.read_parquet("../data/dataset.parquet")[
    ['title', 'popularity', 'vote_average', 'vote_count', 'genres']
]

Como el DataFrame es demasiado grande para construir este modelo (la matriz de similitud crece con el cuadrado del número de películas), lo reducimos conservando solo las películas con una popularidad mayor o igual a 5 y una cantidad de votos mayor o igual a 10.

In [42]:
# Inclusive lower bounds a movie must reach to stay in the working set
popularity_threshold = 5
vote_count_threshold = 10

# Keep only the rows that meet both thresholds, then renumber the index from 0
df = df[
    (df['popularity'] >= popularity_threshold)
    & (df['vote_count'] >= vote_count_threshold)
].reset_index(drop=True)

# Sistema de recomendación

In [43]:
# Genre preprocessing
def preprocess_genres(df, sep=' ', column='genres'):
    """One-hot encode the genre labels stored as a delimited string per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the genre strings.
    sep : str, default ' '
        Delimiter between labels inside each string. With the default,
        a multi-word label (e.g. "Science Fiction") is split into one
        column per word; pass the real delimiter if the data uses another.
    column : str, default 'genres'
        Name of the column that holds the genre strings.

    Returns
    -------
    pandas.DataFrame
        One indicator column per distinct label, one row per row of ``df``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    return df[column].str.get_dummies(sep=sep)

# One-hot encode the `genres` column of the working DataFrame
genres_encoded = preprocess_genres(df)

In [44]:

# Standardize `popularity` and `vote_average`: fit the scaler, then apply it
scaler = StandardScaler().fit(df[['popularity', 'vote_average']])
scaled_features = scaler.transform(df[['popularity', 'vote_average']])

In [45]:

# Double the genre indicator columns so that genres weigh more heavily
genres_weighted = genres_encoded.mul(2)

In [46]:
# Build the feature matrix: weighted genre indicators plus the scaled numeric columns.
# The scaled array carries no index of its own, so it is given the genre frame's
# index explicitly. concat(axis=1) aligns rows by label, and relying on a default
# 0..n-1 index on both sides would silently yield NaN rows if they ever differed.
features = pd.concat(
    [
        genres_weighted,
        pd.DataFrame(
            scaled_features,
            columns=['popularity', 'vote_average'],
            index=genres_weighted.index,
        ),
    ],
    axis=1,
)


In [47]:

# Pairwise cosine similarity between every pair of rows of the feature matrix
similarity_matrix = cosine_similarity(X=features)

In [48]:
def get_recommendations(title, similarity_matrix, titles, top_n=5):
    """Return the positions of the items most similar to ``title``.

    Parameters
    ----------
    title : object
        Value to look up in ``titles``; its first occurrence is used.
    similarity_matrix : sequence of sequences
        Row ``i`` holds the similarity scores of item ``i`` against every item.
    titles : list
        Titles in the same order as the rows of ``similarity_matrix``.
    top_n : int, default 5
        Maximum number of positions to return; values below 1 yield ``[]``.

    Returns
    -------
    list of int
        Positions ordered by decreasing similarity (ties keep ascending
        position order), never including the position of ``title`` itself.

    Raises
    ------
    ValueError
        If ``title`` is not present in ``titles``.
    """
    index = titles.index(title)
    # Exclude the query item by position instead of assuming it sorts first:
    # another item with an equal (or marginally higher) score and a lower
    # position would otherwise be dropped in its place, and the query item
    # would be recommended to itself.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_matrix[index])
        if position != index
    ]
    # list.sort is stable, so equal scores stay in ascending position order
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in candidates[:max(top_n, 0)]]

In [49]:
# Build the recommendations table: one row per movie followed by its five
# most similar titles.
# The title list is loop-invariant, so it is materialized once instead of on
# every iteration.
all_titles = df['title'].tolist()

# NOTE(review): the lookup is by title, so rows that share a title all receive
# the recommendations computed for its first occurrence.
recommendations = []
for title in all_titles:
    recommended_indices = get_recommendations(title, similarity_matrix, all_titles)
    recommended_titles = [all_titles[i] for i in recommended_indices]
    recommendations.append([title] + recommended_titles)

recommendations_df = pd.DataFrame(recommendations, columns=['title', 'rec1', 'rec2', 'rec3', 'rec4', 'rec5'])

In [50]:
# Show the recommendations table as this cell's output
recommendations_df

Unnamed: 0,title,rec1,rec2,rec3,rec4,rec5
0,Four Rooms,Police Academy,Project X,The Naked Gun 2½: The Smell of Fear,Horrible Bosses,Analyze This
1,Judgment Night,Point Blank,Nighthawks,Machine Gun Preacher,F/X,22 Bullets
2,Star Wars,Rogue One: A Star Wars Story,Guardians of the Galaxy,Avengers: Age of Ultron,The Hunger Games: Catching Fire,Pacific Rim
3,Finding Nemo,Despicable Me,Lilo & Stitch,The Little Mermaid,Coraline,WALL·E
4,Forrest Gump,Dilwale Dulhania Le Jayenge,Life Is Beautiful,The Big Sick,Me Before You,Annie Hall
...,...,...,...,...,...,...
9053,Tour de Pharmacy,7 Days in Hell,Adventures in Babysitting,The Colour of Magic,Red Nose Day Actually,StarStruck
9054,Stasis,Snowmageddon,War of the Worlds 2: The Next Wave,U.F.O.,The 5th Wave,2012: Ice Age
9055,Security,Maze Runner: The Scorch Trials,John Wick,Baby Driver,Deadpool,Pirates of the Caribbean: Dead Men Tell No Tales
9056,S.W.A.T.: Under Siege,Mechanic: Resurrection,Rage,Exit Wounds,Open Windows,London Has Fallen


In [51]:
# Write the recommendations table to a parquet file
recommendations_df.to_parquet(
    "../data/dataset_recomendaciones.parquet",
    engine='fastparquet',
    compression='snappy',
)