In [1]:
# carga de librerías
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [2]:
# carga del dataset procesado
new_df = pd.read_parquet("datasets/movies_model.parquet")

In [3]:
# visualización rápida
new_df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,..."
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a..."
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ..."
5,949,Heat,"[Obsessive, master, thief,, Neil, McCauley, le..."
6,11860,Sabrina,"[An, ugly, duckling, having, undergone, a, rem..."


In [4]:
# arreglo de los índices
new_df = new_df.reset_index(drop=True)
new_df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,..."
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a..."
2,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ..."
3,949,Heat,"[Obsessive, master, thief,, Neil, McCauley, le..."
4,11860,Sabrina,"[An, ugly, duckling, having, undergone, a, rem..."


In [5]:
# visualización columna tags
new_df["tags"][0]

array(['Led', 'by', 'Woody,', "Andy's", 'toys', 'live', 'happily', 'in',
       'his', 'room', 'until', "Andy's", 'birthday', 'brings', 'Buzz',
       'Lightyear', 'onto', 'the', 'scene.', 'Afraid', 'of', 'losing',
       'his', 'place', 'in', "Andy's", 'heart,', 'Woody', 'plots',
       'against', 'Buzz.', 'But', 'when', 'circumstances', 'separate',
       'Buzz', 'and', 'Woody', 'from', 'their', 'owner,', 'the', 'duo',
       'eventually', 'learns', 'to', 'put', 'aside', 'their',
       'differences.', 'Animation', 'Comedy', 'Family', 'TomHanks',
       'TimAllen', 'DonRickles', 'JohnLasseter'], dtype=object)

In [6]:
# unir en cadenas separadas por espacios
new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x))

In [7]:
# visualizar resultado
new_df["tags"][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Animation Comedy Family TomHanks TimAllen DonRickles JohnLasseter"

In [8]:
# pasar todo a minúsculas
new_df["tags"] = new_df["tags"].apply(lambda x: x.lower()) # sale un warning

In [9]:
# visualizar resultado
new_df["tags"][0]

"led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences. animation comedy family tomhanks timallen donrickles johnlasseter"

Creamos un objeto CountVectorizer con las siguientes configuraciones:  
* max_features=5000: número máximo de características
* stop_words="english": palabras que se deben eliminar del texto.significativa.  

Que utilizaremos para convertir un conjunto de texto en un conjunto de vectores. Cada vector representa la  frecuencia de cada característica en el texto.

In [7]:
cv = CountVectorizer(max_features=5000, stop_words="english")

**Devolvemos el conjunto de vectores como una matriz.**

In [9]:
vectors = cv.fit_transform(new_df["tags"]).toarray()

**Creamos un objeto PorterStemmer que utilizaremos para reducir las palabras a su raíz.**

In [10]:
ps = PorterStemmer()

**Creamos una función que realizará estos pasos:**  
1. Divide el texto en palabras individuales.
2. Aplica el algoritmo de stemming de Porter a cada palabra individual.
3. Une las palabras individuales en una cadena de texto.

In [11]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

**Aplicamos la función a la columna tags**

In [12]:
new_df["tags"] = new_df["tags"].apply(stem)

**Calculamos la similitud coseno entre los vectores en la matriz** 

In [13]:
similarity = cosine_similarity(vectors)

**Creamos la función recomendadora**

In [16]:
# función recomendadora
def recommend(movie):
    movie_index = new_df[new_df["title"] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    pel = []
    for i in movie_list:
        list_pel = {new_df.iloc[i[0]].title}
        pel.append(list_pel)
        # print(i[0])
    return pel

In [17]:
# prueba de la función
recommend("Lethal Weapon")

['Lethal Weapon 2',
 'Lethal Weapon 4',
 'Vares - The Kiss of Evil',
 "The Hunter's Prayer",
 'Shaft in Africa']