# Sistema de Recomendación (ML)

### Importar librerías

In [1]:
# Importar librerías
import pandas as pd

# Librería para evaluar str como objetos Python
from ast import literal_eval

### Lectura de Datos y construcción de dataframe para desarrollo del sistema

Se carga el dataset `movies_cleaned.csv` y se construye el DataFrame solo con las columnas a utilizar. Por razones de <span style = "color:red;">procesamiento y memoria RAM</span>, este DataFrame se constituirá con 10.000 filas, quedando como sigue:<br>
1. Filas: 10.000
2. Columnas: 5
De igual manera, para hacer más manejable la administración de los recursos, se eliminaron algunas columnas como `bbbb`

In [2]:
path = "datasets/movies_cleaned.csv"
n_row = 10000
columns = ['title', 'overview', 'genres', 'cast', 'director']
# Let the parser read only the needed columns and the first n_row rows instead of loading the
# whole file and trimming it afterwards. usecols alone keeps the file's column order, so the
# frame is re-selected with [columns] to keep the order used by the rest of the notebook.
data = pd.read_csv(path, usecols=columns, nrows=n_row)[columns]
data.head(50)

Unnamed: 0,title,overview,genres,cast,director
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","['animation', 'comedy', 'family']","['tom hanks', 'tim allen', 'don rickles', 'jim...",John Lasseter
1,Jumanji,When siblings Judy and Peter discover an encha...,"['adventure', 'fantasy', 'family']","['robin williams', 'jonathan hyde', 'kirsten d...",Joe Johnston
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"['romance', 'comedy']","['walter matthau', 'jack lemmon', 'ann-margret...",Howard Deutch
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","['comedy', 'drama', 'romance']","['whitney houston', 'angela bassett', 'loretta...",Forest Whitaker
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,['comedy'],"['steve martin', 'diane keaton', 'martin short...",Charles Shyer
5,Heat,"Obsessive master thief, Neil McCauley leads a ...","['action', 'crime', 'drama', 'thriller']","['al pacino', 'robert de niro', 'val kilmer', ...",Michael Mann
6,Sabrina,An ugly duckling having undergone a remarkable...,"['comedy', 'romance']","['harrison ford', 'julia ormond', 'greg kinnea...",Sydney Pollack
7,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...","['action', 'adventure', 'drama', 'family']","['jonathan taylor thomas', 'brad renfro', 'rac...",Peter Hewitt
8,Sudden Death,International action superstar Jean Claude Van...,"['action', 'adventure', 'thriller']","['jean-claude van damme', 'powers boothe', 'do...",Peter Hyams
9,GoldenEye,James Bond must unmask the mysterious head of ...,"['adventure', 'action', 'thriller']","['pierce brosnan', 'sean bean', 'izabella scor...",Martin Campbell


In [3]:
data.shape

(10000, 5)

### Sistema de Recomendación Basado en Metadata
El sistema de recomendación escogido para este proyecto es el basado en Metadata, en vista de que se tienen características disponibles que hacen viable su utilización

In [4]:
# Prepare the "overview", "genres" and "cast" columns, each according to its own format.
# The synopsis is free text: break it into a list of whitespace-separated words.
data['overview'] = data['overview'].apply(lambda text: text.split())

# Genres and cast are stored as the string form of a Python list; literal_eval parses them
# back into real lists so they can be processed.
for list_column in ('genres', 'cast'):
    data[list_column] = data[list_column].apply(literal_eval)

In [5]:
def sanitize(x):
    """Normalise a value by removing spaces and lower-casing it.

    Collapsing "Tom Hanks" into "tomhanks" makes each name a single token, which avoids
    ambiguity between people who share a first or last name.

    A string is returned normalised, a list is returned with every element normalised,
    and any other kind of value yields an empty string.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    return ''

In [6]:
# Normalise every metadata column with sanitize, one column at a time.
metadata_columns = ('cast', 'director', 'genres', 'overview')
for column in metadata_columns:
    data[column] = data[column].apply(sanitize)

In [7]:
# Verificar resultado con muestra aleatoria
data.sample(10)

Unnamed: 0,title,overview,genres,cast,director
4233,September,"[at, a, summer, house, in, vermont,, neighbor,...",[drama],"[miafarrow, diannewiest, samwaterston, elaines...",woodyallen
4789,Moscow on the Hudson,"[a, russian, circus, visits, the, us., a, clow...","[comedy, drama, romance]","[robinwilliams, maríaconchitaalonso, cleavantd...",paulmazursky
2276,The General,"[in, a, twenty-year, career, marked, by, obses...","[drama, action, crime]","[brendangleeson, adriandunbar, seanmcginley, m...",johnboorman
1257,Amityville: A New Generation,"[when, young, photographer, keyes, (ross, part...",[horror],"[rosspartridge, robertharvey, lalasloatman, da...",johnmurlowski
9711,Twentieth Century,"[oscar, jaffe, is, a, successful, broadway, di...","[comedy, romance]","[johnbarrymore, carolelombard, walterconnolly,...",howardhawks
3540,Mad Max 2: The Road Warrior,"[max, rockatansky, returns, as, the, heroic, l...","[adventure, action, thriller, sciencefiction]","[melgibson, brucespence, michaelpreston, maxph...",georgemiller
4157,Suspect,"[when, a, supreme, court, judge, commits, suic...","[crime, drama, thriller]","[cher, dennisquaid, liamneeson, johnmahoney, j...",peteryates
5301,The Merchant of Four Seasons,"[hans, is, a, street, fruit, peddler, and, bor...",[drama],"[hanshirschmüller, irmhermann, hannaschygulla,...",rainerwernerfassbinder
5439,Between Strangers,"[three, women, confront, their, pasts, which, ...",[drama],"[sophialoren, mirasorvino, deborahkaraunger, p...",edoardoponti
4948,How to Kill Your Neighbor's Dog,"[the, story, of, peter, mcgowan,, a, chain-smo...","[comedy, drama]","[kennethbranagh, robinwright, suzihofrichter, ...",michaelkalesniko


In [8]:
def create_soup(x):
    """Build the "word soup" for one film out of its metadata.

    The overview words, the cast names, the director and the genres are joined, in that
    order, into one space-separated string. ``x`` is a row (or any mapping) exposing the
    keys ``overview``, ``cast``, ``director`` and ``genres``.
    """
    parts = [
        ' '.join(x['overview']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

In [9]:
# Se crea una nueva columna "soup" al dataframe aplicando la función "create_soup"
data['soup'] = data.apply(create_soup, axis=1)

In [10]:
# Para verificar resultado se muestra la primera fila de la columna "soup"
data.iloc[0]['soup']

"led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences. tomhanks timallen donrickles jimvarney wallaceshawn johnratzenberger anniepotts johnmorris erikvondetten lauriemetcalf r.leeermey sarahfreeman pennjillette johnlasseter animation comedy family"

### Inicio de Modelo 

In [11]:
# Importar "CountVectorizer" de sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer that drops English stop words, then fit it on the "soup" column
# and turn every soup into a row of count_matrix.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['soup'])

In [12]:
# Importar función "cosine_similarity" de sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Pairwise cosine similarity between every pair of rows of count_matrix.
# NOTE: count_matrix holds raw CountVectorizer term counts, not tf-idf vectors, so this is a
# normalised cosine similarity and is not interchangeable with a plain dot product.
similarity = cosine_similarity(count_matrix)

In [15]:
# Reset the dataframe index and build the reverse mapping from film title to row position.
# NOTE(review): reset_index() without drop=True keeps the previous index as a new "index"
# column, and a title that appears more than once maps to several positions in indices2.
data = data.reset_index()
indices2 = pd.Series(data.index, index=data['title'])

### Función de recomendación
Función que toma el **título de la película** como entrada y retorna como recomendación las **5 películas más similares**

In [45]:
def content_recommender(title, cosine_sim=similarity, df=data, indices=indices2, top_n=5):
    '''
    Return the titles of the films most similar to the given one.

    Steps:
        1. Look up the row position of the film whose title matches.
        2. Pair every film position with its similarity score against that film.
        3. Drop the queried film itself and sort the rest by descending score.
        4. Keep the first ``top_n`` positions and return their titles.

    Parameters
    ----------
    title : str
        Title of the film to find recommendations for (required).
    cosine_sim : ndarray
        Square matrix of pairwise cosine similarity scores.
    df : pandas.DataFrame
        Data source; its ``title`` column is read by position.
    indices : pandas.Series
        Maps a title to its row position in ``df`` / ``cosine_sim``.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series or str
        The recommended titles, or a "not found" message when ``title`` cannot be
        looked up in ``indices``.
    '''
    try:
        idx = indices[title]
    except (KeyError, TypeError):
        # Only a failed lookup means "unknown film"; any other error must surface
        # instead of being reported as a missing title.
        return f'La película {title} no se encuentra en la Base de Datos'

    # A title shared by several films makes the lookup return a Series of positions;
    # use the first match so the score row below is one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried film by position rather than assuming it sorts first:
    # another film with an identical soup ties at the top score.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    movie_indices = [pos for pos, _ in ranked[:top_n]]

    return df['title'].iloc[movie_indices]

In [46]:
content_recommender('Happy Gilmore')  # cosine_sim2, df, indices2

813                Tin Cup
2541             Big Daddy
4449      Miss Firecracker
5819     National Security
5685    Eight Crazy Nights
Name: title, dtype: object

In [24]:
data['title'].tolist()

['Toy Story',
 'Jumanji',
 'Grumpier Old Men',
 'Waiting to Exhale',
 'Father of the Bride Part II',
 'Heat',
 'Sabrina',
 'Tom and Huck',
 'Sudden Death',
 'GoldenEye',
 'The American President',
 'Dracula: Dead and Loving It',
 'Balto',
 'Nixon',
 'Cutthroat Island',
 'Casino',
 'Sense and Sensibility',
 'Four Rooms',
 'Ace Ventura: When Nature Calls',
 'Money Train',
 'Get Shorty',
 'Copycat',
 'Assassins',
 'Powder',
 'Leaving Las Vegas',
 'Othello',
 'Now and Then',
 'Persuasion',
 'The City of Lost Children',
 'Shanghai Triad',
 'Dangerous Minds',
 'Twelve Monkeys',
 'Babe',
 'Carrington',
 'Dead Man Walking',
 'Across the Sea of Time',
 'It Takes Two',
 'Clueless',
 'Cry, the Beloved Country',
 'Richard III',
 'Dead Presidents',
 'Restoration',
 'Mortal Kombat',
 'To Die For',
 'How To Make An American Quilt',
 'Se7en',
 'Pocahontas',
 'When Night Is Falling',
 'The Usual Suspects',
 'Guardian Angel',
 'Mighty Aphrodite',
 'Lamerica',
 'The Big Green',
 'Georgia',
 'Kids of the 

In [31]:
import pickle

In [33]:
#pickle.dump(data,open('datasets/movie_list.pkl','wb'))
#pickle.dump(similarity,open('datasets/similarity.pkl','wb'))

In [36]:
type(indices2)

pandas.core.series.Series

In [5]:
# Silence pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None  # default='warn'
# Split each synopsis into a list of words.
# NOTE(review): this needs data['overview'] to still hold plain strings. The earlier cells of
# this notebook already turned that column into lists, so this section presumably ran against
# a freshly loaded dataframe — confirm before running the notebook top to bottom.
data['overview'] = data['overview'].apply(lambda x:x.split())

In [6]:
data['director'] = data['director'].apply(lambda x:[x])

In [7]:
data.sample(5)

Unnamed: 0,id,title,overview,genres,cast,director
7653,24190,Time of the Wolf,"[When, Anna, and, her, family, arrive, at, the...",['drama'],"['isabelle huppert', 'patrice chéreau', 'brigi...",[Michael Haneke]
5346,206042,Children on Their Birthdays,"[Havoc, is, created, in, a, small, Southern, c...","['comedy', 'family', 'romance']","['sheryl lee', 'christopher mcdonald', 'tom ar...",[Mark Medoff]
4903,29475,The Five Heartbeats,"[In, the, early, 1960's,, a, quintet, of, hope...","['drama', 'music']","['robert townsend', 'michael wright', 'leon ro...",[Robert Townsend]
26124,86193,The Incredible 2-Headed Transplant,"[Dr., Roger, Girard, is, a, rich, scientist, c...","['horror', 'science fiction']","['bruce dern', 'pat priest', 'casey kasem', 'a...",[Anthony M. Lanza]
4950,270306,Maryam,"[An, Iranian-born, teenager, living, in, subur...",['drama'],"['mariam parris', 'david ackert', 'shaun toub'...",[Ramin Serry]


In [8]:
from ast import literal_eval
# Build one token list per film by concatenating, row by row, the overview words, the parsed
# genres list, the parsed cast list and the director column.
# NOTE(review): assumes 'genres' and 'cast' still hold the string form of a list here, since
# literal_eval is applied to them on this line — confirm against how data was loaded.
data['tags'] = data['overview'] + data['genres'].apply(literal_eval) + data['cast'].apply(literal_eval) + data['director']

In [9]:
new = data.drop(columns=['overview','genres','cast','director'])

In [10]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1000,stop_words='english')

In [12]:
vector = cv.fit_transform(new['tags']).toarray()

In [13]:
vector.shape

(30000, 1000)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vector)

In [None]:
def recommend(movie):
    """Print the titles of the five films most similar to ``movie``.

    Reads the module-level ``new`` dataframe and ``similarity`` matrix; row ``i`` of
    ``similarity`` is taken to describe row ``i`` of ``new``. When several films share
    the title, the first one is used. If no film has that title, a "not found" message
    is printed and nothing else happens.

    Parameters
    ----------
    movie : str
        Title of the film to find recommendations for.
    """
    # Work with row positions, not index labels: similarity is addressed by position,
    # so a label would point at the wrong row if new did not have a default index.
    matches = (new['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f'La película {movie} no se encuentra en la Base de Datos')
        return
    position = matches[0]

    # Exclude the queried film by position rather than assuming it sorts first:
    # another film with identical tags ties at the top score.
    distances = sorted(
        (pair for pair in enumerate(similarity[position]) if pair[0] != position),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for pos, _ in distances[:5]:
        print(new['title'].iloc[pos])

In [None]:
recommend('Gandhi')

In [None]:
new['title'].tolist()