# Sistema de Recomendación (ML)

### Importar librerías

In [1]:
# Importar librerías
import pandas as pd

# LIbreria para evaluar str como objetos Python
from ast import literal_eval

### Lectura de Datos y construcción de dataframe para desarrollo del sistema

Se carga el dataset `movies_cleaned.csv` y se construye el DataFrame solo con las columnas a utilizar. Por razones de <span style="color:red;">procesamiento y memoria RAM</span>, este DataFrame se constituirá con 18.000 filas, quedando como sigue:<br>
1. Filas: 18.000
2. Columnas: 6<br>
De igual manera, para hacer más manejable la administración de los recursos se eliminaron algunas columnas, utilizando las que a nuestro juicio podrían aportar más valor al análisis, tales como `title`, `belongs_to_collection`, `overview`, `genres`, `cast`, `director`.

In [2]:
path = "datasets/movies_cleaned.csv"
n_row = 18000
# Columns kept for the metadata-based recommender.
columns = ['title', 'belongs_to_collection', 'overview', 'genres', 'cast', 'director']
# Let the CSV parser skip the unused columns and stop after n_row rows instead
# of loading the whole file and slicing afterwards (lower peak memory).
# usecols returns columns in file order, so re-select with [columns] to keep
# the same column order as before.
data = pd.read_csv(path, usecols=columns, nrows=n_row)[columns]
data.head()

Unnamed: 0,title,belongs_to_collection,overview,genres,cast,director
0,Toy Story,toy story collection,"Led by Woody, Andy's toys live happily in his ...","['animation', 'comedy', 'family']","['tom hanks', 'tim allen', 'don rickles', 'jim...",John Lasseter
1,Jumanji,no collection,When siblings Judy and Peter discover an encha...,"['adventure', 'fantasy', 'family']","['robin williams', 'jonathan hyde', 'kirsten d...",Joe Johnston
2,Grumpier Old Men,grumpy old men collection,A family wedding reignites the ancient feud be...,"['romance', 'comedy']","['walter matthau', 'jack lemmon', 'ann-margret...",Howard Deutch
3,Waiting to Exhale,no collection,"Cheated on, mistreated and stepped on, the wom...","['comedy', 'drama', 'romance']","['whitney houston', 'angela bassett', 'loretta...",Forest Whitaker
4,Father of the Bride Part II,father of the bride collection,Just when George Banks has recovered from his ...,['comedy'],"['steve martin', 'diane keaton', 'martin short...",Charles Shyer


In [3]:
# Check the dimensions (rows, columns) of the loaded DataFrame.
data.shape

(18000, 6)

### Sistema de Recomendación Basado en Metadata
El sistema de recomendación escogido para este proyecto es el basado en Metadata, en vista de que se tienen características disponibles que hacen viable su utilización.

In [4]:
# Adapt the "overview", "belongs_to_collection", "genres" and "cast" columns.
# The isinstance guards leave missing values (NaN) and already-converted lists
# untouched, so the cell does not crash on gaps in the data or when re-run.

# Split the overview text on whitespace to get a list of tokens.
data['overview'] = data['overview'].apply(lambda x: x.split() if isinstance(x, str) else x)

# Blank out the 'no collection' marker so it does not add a token to the soup.
data['belongs_to_collection'] = data['belongs_to_collection'].replace('no collection', '')

# genres and cast are read from the CSV as strings that look like Python lists;
# literal_eval turns them into real lists.
for column in ['genres', 'cast']:
    data[column] = data[column].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)

In [5]:
# Normalize metadata values: strip spaces and lowercase to avoid ambiguity
# (e.g. "Tom Hanks" -> "tomhanks").
def sanitize(x):
    """Return x without spaces and in lowercase.

    A string is normalized directly; a list has each of its elements
    normalized; any other value yields an empty string.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    return ''

In [6]:
# Run sanitize over every metadata column that feeds the soup:
# cast, director, genres, overview and belongs_to_collection.
sanitized_columns = ('cast', 'director', 'genres', 'overview', 'belongs_to_collection')
for feature in sanitized_columns:
    data[feature] = data[feature].map(sanitize)

In [7]:
# Check the sanitized columns on the first 10 rows (head(), not a random sample).
data.head(10)

Unnamed: 0,title,belongs_to_collection,overview,genres,cast,director
0,Toy Story,toystorycollection,"[led, by, woody,, andy's, toys, live, happily,...","[animation, comedy, family]","[tomhanks, timallen, donrickles, jimvarney, wa...",johnlasseter
1,Jumanji,,"[when, siblings, judy, and, peter, discover, a...","[adventure, fantasy, family]","[robinwilliams, jonathanhyde, kirstendunst, br...",joejohnston
2,Grumpier Old Men,grumpyoldmencollection,"[a, family, wedding, reignites, the, ancient, ...","[romance, comedy]","[waltermatthau, jacklemmon, ann-margret, sophi...",howarddeutch
3,Waiting to Exhale,,"[cheated, on,, mistreated, and, stepped, on,, ...","[comedy, drama, romance]","[whitneyhouston, angelabassett, lorettadevine,...",forestwhitaker
4,Father of the Bride Part II,fatherofthebridecollection,"[just, when, george, banks, has, recovered, fr...",[comedy],"[stevemartin, dianekeaton, martinshort, kimber...",charlesshyer
5,Heat,,"[obsessive, master, thief,, neil, mccauley, le...","[action, crime, drama, thriller]","[alpacino, robertdeniro, valkilmer, jonvoight,...",michaelmann
6,Sabrina,,"[an, ugly, duckling, having, undergone, a, rem...","[comedy, romance]","[harrisonford, juliaormond, gregkinnear, angie...",sydneypollack
7,Tom and Huck,,"[a, mischievous, young, boy,, tom, sawyer,, wi...","[action, adventure, drama, family]","[jonathantaylorthomas, bradrenfro, rachaelleig...",peterhewitt
8,Sudden Death,,"[international, action, superstar, jean, claud...","[action, adventure, thriller]","[jean-claudevandamme, powersboothe, dorianhare...",peterhyams
9,GoldenEye,jamesbondcollection,"[james, bond, must, unmask, the, mysterious, h...","[adventure, action, thriller]","[piercebrosnan, seanbean, izabellascorupco, fa...",martincampbell


In [8]:
# Build the "word soup" for one row out of its metadata fields.
def create_soup(x):
    """Concatenate a row's metadata into one space-separated string.

    The order is: overview tokens, cast, director, genres and
    belongs_to_collection. List-like fields are joined with single spaces.
    """
    parts = [
        ' '.join(x['overview']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
        x['belongs_to_collection'],
    ]
    return ' '.join(parts)

In [9]:
# Add the "soup" column by running create_soup on each row of the DataFrame.
data = data.assign(soup=data.apply(create_soup, axis="columns"))

In [10]:
# Check the result by showing the "soup" value of the row at position 17800.
data.iloc[17800]['soup']

"set against the backdrop of the succession of queen elizabeth i, and the essex rebellion against her, the story advances the theory that it was in fact edward de vere, earl of oxford who penned shakespeare's plays. jamiecampbellbower rhysifans davidthewlis joelyrichardson vanessaredgrave sebastianarmesto rafespall edwardhogg xaviersamuel samreid paolodevita trystangravelle robertemms tonyway julianbleach derekjacobi alexhassell jamesgarnon markrylance jasperbritton neddennehy johnkeogh lloydhutchinson vickykrieps helenbaxendale paulaschramm amykwolek lukethomastaylor isaiahmichalski timohuber richarddurden shaunlawton detlefbothe jamesclyde christiansengewald jean-loupfourure victoriagabrysch axelsichrovsky katrinpollitt patriciagrove lauralozito godebenedix nicromm henrylloyd-hughes patrickdiemling patrickheyn ninosandow craigsalisbury rainerguldener trystanpütter andrékaczmarczyk jonashämmerle mikemaas christianleonard christianbanzhaf victoriacalero martinengler alfredhartung olive

In [11]:
# Inspect 10 randomly chosen rows; no random_state is passed, so the
# selection is not fixed by this cell.
data.sample(10)

Unnamed: 0,title,belongs_to_collection,overview,genres,cast,director,soup
14336,True Heart Susie,,"[true, heart, susie, (gish), secretly, loves, ...","[comedy, romance, drama]","[lilliangish, robertharron, wilburhigby, loyol...",d.w.griffith,true heart susie (gish) secretly loves her nei...
14683,Letters to Father Jacob,,"[with, few, options,, newly, pardoned, convict...","[drama, foreign]","[kaarinahazard, jukkakeinonen, heikkinousiainen]",klaushärö,"with few options, newly pardoned convict leila..."
10641,Over the Edge,,"[the, music, of, cheap, trick,, the, cars,, an...","[crime, drama]","[mattdillon, vincentspano, andyromano, ellenge...",jonathankaplan,"the music of cheap trick, the cars, and the ra..."
5461,Django,django-originalfilms,"[django, is, a, 1966, italian, spaghetti, west...","[action, western]","[franconero, josébódalo, loredananusciak, ánge...",sergiocorbucci,django is a 1966 italian spaghetti western fil...
11711,This Is England,thisisenglandcollection,"[a, story, about, a, troubled, boy, growing, u...","[drama, crime]","[thomasturgoose, stephengraham, johartley, and...",shanemeadows,a story about a troubled boy growing up in eng...
2557,The Blair Witch Project,blairwitchcollection,"[in, october, of, 1994, three, student, filmma...","[horror, mystery]","[michaelc.williams, heatherdonahue, joshualeon...",danielmyrick,in october of 1994 three student filmmakers di...
14590,Running Mates,,"[a, story, of, longtime, love, finally, coming...","[comedy, romance]","[edharris, dianekeaton, edbegleyjr., russtambl...",michaellindsay-hogg,a story of longtime love finally coming to fru...
7255,Cat-Women of the Moon,,"[astronauts, travel, to, the, moon, where, the...",[sciencefiction],"[sonnytufts, victorjory, mariewindsor, carolbr...",arthurhilton,astronauts travel to the moon where they disco...
3792,Ghoulies,ghouliescollection,"[a, young, man, and, his, girlfriend, move, in...","[comedy, horror, fantasy]","[peterliapis, lisapelikan, michaeldesbarres, j...",lucabercovici,a young man and his girlfriend move into the m...
759,A Hungarian Fairy Tale,,"[shot, in, b&amp;w,, gyula, gazdag's, film, fo...",[],[],gazdaggyula,"shot in b&amp;w, gyula gazdag's film follows t..."


In [12]:
# Shrink the DataFrame to the only columns (features) the algorithm needs.
data = data.loc[:, ['title', 'soup']]
data

Unnamed: 0,title,soup
0,Toy Story,"led by woody, andy's toys live happily in his ..."
1,Jumanji,when siblings judy and peter discover an encha...
2,Grumpier Old Men,a family wedding reignites the ancient feud be...
3,Waiting to Exhale,"cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,just when george banks has recovered from his ...
...,...,...
17995,The Hunger Games,every year in the ruins of what was once north...
17996,Blind Justice,"a simple-minded circus strongman, john sikes, ..."
17997,Womb,a woman's consuming love forces her to bear th...
17998,Fire of Conscience,fire of conscience is an 2010 hong kong action...


### Inicio de Modelo 

In [13]:
# Importar "CountVectorizer" de sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer configured with the English stop-word list and
# fit/transform the "soup" column into the matrix used for similarity scoring.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['soup'])

In [14]:
# Quick look at the first 3 rows of the reduced DataFrame.
data.head(3)

Unnamed: 0,title,soup
0,Toy Story,"led by woody, andy's toys live happily in his ..."
1,Jumanji,when siblings judy and peter discover an encha...
2,Grumpier Old Men,a family wedding reignites the ancient feud be...


In [15]:
# Importar función "cosine_similarity" de sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Compute the cosine similarity scores from count_matrix.
# NOTE(review): the original comment mentioned tf-idf vectors, but
# count_matrix is built with CountVectorizer, so that remark does not apply.
similarity = cosine_similarity(count_matrix)

In [18]:
# Reset the DataFrame index and build the reverse mapping title -> row position.
# reset_index() is called without drop=True, so the previous index is kept as
# an extra "index" column in data.
# NOTE(review): if a title appears more than once, indices[title] returns
# several positions instead of one - confirm titles are unique.
data = data.reset_index()
indices = pd.Series(data.index, index=data['title'])

### Función de recomendación
Función que toma el **título de la película** como entrada y retorna como recomendación las **5 películas más similares**.

In [19]:
def content_recommender(title, cosine_sim=similarity, df=data, indices=indices, top_n=5):
    '''
    Return the titles of the movies most similar to the given movie.

    Steps:
        1. Look up the row position of the movie matching the title.
        2. Take that movie's similarity scores against every movie.
        3. Sort the movies by score, highest first.
        4. Drop the queried movie itself and keep the first top_n entries.
        5. Return the titles at those positions.

    Parameters
    ----------
    title : str
        Title of the movie to look up (required).
    cosine_sim : ndarray
        Pairwise similarity scores, indexed by row position.
    df : pandas.DataFrame
        Data source; must contain a 'title' column.
    indices : pandas.Series
        Mapping from title to row position.
    top_n : int
        Number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        Titles of the top_n most similar movies, or a str message when the
        title is not present in indices.
    '''
    not_found = f'La película {title} no se encuentra en la Base de Datos'

    # Only a failed lookup means "not found"; any other error should surface
    # instead of being reported as a missing title.
    try:
        idx = indices[title]
    except (KeyError, TypeError):
        return not_found

    # A title that occurs more than once yields a Series of positions;
    # use the first occurrence so the lookup still resolves to one movie.
    if isinstance(idx, pd.Series):
        if idx.empty:
            return not_found
        idx = idx.iloc[0]
    idx = int(idx)

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried movie by position rather than assuming it sorts
    # first: another movie with an equal score could precede it.
    movie_indices = [position for position, _ in ranked if position != idx][:top_n]

    return df['title'].iloc[movie_indices]

In [26]:
content_recommender('City By The Sea')  # relies on the default cosine_sim, df and indices arguments

10416     South Central
14817          New York
5105           Rashomon
17447    Shadows & Lies
2840           The City
Name: title, dtype: object

In [40]:
# List every title in the DataFrame as a plain Python list (one entry per row).
data['title'].tolist()

['Toy Story',
 'Jumanji',
 'Grumpier Old Men',
 'Waiting to Exhale',
 'Father of the Bride Part II',
 'Heat',
 'Sabrina',
 'Tom and Huck',
 'Sudden Death',
 'GoldenEye',
 'The American President',
 'Dracula: Dead and Loving It',
 'Balto',
 'Nixon',
 'Cutthroat Island',
 'Casino',
 'Sense and Sensibility',
 'Four Rooms',
 'Ace Ventura: When Nature Calls',
 'Money Train',
 'Get Shorty',
 'Copycat',
 'Assassins',
 'Powder',
 'Leaving Las Vegas',
 'Othello',
 'Now and Then',
 'Persuasion',
 'The City of Lost Children',
 'Shanghai Triad',
 'Dangerous Minds',
 'Twelve Monkeys',
 'Babe',
 'Carrington',
 'Dead Man Walking',
 'Across the Sea of Time',
 'It Takes Two',
 'Clueless',
 'Cry, the Beloved Country',
 'Richard III',
 'Dead Presidents',
 'Restoration',
 'Mortal Kombat',
 'To Die For',
 'How To Make An American Quilt',
 'Se7en',
 'Pocahontas',
 'When Night Is Falling',
 'The Usual Suspects',
 'Guardian Angel',
 'Mighty Aphrodite',
 'Lamerica',
 'The Big Green',
 'Georgia',
 'Kids of the 

In [None]:
# Write the DataFrame to a CSV file for use in production.
# index=False leaves the row labels out of the file; the text is UTF-8 encoded.
data.to_csv('datasets/movies_ml_18.csv', index=False, encoding='utf-8')

### Crear archivos .pkl para almacenar datos serializados.

Crear archivos `movie_list_15.pkl`, `similarity_15.pkl`, `indices.pkl_15` para que esten disponibles en **producción**.

In [59]:
# Importar módulo pickle
import pickle

In [60]:

# pickle.dump(data,open('datasets/movie_list_15.pkl','wb'))
# pickle.dump(similarity,open('datasets/similarity_15.pkl','wb'))
# pickle.dump(indices,open('datasets/indices.pkl_15','wb'))

## <span style = "color:green">Fin de ML</span>