In [17]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer, util

In [18]:
df = pd.read_csv('nico_movie_recommender.csv', index_col = 'Unnamed: 0')
df

Unnamed: 0,title,genre,duration,text
0,The Shawshank Redemption,Drama,142,Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",175,The aging patriarch of an organized crime dyna...
2,The Dark Knight,"Action, Crime, Drama",152,When the menace known as the Joker wreaks havo...
3,The Lord of the Rings: The Return of the King,"Action, Adventure, Drama",201,Gandalf and Aragorn lead the World of Men agai...
4,Schindler's List,"Biography, Drama, History",195,"In German-occupied Poland during World War II,..."
...,...,...,...,...
7145,Monster a Go-Go,"Horror, Sci-F",68,"A space capsule crash-lands on Earth, and the ..."
7146,Pledge This!,Comedy,91,"At South Beach University, a beautiful sororit..."
7147,Manos: The Hands of Fate,Horror,70,A family gets lost on the road and stumbles up...
7148,Superbabies: Baby Geniuses 2,"Comedy, Family, Sci-F",88,A group of smart-talking toddlers find themsel...


In [19]:
def get_data(file_path):
    df = pd.read_csv(file_path, index_col = 'Unnamed: 0')
    return df

In [20]:
# Here we embed the descriptions of the movies --> shape (7150, 384)
# No matter the len of the texts it will be embeded as 384

def embed_sentences(df):
    sentences = np.array(df['text'])
    
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embedded_sentences = model.encode(sentences, convert_to_numpy = True)
    print(len(embedded_sentences))
    return embedded_sentences

In [21]:
# Here we create a list of titles so that we can zip it with the sentences maintaing the movies ID with the description

def get_embbedings_dict(df, embedded_sentences):
    
    titles = df['title'].to_list()
    
    title_embedding_dict = {}
    
    for key, value in zip(titles, embedded_sentences):
        title_embedding_dict[key] = value
    return title_embedding_dict

In [22]:
# Here we input the new movies text, df, and our embedded sentences. We embed the new movies text. Fine cosine simularity
# between new_move/new_text embedded to a list of our df embedded texts

def get_3_most_similar_movies(new_movie_text, df, embedded_sentences):
    title_embedded_dict = get_embbedings_dict(df, embedded_sentences)
    
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    new_movie_text_embedded = model.encode(new_movie_text, convert_to_numpy=True)
    
    
    arg_cos_sim = np.argsort(util.pytorch_cos_sim(new_movie_text_embedded, list(title_embedded_dict.values())))
    
    # This is to aviod outputing the Users movie of choice
    A = list(title_embedding_dict.values())[arg_cos_sim[0][-1]].round(4)
    B = new_movie_text_embedded.round(4)
    
    if np.array_equal(A, B) == True:
        first_arg_most_similar = arg_cos_sim[0][-2]
        second_arg_most_similar = arg_cos_sim[0][-3]
        third_arg_most_similar = arg_cos_sim[0][-4]
    else:
        first_arg_most_similar = arg_cos_sim[0][-1]
        second_arg_most_similar = arg_cos_sim[0][-2]
        third_arg_most_similar = arg_cos_sim[0][-3]
        
    #A list of keys indexed with the arg_cos_sim tensor position
    first_movie_most_sim = list(title_embedding_dict.keys())[first_arg_most_similar] 
    second_movie_most_sim = list(title_embedding_dict.keys())[second_arg_most_similar]
    third_movie_most_sim = list(title_embedding_dict.keys())[third_arg_most_similar]
    
    return first_movie_most_sim, second_movie_most_sim, third_movie_most_sim

In [23]:
embedded_sentences = embed_sentences(df)

7150


In [24]:
embedded_sentences

array([[-0.06326336,  0.04146251, -0.04707528, ..., -0.03055873,
        -0.01641737, -0.01581548],
       [ 0.02928159, -0.02202932, -0.04759281, ..., -0.01140113,
         0.02967963, -0.08190241],
       [ 0.01888789,  0.05393051, -0.07068749, ...,  0.07443296,
         0.05758369, -0.01486043],
       ...,
       [-0.05408384,  0.03755486, -0.06260994, ..., -0.02542926,
        -0.01030796, -0.00706608],
       [-0.07651567, -0.02488332, -0.01685206, ...,  0.13467704,
         0.0121746 ,  0.01384022],
       [-0.00628567,  0.05565746,  0.04748098, ...,  0.01046593,
         0.02671084, -0.01874348]], dtype=float32)

In [25]:
title_embedding_dict = get_embbedings_dict(df, embedded_sentences) # The length of this goes to 7150 --> 6932 (we are some how losing 228 movies with this)

In [33]:
len(title_embedding_dict)

6932

In [11]:
len(embedded_sentences)
len(df['title'].to_list())

7150

In [12]:
df['text'][60]

'An insane American general orders a bombing attack on the Soviet Union, triggering a path to nuclear holocaust that a war room full of politicians and generals frantically tries to stop.'

In [40]:
get_3_most_similar_movies("An insane American general orders a bombing attack on the Soviet Union, triggering a path to nuclear holocaust that a war room full of politicians and generals frantically tries to stop.", df, embedded_sentences)

384
6932


('Catch-22', 'Unthinkable', 'Seven Days in May')

- Webscrape the movies and the description
- Webscrape the Users movie and description from input on streamlit
- Pass it through our pipeline, sentence_transformer
    - This will clean ---> vectorize ----> encode each Movie description.
    - Will call User_scrapers input from stremlit
- streamlit framework pushed to herouku to be live

In [40]:
title_embedding_dict = {}
titles = df['title'].to_list()

for key, value in zip(titles, embedded_sentences):
    title_embedding_dict[key] = value

In [41]:
len(title_embedding_dict)

6932

In [46]:
print(len(embedded_sentences))

len(list(zip(titles, embedded_sentences)))

7150


7150