In [3]:
#from django.db import models
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

In [4]:
class Strategries():

    def bert():
        ...

In [5]:
PLOTS_DATA = os.path.join("backend-django", "data", "movies.csv")
# overviews
plots = pd.read_csv(PLOTS_DATA).dropna()
print(plots.head())

   movieID                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                            overview  
0  Led by Woody, Andy's toys live happily in his ...  
1  When siblings Judy and Peter discover an encha...  
2  A family wedding reignites the ancient feud be...  
3  Cheated on, mistreated and stepped on, the wom...  
4  Just when George Banks has recovered from his ...  


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\legion\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
#!python -m spacy download en_core_web_sm

import nltk
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

In [39]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import warnings
warnings.filterwarnings('ignore')
from typing import List
import torch

class BERT_strategy:
    ''' Movie recommendation strategy based on BERT model - High-performance semantic similarity
    Works by finding similarities between movies' overviews:
        (1) creating tokens out of each overviews
        (2) sending tokanized overviews though BERT model
        (3) choice of recommendation according to cosine similarity score between model outputs
    '''
    def __init__(self, movies: pd.DataFrame, cache_dir: str = './data/') -> None:
        # ensure cache dir exists
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

        self.model = DistilBertModel.from_pretrained("distilbert-base-uncased", cache_dir=self.cache_dir)
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', cache_dir=self.cache_dir)
        if self.tokenizer.pad_token is None:
            # add tokenizer
            self.tokenizer.add_special_tokens({'pad_token': 'EOS'}) # end of sentence token
            self.model.resize_token_embeddings(len(self.tokenizer))
        self.df = movies
        self.overviews = [self.preprocess_string(str(a)) for a in self.df['overview'].values.tolist()]
        
        print("Getting bert embeddings...")
        embeddings_path = os.path.join(self.cache_dir, 'overviews_embeddings.pt')
        if not os.path.exists(embeddings_path):
            self.overviews_embeddings = self.model_outputs(self.overviews)
            torch.save(self.overviews_embeddings, embeddings_path)
        else:
            self.overviews_embeddings = torch.load(embeddings_path)
        print("Done")
            
    def preprocess_string(self, text: str):
        doc = nlp(text)
        cleaned_text = ' '.join([token.lemma_ for token in doc if token.text.lower() not in stop_words and token.is_alpha])
        
        return cleaned_text

    def model_outputs(self, items: List[str], batch_size: int = 64):
        all_outputs = []
        num_batches = len(items) // batch_size + (len(items) % batch_size != 0)
        
        for i in tqdm(range(0, len(items), batch_size), desc="Progress", total=num_batches):
            batch_items = items[i:i+batch_size]
            inputs = self.tokenizer(batch_items, add_special_tokens=True, padding=True, max_length=100, truncation=True, return_tensors="pt")

            with torch.no_grad():
                outputs = self.model(**inputs).last_hidden_state[:, 0, :].numpy()  # [batch, maxlen, hidden_state] -> using only [batch, hidden_state]
                all_outputs.append(outputs)
        
        return np.concatenate(all_outputs, axis=0)

    def recommend(self, query: str, k: int = 5):
        '''         
        @inputs
            query: string, description of movie you seek for
        @outputs
            recommendation: list of top k movies with highest similarity score
        '''
        query_embedding = self.model_outputs([self.preprocess_string(query)])
        print(query_embedding)
        sim = cosine_similarity(query_embedding, self.overviews_embeddings)[0]

        movie_ids = self.df['movieID'].values.tolist()
        top_k_indicies = sim.argsort()[-k:][::-1].tolist()

        return [movie_ids[i] for i in top_k_indicies]
    
bert = BERT_strategy(plots)

Getting bert embeddings...
Done


In [40]:
out_ids = bert.recommend("Amy begins her first night shift in a hotel with a murderous past. Witnessing terrifying events and trapped within a loop, Amy must find a way to escape the flesh obsessed murderer and save residents of the hotel")

Progress: 100%|██████████| 1/1 [00:00<00:00, 27.75it/s]

[[-2.66157717e-01 -1.41092166e-01  1.75003037e-02 -8.98907408e-02
   1.70342714e-01 -1.92662049e-02  8.04049730e-01 -2.04639230e-03
   1.56170046e-02 -3.04371029e-01  2.58200467e-01 -4.01669174e-01
   2.69036852e-02  6.81553781e-01 -1.51997834e-01  1.86904907e-01
   3.67965311e-01  1.92120612e-01  3.35135937e-01 -5.15100099e-02
   4.20797229e-01 -5.63268244e-01  3.49920243e-01  2.25656331e-01
   1.24935448e-01 -2.20034793e-02 -2.42146164e-01  1.92668229e-01
   5.86223677e-02  2.04392001e-01  1.46270573e-01 -1.19263001e-01
   1.58734433e-02 -4.58267212e-01  2.57323503e-01 -1.71047673e-02
   1.46093577e-01  9.17640552e-02 -8.81221443e-02  3.11756581e-01
  -2.20554739e-01 -9.75680500e-02  1.17772534e-01 -9.83754620e-02
   2.02146694e-01 -5.59600115e-01 -2.24485135e+00 -3.74289125e-01
  -1.01840019e-01 -3.40554953e-01  3.56394351e-01 -8.55824277e-02
  -3.30474228e-04 -2.50966340e-01  1.89764097e-01  7.97038078e-01
  -7.32346326e-02  9.38795730e-02 -1.49005398e-01  2.49224797e-01
   3.83699




In [10]:
out_ids

[2518, 1053, 1996, 987, 2298]

In [11]:
plots[plots['movieID'].isin(out_ids)][['title', 'overview']].values

array([['Bliss (1997)',
        'A mind-bending love story following Greg who, after recently being divorced and then fired, meets the mysterious Isabel, a woman living on the streets and convinced that the polluted, broken world around them is just a computer simulation. Doubtful at first, Greg eventually discovers there may be some truth to Isabel’s wild conspiracy.'],
       ['Normal Life (1996)',
        "Chris Anderson and his wife Pam live a fairly normal life until Chris loses his job on the police force and secretly turns to robbing banks to make his wife's dreams come true. Upon discovering his secret, she joins his deadly crime wave and together they terrorize an unsuspecting suburban town."],
       ['Poltergeist III (1988)',
        "Carol Anne has been sent to live with her Aunt and Uncle in an effort to hide her from the clutches of the ghostly Reverend Kane, but he tracks her down and terrorises her in her relatives' appartment in a tall glass building. Will he finally a

In [12]:
plots[plots['title'].str.startswith('Toy Story')].values

array([[1, 'Toy Story (1995)',
        "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."],
       [3114, 'Toy Story 2 (1999)',
        "Andy heads off to Cowboy Camp, leaving his toys to their own devices. Things shift into high gear when an obsessive toy collector named Al McWhiggen, owner of Al's Toy Barn kidnaps Woody. Andy's toys mount a daring rescue mission, Buzz Lightyear meets his match and Woody has to decide where he and his heart truly belong."]],
      dtype=object)

In [13]:
len(plots)

3814

In [24]:
from transformers import T5Tokenizer, T5EncoderModel

In [71]:
class T5Predictor:
    def __init__(self, movies, cache_dir: str = './data/', model_name: str = "t5-small"):
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5EncoderModel.from_pretrained(model_name)

        self.df = movies
        self.overviews = [self.preprocess(str(a)) for a in self.df['overview'].values.tolist()]
        
        print("Getting T5 embeddings...")
        embeddings_path = os.path.join(self.cache_dir, 'overviews_embeddings_t5.pt')
        if not os.path.exists(embeddings_path):
            self.overviews_embeddings = self.model_outputs(self.overviews)
            torch.save(self.overviews_embeddings, embeddings_path)
        else:
            self.overviews_embeddings = torch.load(embeddings_path)
        print("Done")
    
    def preprocess(self, text: str) -> np.ndarray:
        doc = nlp(text)

        cleaned_text = ' '.join([token.lemma_ for token in doc if token.text.lower() not in stop_words and token.is_alpha])
        
        return cleaned_text
    
    def model_outputs(self, items: List[str], batch_size: int = 64):
        all_outputs = []
        num_batches = len(items) // batch_size + (len(items) % batch_size != 0)
        
        for i in tqdm(range(0, len(items), batch_size), desc="Progress", total=num_batches):
            batch_items = items[i:i+batch_size]
            inputs = self.tokenizer(batch_items, add_special_tokens=True, padding=True, max_length=100, truncation=True, return_tensors="pt")

            with torch.no_grad():
                outputs = self.model(**inputs).last_hidden_state[:, 0, :].numpy()  # [batch, maxlen, hidden_state] -> using only [batch, hidden_state]
                all_outputs.append(outputs)
        return np.concatenate(all_outputs, axis=0)
    
    
    def find_most_similar(self, input_text: str, top_n: int = 5) -> list:
        output  = self.model_outputs([self.preprocess(input_text)])
        #print(output)
        similarities = cosine_similarity(output, self.overviews_embeddings)[0]
        
        most_similar_indices = similarities.argsort()[-top_n:][::-1]
        movie_ids = self.df['movieID'].values.tolist()

        result = [movie_ids[i] for i in most_similar_indices]
        print(f"Movie IDS: {result}")
        return result

In [72]:
t5 = T5Predictor(plots)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Getting T5 embeddings...


Progress: 100%|██████████| 60/60 [02:09<00:00,  2.16s/it]

Done





In [113]:
out_ids = bert.recommend("A funny movie with talking animals")

Progress: 100%|██████████| 1/1 [00:00<00:00, 44.62it/s]

[[-2.31389374e-01 -1.16050191e-01 -3.40510756e-02 -8.09760690e-02
  -7.09153488e-02 -1.90737501e-01  2.31461972e-01  4.33843464e-01
  -2.02163160e-01 -6.44693226e-02  8.74928385e-03 -1.29121304e-01
  -9.20528546e-02  2.35302612e-01 -5.57978228e-02  7.24287331e-02
  -1.39271453e-01  2.07740694e-01  2.73483098e-01 -1.51595056e-01
   2.82573625e-02 -3.21703017e-01 -2.91800126e-02 -1.75978646e-01
  -8.28471035e-02 -5.34971878e-02 -3.54028083e-02  1.23387761e-01
   1.98172241e-01  1.25928879e-01  9.41045769e-03  1.07434124e-01
  -6.56422600e-02 -1.42140597e-01  7.66285509e-03 -3.20078321e-02
   8.25711787e-02 -3.89065370e-02 -1.51125547e-02  1.19604692e-01
   2.57000104e-02  8.89517367e-04  1.14825815e-01 -9.16471556e-02
   6.76917732e-02 -2.22942233e-01 -2.06314993e+00  1.06673418e-02
  -2.56794810e-01 -2.00130895e-01  3.02332282e-01  4.60351631e-02
   2.45962828e-01  2.57436514e-01  1.48710832e-01  3.70654225e-01
  -3.23712602e-02  1.90957218e-01  5.42325340e-02  8.05940032e-02
   1.17585




In [114]:
out_idst5 = t5.find_most_similar("A funny movie with talking animals")
out_idst5

Progress: 100%|██████████| 1/1 [00:00<00:00, 32.15it/s]

Movie IDS: [1122, 3429, 3819, 3828, 2050]





[1122, 3429, 3819, 3828, 2050]

In [115]:
plots[plots['movieID'].isin(out_ids)][['title', 'overview']].values

array([['Lawnmower Man, The (1992)',
        'A simple man is turned into a genius through the application of computer science.'],
       ['Smile Like Yours, A (1997)',
        'A comedy about a couple who cannot conceive a baby'],
       ['Mouse Hunt (1997)', 'A band of mice steals grains from cats.'],
       ['Tinseltown (1998)',
        "In Hollywood it's all about who you know, and the only person two friends know is a serial killer."],
       ['Lost Son, The (1999)',
        'Mac is at the threshold of losing everything while directing his feature film.']],
      dtype=object)

In [116]:
plots[plots['movieID'].isin(out_idst5)][['title', 'overview']].values

array([['Plutonium Circus (1995)',
        'Funny yet incisive look at the PANTEX Nuclear Weapons Plant, in Amarillo, TX, which was used for nuclear weapons assembly during the Cold War. The plant, which provides most of the jobs for those living in Amarillo, now operates as a disassembly-line where the weapons, which were once built there, are now being taken apart with the plutonium getting stored underground. The film deals with the issues of storing the plutonium and the effects the plant has had (and is having) on the town of Amarillo, as well as how it has affected the way people thought about the Cold War and its aftermath. It does so by taking a look at the lives and activities of the Amarillo residents directly or indirectly associated with, or having strong opinions about the Plant.'],
       ['Herbie Goes Bananas (1980)',
        'The adorable little VW helps its owners break up a counterfeiting ring in Mexico.'],
       ['Creature Comforts (1990)',
        'A humorous and t

In [92]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [109]:
class lfidf:
   def __init__(self, movies, cache_dir: str = './data/', model_name: str = "t5-small"):
      self.cache_dir = cache_dir
      os.makedirs(self.cache_dir, exist_ok=True)

      self.lemmatizer = WordNetLemmatizer()
      self.tfidf_vectorizer = TfidfVectorizer()
      
      self.df = movies
      self.overviews = [self.preprocess(str(a)) for a in self.df['overview'].values.tolist()]
      
      embeddings_path = os.path.join(self.cache_dir, 'overviews_embeddings_lfidf.pt')
      if not os.path.exists(embeddings_path):
         self.overviews_embeddings = self.tfidf_vectorizer.fit_transform(self.overviews)
         torch.save(self.overviews_embeddings, embeddings_path)
      else:
         self.tfidf_vectorizer.fit(self.overviews)
         self.overviews_embeddings = torch.load(embeddings_path)
      

   def preprocess(self, text):
      tokens = word_tokenize(text.lower())
      filtered_tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word.isalnum()]
      return ' '.join(filtered_tokens)
   
   def find_best_fit_movies(self, prompt, top_n=5):
      prompt_processed = self.preprocess(prompt)
      prompt_vector = self.tfidf_vectorizer.transform([prompt_processed])
      
      similarity_scores = cosine_similarity(prompt_vector, self.overviews_embeddings)[0]
      most_similar_indices = similarity_scores.argsort()[-top_n:][::-1]
      movie_ids = self.df['movieID'].values.tolist()

      result = [movie_ids[i] for i in most_similar_indices]
      print(f"Movie IDS: {result}")
      return result
         

In [110]:
lfidf = lfidf(plots)

In [130]:
out_idslf = lfidf.find_best_fit_movies("Funny movie about talking animals")
out_idslf

Movie IDS: [2443, 634, 243, 1360, 333]


[2443, 634, 243, 1360, 333]

In [131]:
plots[plots['movieID'].isin(out_idslf)][['title', 'overview']].values

array([['Gordy (1995)',
        'A talking pig named Gordy becomes involved in a quest to save his family from the slaughterhouse.'],
       ['Tommy Boy (1995)',
        'To save the family business, two ne’er-do-well traveling salesmen hit the road with disastrously funny consequences.'],
       ['Theodore Rex (1995)',
        'In an alternate futuristic society, a tough female police detective is paired with a talking dinosaur to find the killer of dinosaurs and other prehistoric animals leading them to a mad scientist bent on creating a new Armageddon.'],
       ['Identification of a Woman (Identificazione di una donna) (1982)',
        "The movie director Niccolò has just been left by his wife. Subsequently he embarks on an obsessive relationship with a young woman who eventually leaves him and disappears while searching for her, he meets a variety of other willing girls. This gives him the idea of making a movie about women's relationships. He starts to search for a woman who can 

In [162]:
from gensim.models import Word2Vec
class word2vec:
   def __init__(self, movies, cache_dir: str = './data/', model_name: str = "t5-small"):
      self.cache_dir = cache_dir
      os.makedirs(self.cache_dir, exist_ok=True)

      self.lemmatizer = WordNetLemmatizer()
      self.tfidf_vectorizer = TfidfVectorizer()
      
      self.df = movies
      self.overviews = [self.preprocess(str(a)) for a in self.df['overview'].values.tolist()]
      self.word2vec_model = Word2Vec(self.overviews, vector_size=100, window=5, min_count=1, workers=4)

      embeddings_path = os.path.join(self.cache_dir, 'overviews_embeddings_word2vec.pt')
      if not os.path.exists(embeddings_path):
         self.overviews_embeddings = [self.vectorize_sentence(a, self.word2vec_model) for a in self.overviews]
         
         torch.save(self.overviews_embeddings, embeddings_path)
      else:
         self.overviews_embeddings = torch.load(embeddings_path)
      

   def preprocess(self, text):
      tokens = word_tokenize(text.lower())
      filtered_tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word.isalnum()]
      return ' '.join(filtered_tokens)
   
   def vectorize_sentence(self, sentence, model):
      vectors = [model.wv[word] for word in sentence if word in model.wv]
      if not vectors:
         return np.zeros(model.vector_size)
      return np.mean(vectors, axis=0)
   
   def find_best_fit_movies(self, prompt, top_n=5):
      prompt_processed = self.preprocess(prompt)
      prompt_vector = self.vectorize_sentence(prompt_processed, self.word2vec_model)
      similarity_scores = [cosine_similarity([prompt_vector], [overview])[0][0] for overview in self.overviews_embeddings]
      
      #most_similar_indices = similarity_scores.nlargest(top_n).index
      # similarity_scores = cosine_similarity(prompt_vector, self.overviews_embeddings)[0]
      most_similar_indices = np.argsort(similarity_scores)[-top_n:][::-1]
      movie_ids = self.df['movieID'].values.tolist()

      result = [movie_ids[i] for i in most_similar_indices]
      print(f"Movie IDS: {result}")
      return result
         

In [163]:
words = word2vec(plots)

In [169]:
out_idswords = words.find_best_fit_movies("buzz lightyear")
out_idswords

[0.8646652, 0.83766663, 0.84035873, 0.8320831, 0.833397, 0.8200444, 0.84836984, 0.8488707, 0.8497766, 0.8349058, 0.8341093, 0.83449155, 0.83378637, 0.8430012, 0.82107717, 0.8296086, 0.83428353, 0.8110009, 0.827028, 0.8284133, 0.84321904, 0.83011055, 0.8382337, 0.8434102, 0.8325455, 0.7998518, 0.83924955, 0.8354376, 0.818656, 0.82660884, 0.8308071, 0.83577627, 0.8295379, 0.82753944, 0.83074105, 0.8328463, 0.8517064, 0.833789, 0.832806, 0.8209554, 0.82609344, 0.8183186, 0.83320403, 0.8452732, 0.8350979, 0.8065775, 0.8326493, 0.8524684, 0.8295759, 0.8351065, 0.83772653, 0.8252324, 0.82140535, 0.83294505, 0.81953835, 0.8480341, 0.8141236, 0.8182094, 0.8413095, 0.838038, 0.8336382, 0.83410627, 0.8418002, 0.83298236, 0.8383667, 0.8369723, 0.80898285, 0.8618585, 0.8256802, 0.82567585, 0.8334872, 0.81796396, 0.8251867, 0.82162654, 0.83802164, 0.83692896, 0.82448393, 0.8305355, 0.8293899, 0.8206334, 0.84715104, 0.82075894, 0.8172603, 0.8354642, 0.8273645, 0.8462034, 0.8208983, 0.83288014, 0.847

[2086, 396, 3149, 1213, 244]

In [170]:
plots[plots['movieID'].isin(out_idswords)][['title', 'overview']].values

array([['Gumby: The Movie (1995)',
        'On the brink of a big deal with mogul Lucky Claybert, Gumby and his band The Clayboys must do battle with the villainous Blockheads, who have kidnapped their loyal canine Lowbelly.'],
       ['Fall Time (1995)',
        'Three young men decide to plan a mock kidnapping, but everything goes wrong because a real bank robbery was already planned by two other guys.'],
       ['GoodFellas (1990)',
        'The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway.'],
       ['One Magic Christmas (1985)',
        "Ginny Grainger, a young mother, rediscovers the joy and beauty of Christmas, thanks to the unshakable faith of her six-year-old daughter Abbie and Gideon, Ginny's very own guardian angel."],
       ['Diamonds (1999)',
        'Mystery about an ex-prizefighter who embarks on a journey to find 13 m