In [10]:
#from django.db import models
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

In [2]:
class Strategries:
    '''Placeholder namespace for recommendation strategies (nothing implemented yet).'''

    @staticmethod
    def bert():
        '''Stub for a BERT-based strategy; currently does nothing and returns None.

        Declared as a static method because it takes no ``self``: without the
        decorator, calling it on an instance raises ``TypeError``.
        '''
        ...

In [4]:
# CSV with the movie plots; the path is relative to the current working directory.
PLOTS_DATA = os.path.join("backend-django", "data", "movies.csv")
# overviews: load the table and drop every row that has a missing value
plots = pd.read_csv(PLOTS_DATA).dropna()
# preview the first rows (columns in the output below: movieID, title, overview)
print(plots.head())

   movieID                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                            overview  
0  Led by Woody, Andy's toys live happily in his ...  
1  When siblings Judy and Peter discover an encha...  
2  A family wedding reignites the ancient feud be...  
3  Cheated on, mistreated and stepped on, the wom...  
4  Just when George Banks has recovered from his ...  


In [5]:
import nltk
# Fetch the NLTK stop-word corpus that a later cell reads via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\weraz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# One-time setup: uncomment to download the spaCy English model loaded below.
#!python -m spacy download en_core_web_sm

import nltk
from nltk.corpus import stopwords
import spacy
# spaCy pipeline used by BERT_strategy.preprocess_string (lemmas, is_alpha flags)
nlp = spacy.load('en_core_web_sm')
# English stop words kept as a set for fast membership tests during preprocessing
stop_words = set(stopwords.words('english'))

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import warnings
warnings.filterwarnings('ignore')
from typing import List
import torch

class BERT_strategy:
    '''Movie recommendation strategy based on a DistilBERT model (semantic similarity).

    Works by finding similarities between movies' overviews:
        (1) each overview is cleaned and turned into tokens,
        (2) the tokenized overviews are sent through the BERT model,
        (3) recommendations are chosen by cosine similarity between the model
            output for the query and the model outputs for the overviews.

    Relies on names defined in other cells of this notebook: ``nlp`` (spaCy
    pipeline), ``stop_words`` (set of stop words), ``tqdm`` and ``os``.
    '''

    def __init__(self, movies: pd.DataFrame, cache_dir: str = './data/') -> None:
        '''Load model and tokenizer, then compute (or load cached) overview embeddings.

        @inputs
            movies: DataFrame with at least the 'movieID' and 'overview' columns
            cache_dir: directory for the downloaded model files and the embeddings cache
        '''
        # ensure cache dir exists
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

        self.model = DistilBertModel.from_pretrained("distilbert-base-uncased", cache_dir=self.cache_dir)
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', cache_dir=self.cache_dir)
        if self.tokenizer.pad_token is None:
            # batched tokenization needs a padding token: register one and
            # grow the model's embedding matrix to the new vocabulary size
            self.tokenizer.add_special_tokens({'pad_token': 'EOS'})
            self.model.resize_token_embeddings(len(self.tokenizer))
        self.df = movies
        self.overviews = [self.preprocess_string(str(a)) for a in self.df['overview'].values.tolist()]

        print("Getting bert embeddings...")
        embeddings_path = os.path.join(self.cache_dir, 'overviews_embeddings.pt')
        embeddings = self._load_cached_embeddings(embeddings_path, expected_rows=len(self.overviews))
        if embeddings is None:
            embeddings = self.model_outputs(self.overviews)
            # Saved as a tensor (not a pickled ndarray) so torch.load can read
            # it back even when it only accepts tensor/weights payloads.
            torch.save(torch.from_numpy(embeddings), embeddings_path)
        self.overviews_embeddings = embeddings
        print("Done")

    @staticmethod
    def _load_cached_embeddings(path: str, expected_rows: int):
        '''Return the cached embeddings as a 2-D numpy array, or None when unusable.

        The cache is rejected (so the caller recomputes) when the file is
        missing, cannot be read, or does not have exactly ``expected_rows``
        rows -- a row-count mismatch means it was built from another table and
        its row positions would not line up with the movies.
        '''
        if not os.path.exists(path):
            return None
        try:
            # NOTE: torch.load unpickles data; only point cache_dir at files you wrote yourself.
            cached = torch.load(path)
        except Exception as exc:  # best effort: a broken cache must not block start-up
            print(f"Ignoring unreadable embeddings cache ({exc}); recomputing...")
            return None
        if isinstance(cached, torch.Tensor):
            cached = cached.detach().cpu().numpy()
        cached = np.asarray(cached)
        if cached.ndim != 2 or cached.shape[0] != expected_rows:
            print("Embeddings cache does not match the movies table; recomputing...")
            return None
        return cached

    def preprocess_string(self, text: str):
        '''Lemmatize ``text`` and drop stop words and non-alphabetic tokens.

        @inputs
            text: raw string (overview or query)
        @outputs
            cleaned_text: remaining lemmas joined with single spaces
        '''
        doc = nlp(text)
        cleaned_text = ' '.join([token.lemma_ for token in doc if token.text.lower() not in stop_words and token.is_alpha])

        return cleaned_text

    def model_outputs(self, items: List[str], batch_size: int = 64):
        '''Embed strings with the BERT model, ``batch_size`` strings at a time.

        @inputs
            items: list of (already preprocessed) strings
            batch_size: number of strings sent through the model per step
        @outputs
            numpy array with one row per item, taken from the model's last
            hidden state at the first token position
        @raises
            ValueError: if batch_size is not positive
        '''
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if len(items) == 0:
            # np.concatenate rejects an empty list, so build the empty result directly
            return np.empty((0, self.model.config.dim), dtype=np.float32)

        all_outputs = []
        num_batches = -(-len(items) // batch_size)  # ceiling division

        for i in tqdm(range(0, len(items), batch_size), desc="Progress", total=num_batches):
            batch_items = items[i:i+batch_size]
            inputs = self.tokenizer(batch_items, add_special_tokens=True, padding=True, max_length=100, truncation=True, return_tensors="pt")

            with torch.no_grad():
                # [batch, maxlen, hidden_state] -> using only [batch, hidden_state]
                outputs = self.model(**inputs).last_hidden_state[:, 0, :].numpy()
            all_outputs.append(outputs)

        return np.concatenate(all_outputs, axis=0)

    def recommend(self, query: str, k: int = 5):
        '''
        @inputs
            query: string, description of the movie you are looking for
            k: maximum number of movies to return
        @outputs
            recommendation: list of up to k movieIDs with the highest similarity
                            score, most similar first; empty list when k <= 0
                            or when there are no movies to compare against
        '''
        # Guard first: a slice like [-0:] would otherwise return every movie.
        if k <= 0 or len(self.overviews_embeddings) == 0:
            return []

        query_embedding = self.model_outputs([self.preprocess_string(query)])
        sim = cosine_similarity(query_embedding, self.overviews_embeddings)[0]

        movie_ids = self.df['movieID'].values.tolist()
        # argsort is ascending, so the k best scores sit at the end; reverse to rank best first
        top_k_indices = sim.argsort()[-k:][::-1].tolist()

        return [movie_ids[i] for i in top_k_indices]
    
# Build the recommender over the loaded overviews: loads the model, then
# computes the overview embeddings or reads them from the cache directory.
bert = BERT_strategy(plots)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting bert embeddings...


Batches:  25%|██▌       | 15/59 [01:23<04:04,  5.56s/it]


KeyboardInterrupt: 

In [None]:
# Sanity check: query with the overview of Toy Story (1995) itself (default k=5).
out_ids = bert.recommend("Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.")

In [None]:
# Titles and overviews of the recommended movies, in table order.
plots.loc[plots['movieID'].isin(out_ids), ['title', 'overview']].values

array([['Toy Story (1995)',
        "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."],
       ['Beavis and Butt-head Do America (1996)',
        "Slacker duo Beavis and Butt-Head wake to discover their TV has been stolen. Their search for a new one takes them on a clueless adventure across America, during which they manage to accidentally become America's most wanted."],
       ['Adventures in Babysitting (1987)',
        "When plans with her boyfriend fall through, high school senior Chris Parker ends up babysitting the Anderson kids, Brad and Sara. What should be a quiet night in, however, turns into a series of ridiculous exploits, starting when they leave the house to pick up Chris' friend Brenda. Soon, Brad's buddy Daryl is invo

In [67]:
# All rows whose title starts with 'Toy Story', for comparison with the recommendations.
plots[plots['title'].str.startswith('Toy Story')].values

array([[1, 'Toy Story (1995)',
        "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."],
       [3114, 'Toy Story 2 (1999)',
        "Andy heads off to Cowboy Camp, leaving his toys to their own devices. Things shift into high gear when an obsessive toy collector named Al McWhiggen, owner of Al's Toy Barn kidnaps Woody. Andy's toys mount a daring rescue mission, Buzz Lightyear meets his match and Woody has to decide where he and his heart truly belong."]],
      dtype=object)

In [79]:
# Number of movies left after dropping rows with missing values.
len(plots)

3771