# 0. Configuration

In [13]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [14]:
pip install pymystem3

You should consider upgrading via the '/Users/vydolga/.pyenv/versions/3.9.16/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from ast import literal_eval
from pymystem3 import Mystem
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

# download stop words beforehand
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vydolga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1.1. Helper functions to avoid copypaste

In [16]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [17]:
# init lemmatizer to avoid slow performance
mystem = Mystem() 

def word_tokenize_clean(doc: str, stop_words: list):
    '''
    tokenize from string to list of words
    '''

    # split into lower case word tokens \w lemmatization
    tokens = list(set(mystem.lemmatize(doc.lower())))
  
    # remove tokens that are not alphabetic (including punctuation) and not a stop word
    tokens = [word for word in tokens if word.isalpha() and not word in stop_words \
              not in list(punctuation)]
    return tokens

# 2. Main

## 2.1. Data Preparation

In [18]:
# read csv information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [19]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

To get accurate results we need to preprocess text a bit. The pipeline will be as follows:

- Filter only necessary columns from movies_metadada : id, original_title, overview;
- Define `model_index` for model to match back with `id` column;
- Text cleaning: removing stopwords & punctuation, lemmatization for further tokenization and tagged document creatin required for gensim.Doc2Vec

In [20]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview','tagline']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
 3   tagline         20412 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB


In [21]:
# as you see from above, we have missing overview in some cases -- let's fill it with the original title
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample.loc[sample['tagline'].isnull(), 'tagline'] = sample.loc[sample['tagline'].isnull(), 'original_title']

sample.isnull().sum()

id                0
original_title    0
overview          0
tagline           0
dtype: int64

In [22]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [37]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper_up = {}
k=0
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))
for i in movies_inv_mapper:
    movies_inv_mapper_up[i] = [k, k+1, k+2]
    k=k+3
    # movies_inv_mapper_up[i] = k
    # k=k+1
    # movies_inv_mapper_up[i] = k
    # k=k+1
movies_inv_mapper_up

{'toy story': [0, 1, 2],
 'jumanji': [3, 4, 5],
 'grumpier old men': [6, 7, 8],
 'waiting to exhale': [9, 10, 11],
 'father of the bride part ii': [12, 13, 14],
 'heat': [15, 16, 17],
 'sabrina': [18, 19, 20],
 'tom and huck': [21, 22, 23],
 'sudden death': [24, 25, 26],
 'goldeneye': [27, 28, 29],
 'the american president': [30, 31, 32],
 'dracula: dead and loving it': [33, 34, 35],
 'balto': [36, 37, 38],
 'nixon': [39, 40, 41],
 'cutthroat island': [42, 43, 44],
 'casino': [45, 46, 47],
 'sense and sensibility': [48, 49, 50],
 'four rooms': [51, 52, 53],
 'ace ventura: when nature calls': [54, 55, 56],
 'money train': [57, 58, 59],
 'get shorty': [60, 61, 62],
 'copycat': [63, 64, 65],
 'assassins': [66, 67, 68],
 'powder': [69, 70, 71],
 'leaving las vegas': [72, 73, 74],
 'othello': [75, 76, 77],
 'now and then': [78, 79, 80],
 'persuasion': [81, 82, 83],
 'la cité des enfants perdus': [84, 85, 86],
 '摇啊摇，摇到外婆桥': [87, 88, 89],
 'dangerous minds': [90, 91, 92],
 'twelve monkeys': [

In [24]:
# preprocess by removing non-character data, stopwords
cols = ['overview', 'original_title','tagline']

sample['combined'] = sample[cols].values.tolist()
tags_corpus = np.concatenate(sample['combined'].values.tolist())
# tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_doc[:1]

[['separate',
  'eventually',
  'happily',
  'scene',
  'aside',
  'lightyear',
  'losing',
  'woody',
  'plots',
  'andy',
  'duo',
  'circumstances',
  'birthday',
  'owner',
  'room',
  'toys',
  'live',
  'put',
  'learns',
  'differences',
  'led',
  'buzz',
  'place',
  'afraid',
  'heart',
  'onto',
  'brings']]

In [25]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [26]:
# let's check what do we have
## tag = movie index
tags_doc[-1]

TaggedDocument(words=['queerama'], tags=['136397'])

# 2.2. Model Training and Evaluation

In [27]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [28]:
# initialize
model = Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [29]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)

In [30]:
# train model
model.train(tags_doc,
            total_examples = 45466,
            epochs = EPOCHS)

## 2.3. Evaluate the model

Let's assume that we watched movie `batman` and based on that generate recommendation similar to it's description.

To do that we need
- To extract movie id from `movies_inv_mapper` we created to map back titles from model output
- Load embeddings from trained model
- Use built-in most_similar() method to get most relevant recommendations based on film embedding
- Finally, map title names for sense-check

In [43]:
# get id
movie_id = movies_inv_mapper_up.get('batman')
movie_id = movie_id[1]
movie_id

1756

In [44]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [45]:
movie_embeddings = movies_vectors[movie_id]

In [46]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,1756,1.0
1,113215,0.974853
2,32779,0.97484
3,81125,0.974784
4,29341,0.974576


In [52]:
# reverse values and indices to map names in dataframe
k = 0
name_mapper = {}
# name_mapper = {v: k for k, v in movies_inv_mapper_up.items()}
for i in movies_inv_mapper_up:
    name_mapper[k] = i
    k=k+1
    name_mapper[k] = i
    k=k+1
    name_mapper[k] = i
    k=k+1
name_mapper

{0: 'toy story',
 1: 'toy story',
 2: 'toy story',
 3: 'jumanji',
 4: 'jumanji',
 5: 'jumanji',
 6: 'grumpier old men',
 7: 'grumpier old men',
 8: 'grumpier old men',
 9: 'waiting to exhale',
 10: 'waiting to exhale',
 11: 'waiting to exhale',
 12: 'father of the bride part ii',
 13: 'father of the bride part ii',
 14: 'father of the bride part ii',
 15: 'heat',
 16: 'heat',
 17: 'heat',
 18: 'sabrina',
 19: 'sabrina',
 20: 'sabrina',
 21: 'tom and huck',
 22: 'tom and huck',
 23: 'tom and huck',
 24: 'sudden death',
 25: 'sudden death',
 26: 'sudden death',
 27: 'goldeneye',
 28: 'goldeneye',
 29: 'goldeneye',
 30: 'the american president',
 31: 'the american president',
 32: 'the american president',
 33: 'dracula: dead and loving it',
 34: 'dracula: dead and loving it',
 35: 'dracula: dead and loving it',
 36: 'balto',
 37: 'balto',
 38: 'balto',
 39: 'nixon',
 40: 'nixon',
 41: 'nixon',
 42: 'cutthroat island',
 43: 'cutthroat island',
 44: 'cutthroat island',
 45: 'casino',
 46: 

In [53]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output


Unnamed: 0,model_index,model_score,title_name
0,1756,1.0,batman
1,113215,0.974853,curse of the crimson altar
2,32779,0.97484,beerfest
3,81125,0.974784,12 + 1
4,29341,0.974576,mysterious skin
5,99229,0.974541,famous nathan
6,77246,0.974404,rosalie
7,43451,0.974252,torrid zone
8,32108,0.97422,peaceful warrior
9,74875,0.974041,shortcut to happiness


# TODO

- Add `original_title`, `keywords`, `tagline` and other metadata to train sample and then retrain embeddings;
- Make visualization of embeddings with links of films with each other;
- Compare results with the embeddings we created in lecture
- Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations

# Appendix

Here, we wrap up all pipeline into functions to re-use if needed and it is just prettier to code this way :)

In [34]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'tag'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')


    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc


In [35]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model