In [1]:
import pandas as pd

from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the merged movie metadata from CSV; the path is relative to the
# notebook's working directory.
df_movies = pd.read_csv('../../data/merged_movies.csv')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df_movies.head()

Unnamed: 0,movieId,title,genres,tmdbid,overview,production_countries,runtime,spoken_languages,vote_average,vote_count
0,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",862,"Led by Woody, Andy's toys live happily in his ...",United States of America,81,English,8.0,18253
1,2,Jumanji (1995),"Adventure,Children,Fantasy",8844,When siblings Judy and Peter discover an encha...,United States of America,104,"English,Français",7.2,10435
2,3,Grumpier Old Men (1995),"Comedy,Romance",15602,A family wedding reignites the ancient feud be...,United States of America,101,English,6.5,374
3,4,Waiting to Exhale (1995),"Comedy,Drama,Romance",31357,"Cheated on, mistreated and stepped on, the wom...",United States of America,127,English,6.3,160
4,5,Father of the Bride Part II (1995),Comedy,11862,Just when George Banks has recovered from his ...,United States of America,106,English,6.2,725


In [4]:
# Summary statistics of the numeric columns (count, mean, std, quartiles, min/max).
df_movies.describe()

Unnamed: 0,movieId,tmdbid,runtime,vote_average,vote_count
count,25483.0,25483.0,25483.0,25483.0,25483.0
mean,58096.091198,59979.116352,99.598791,6.203465,564.843464
std,44146.907337,66410.098298,25.399315,0.995472,1818.29422
min,1.0,2.0,0.0,0.0,0.0
25%,6692.0,15055.5,89.0,5.7,26.0
50%,63645.0,37744.0,97.0,6.3,78.0
75%,98602.5,77228.0,110.0,6.9,293.0
max,131258.0,418029.0,566.0,10.0,36373.0


In [11]:
# Column dtypes and non-null counts. In the recorded output, spoken_languages
# is the only column with missing values (25433 non-null out of 25483 rows).
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25483 entries, 0 to 25482
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               25483 non-null  int64  
 1   title                 25483 non-null  object 
 2   genres                25483 non-null  object 
 3   tmdbid                25483 non-null  int64  
 4   overview              25483 non-null  object 
 5   production_countries  25483 non-null  object 
 6   runtime               25483 non-null  int64  
 7   spoken_languages      25433 non-null  object 
 8   vote_average          25483 non-null  float64
 9   vote_count            25483 non-null  int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 1.9+ MB


In [8]:
# Load the model and tokenizer.
# Both are module-level objects that get_embedding() reads directly.
model_name = 'intfloat/multilingual-e5-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    The token vectors of the last hidden layer are averaged into one vector
    per input sequence. The average is weighted by the attention mask so that
    padding tokens (present when ``text`` is a list of strings of different
    lengths) do not dilute the result; for a single string the mask is all
    ones and this reduces to a plain mean over tokens.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.
            Inputs longer than the tokenizer's maximum length are truncated.

    Returns:
        A 2-D NumPy array with one row per input sequence.
    """
    # NOTE(review): the multilingual-e5 model card recommends prefixing inputs
    # with "query: " or "passage: ". No prefix is added here so that results
    # stay comparable with embeddings already produced by this function --
    # confirm whether a prefix should be introduced.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        # Broadcast the mask over the hidden dimension; 1 = real token, 0 = padding.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for a fully masked sequence.
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = (hidden * mask).sum(dim=1) / token_counts

    return embeddings.numpy()

def generate_movie_embeddings(movie):
    """Build one text description of a movie and return its embedding.

    The selected fields are joined with " | " in a fixed order and passed to
    ``get_embedding``. Fields whose value is missing (NaN/None) are skipped;
    interpolating them directly would put the literal string "nan" into the
    text that gets embedded. For a movie with no missing fields the text is
    identical to concatenating all eight fields.

    Args:
        movie: A mapping-like row (e.g. a pandas Series from
            ``DataFrame.apply(..., axis=1)``) providing the keys listed in
            ``fields`` below.

    Returns:
        Whatever ``get_embedding`` returns for the assembled text.
    """
    fields = (
        'title',
        'genres',
        'overview',
        'production_countries',
        'runtime',
        'spoken_languages',
        'vote_average',
        'vote_count',
    )
    # f-string formatting keeps the exact rendering of numeric values.
    parts = [f"{movie[name]}" for name in fields if not pd.isna(movie[name])]
    text = " | ".join(parts)
    return get_embedding(text)

In [9]:
# Sample data: three movies with the same columns that
# generate_movie_embeddings() reads.
data = {
    "movieId": [1, 2, 3],
    "title": ["Toy Story (1995)", "Jumanji (1995)", "Grumpier Old Men (1995)"],
    "genres": ["Adventure,Animation,Children,Comedy,Fantasy", "Adventure,Children,Fantasy", "Comedy,Romance"],
    "overview": [
        "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene.",
        "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world.",
        "A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max."
    ],
    "production_countries": ["United States of America", "United States of America", "United States of America"],
    "runtime": [81, 104, 101],
    "spoken_languages": ["English", "English,Français", "English"],
    "vote_average": [8.0, 7.2, 6.5],
    "vote_count": [18253, 10435, 374]
}

# Build the DataFrame
movies_df = pd.DataFrame(data)

# Apply the embedding function to every movie (row-wise); each cell of the
# new 'embedding' column holds the array returned for that row.
movies_df['embedding'] = movies_df.apply(generate_movie_embeddings, axis=1)

# Print the results
print(movies_df[['movieId', 'embedding']])

   movieId                                          embedding
0        1  [[1.2067407, -0.9202704, -0.8746483, -1.03731,...
1        2  [[1.011993, -0.33529368, -0.79698354, -0.74171...
2        3  [[0.9790182, -0.26317513, -1.216532, -1.310239...


In [10]:
# Each stored embedding is a 2-D array (note the nested brackets in the
# printed frame above), so [0] selects its first row before taking the shape.
movies_df.embedding.iloc[0][0].shape

(1024,)