In [1]:
import os
from getpass import getpass

import pandas as pd
from elasticsearch import Elasticsearch, helpers

In [61]:
# Connect to Elasticsearch.
# Credentials are read from the environment (with an interactive prompt as a
# fallback for the password) so that no secret is stored in the notebook.
# `basic_auth` replaces the deprecated `http_auth` argument of the 8.x client.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://localhost:9200"),
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: "),
    ),
)

# Name of the index that holds the movie documents
index_name = "movies"

# Index definition: a custom analyzer (lowercase -> synonyms -> English stemmer)
# applied to the free-text fields, plus explicit types for the numeric fields.
# NOTE(review): `tmdbid` is written by generate_movie_actions but is not listed
# in the mappings below, so it relies on dynamic mapping - confirm this is intended.
index_body = {
    "settings": {
        "analysis": {
            "filter": {
                "synonym_filter": {
                    "type": "synonym",
                    "synonyms": [
                        "sci-fi, science fiction",
                        "romcom, romantic comedy",
                        "thriller, suspense"
                    ]
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                }
            },
            "analyzer": {
                "synonym_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "synonym_filter", "english_stemmer"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "movieId": {"type": "integer"},
            "title": {"type": "text", "analyzer": "synonym_analyzer"},
            "genres": {"type": "text", "analyzer": "synonym_analyzer"},
            "overview": {"type": "text", "analyzer": "synonym_analyzer"},
            "production_countries": {"type": "text"},
            "runtime": {"type": "integer"},
            "spoken_languages": {"type": "text"},
            "vote_average": {"type": "float"},
            "vote_count": {"type": "integer"}
        }
    }
}
# Drop the index if it already exists, then create it again, so re-running this
# cell always starts from an empty index. `ignore_unavailable=True` makes the
# delete a no-op when the index is missing (no separate exists() round trip).
es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_body)

  es = Elasticsearch("http://localhost:9200", http_auth=("elastic", "password123"))


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'})

In [62]:
# Load the merged movie table and keep only rows without any missing value.
df_movies = pd.read_csv('../data/merged_movies.csv').dropna(how='any')
# First 800 rows of the cleaned table.
df_movies_small = df_movies.head(800)

In [66]:
# Prepare the data for bulk indexing into Elasticsearch
def generate_movie_actions(df, index=None):
    """Yield one bulk-indexing action per row of ``df``.

    Args:
        df (pandas.DataFrame): Movie table. Every column named in
            ``source_fields`` below must be present.
        index (str, optional): Name of the target index. When omitted, the
            notebook-level ``index_name`` variable is used, which keeps
            existing ``generate_movie_actions(df)`` calls working unchanged.

    Yields:
        dict: An action with ``_index``, ``_id`` (the row's ``movieId``, used
        as the unique document identifier) and ``_source`` (the document).

    Raises:
        KeyError: If ``df`` lacks one of the expected columns.
    """
    # Resolved lazily (on first iteration), like every statement in a generator.
    target_index = index_name if index is None else index

    # Columns copied verbatim into the document body, in this order.
    source_fields = (
        "movieId",
        "title",
        "genres",
        "tmdbid",
        "overview",
        "production_countries",
        "runtime",
        "spoken_languages",
        "vote_average",
        "vote_count",
    )

    for _, row in df.iterrows():
        yield {
            "_index": target_index,
            "_id": row["movieId"],  # movieId doubles as the unique document id
            "_source": {field: row[field] for field in source_fields},
        }

# Bulk-index the movie documents.
# Note: the full `df_movies` frame is indexed here, not `df_movies_small`.
movie_actions = generate_movie_actions(df_movies)
helpers.bulk(es, movie_actions)

print("Movies indexed successfully.")


Movies indexed successfully.


In [74]:
def search_movie(query, index_name="movies", top_k=5, client=None):
    """
    Search the movie index with a free-text query.

    The text part is a ``multi_match`` with ``fuzziness`` "AUTO" over
    ``title`` (boost 3), ``genres`` (boost 2), ``overview``,
    ``production_countries`` and ``spoken_languages``. It is wrapped in a
    ``function_score`` query with ``boost_mode`` "multiply" and two
    ``field_value_factor`` functions on ``vote_average`` (factor 1.5,
    missing 0) and ``vote_count`` (factor 1.2, missing 1).

    Args:
        query (str): Query text.
        index_name (str): Name of the index to search.
        top_k (int): Number of results requested (sent as ``size``).
        client: Optional object exposing ``search(index=..., body=...)``.
            When omitted, the notebook-level ``es`` client is used, so
            existing calls keep working unchanged.

    Returns:
        list: One dict per hit, in the order returned by the search, with
        the keys ``movieId``, ``title``, ``genres``, ``overview`` and
        ``score`` (the hit's ``_score``).
    """
    # Fall back to the notebook-level client only when none was injected.
    es_client = es if client is None else client

    # Text relevance part of the query
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["title^3", "genres^2", "overview", "production_countries", "spoken_languages"],
            "fuzziness": "AUTO"
        }
    }

    # Rating-based scoring functions applied on top of the text match
    rating_functions = [
        {
            "field_value_factor": {
                "field": "vote_average",
                "factor": 1.5,
                "missing": 0
            }
        },
        {
            "field_value_factor": {
                "field": "vote_count",
                "factor": 1.2,
                "missing": 1
            }
        }
    ]

    # Request body
    search_body = {
        "query": {
            "function_score": {
                "query": text_match,
                "boost_mode": "multiply",
                "functions": rating_functions
            }
        },
        "size": top_k
    }

    # Run the search
    response = es_client.search(index=index_name, body=search_body)

    # Keep only the fields of interest, plus the relevance score of each hit
    return [
        {
            "movieId": hit["_source"]["movieId"],
            "title": hit["_source"]["title"],
            "genres": hit["_source"]["genres"],
            "overview": hit["_source"]["overview"],
            "score": hit["_score"]  # relevance to the query
        }
        for hit in response["hits"]["hits"]
    ]

# Example: look up a title keyword and list the hits, best match first
query = "dracula"
top_movies = search_movie(query=query)

print("Top recommended movies:")
for movie in top_movies:
    print(movie)


Top recommended movies:
{'movieId': 1339, 'title': "Dracula (Bram Stoker's Dracula) (1992)", 'genres': 'Fantasy,Horror,Romance,Thriller', 'overview': 'In 19th century England, Count Dracula travels to London and meets Mina Harker, a young woman who appears as the reincarnation of his lost love.', 'score': 1869111.1}
{'movieId': 114795, 'title': 'Dracula Untold (2014)', 'genres': 'Action,Drama,Fantasy', 'overview': "Vlad Tepes is a great hero, but when he learns the Sultan is preparing for battle and needs to form an army of 1,000 boys, he vows to find a way to protect his family. Vlad turns to dark forces in order to get the power to destroy his enemies and agrees to go from hero to monster as he's turned into the mythological vampire, Dracula.", 'score': 1651774.9}
{'movieId': 97225, 'title': 'Hotel Transylvania (2012)', 'genres': 'Animation,Children,Comedy', 'overview': "Welcome to Hotel Transylvania, Dracula's lavish five-stake resort, where monsters and their families can live it u