Part I

In [None]:
import os

import streamlit as st
from dotenv import load_dotenv
from api.omdb import OMDBApi
from recsys import ContentBaseRecSys

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

# Runtime configuration is read from environment variables.
API_KEY = os.getenv("API_KEY")
MOVIES = os.getenv("MOVIES")
DISTANCE = os.getenv("DISTANCE")

# Both dataset paths are handed to the recommender below; stop with a readable
# message instead of letting a missing value surface as an obscure loader error.
_missing_settings = [
    name for name, value in (("MOVIES", MOVIES), ("DISTANCE", DISTANCE)) if not value
]
if _missing_settings:
    st.error("Missing required environment variables: " + ", ".join(_missing_settings))
    st.stop()

# Poster lookup client and the number of recommendations shown per request.
omdbapi = OMDBApi(API_KEY)
TOP_K = 5

recsys = ContentBaseRecSys(
    movies_dataset_filepath=MOVIES,
    distance_filepath=DISTANCE,
)

# Page header: logo and title.
st.image("assets/popcorn.jpeg", width=200)
st.title("Рекомендатор: Cinema Recommendation System")

# Sidebar control 1: the movie the user liked (alphabetical list of known titles).
movie_options = sorted(recsys.get_title())
selected_movie = st.sidebar.selectbox(
    "Выбери фильм, который тебе понравился:", movie_options
)

# Sidebar control 2: optional genre restriction (empty selection means "any genre").
genre_options = sorted(recsys.get_genres())
selected_genres = st.sidebar.multiselect(
    "Уточни жанры, которые тебе интересны:", genre_options
)

# Sidebar control 3: lowest acceptable rating, picked from the recommender's rating scale.
rating_options = sorted(recsys.rate_range())
min_rating = st.sidebar.select_slider(
    "Укажи, ниже какого рейтинга кино не предлагать:", rating_options
)


@st.cache_data
def _fetch_recommendations(_recsys, _omdbapi, selected_movie, selected_genres, min_rating):
    """Return the recommended (title, poster) pairs as a list, cached per query.

    Streamlit leaves parameters whose names start with an underscore out of the
    cache key, so only the service objects carry that prefix. The movie, genres
    and rating are hashed, which makes every distinct query its own cache entry.
    The result is materialised into a list so that a reusable value is cached
    rather than a one-shot iterator.
    """
    return list(
        _recsys.recommendation(_omdbapi, selected_movie, TOP_K, selected_genres, min_rating)
    )


def show_recommendation(_recsys, _omdbapi, _selected_movie, selected_genres, min_rating):
    """Render the recommendations for the selected movie as a row of posters.

    Args:
        _recsys: recommender exposing ``recommendation(...)``.
        _omdbapi: poster client forwarded to the recommender.
        _selected_movie: title of the movie the user picked.
        selected_genres: genres to restrict the results to, or None for no restriction.
        min_rating: lowest acceptable rating.

    The recommendation lookup is cached in ``_fetch_recommendations``; the
    Streamlit elements themselves are drawn on every call.
    """
    st.subheader("Проверяю из того, что есть, нового фильма не жди.")

    # Use the recommender that was passed in (not the module-level one) and let
    # the cached helper key its result on the selected movie as well.
    recommended_movies = _fetch_recommendations(
        _recsys, _omdbapi, _selected_movie, selected_genres, min_rating
    )

    st.header("Тобой посмотрены уже эти фильмы?")
    num_cols = TOP_K
    cols = st.columns(num_cols, gap="large")

    for i, (movie_title, movie_poster) in enumerate(recommended_movies):
        # Wrap around so more results than columns still land in a valid column.
        with cols[i % num_cols]:
            if movie_poster is not None:
                st.image(movie_poster, width=130, caption=movie_title)
            else:
                # No poster available: show the placeholder, but keep the title
                # visible so the recommendation is still identifiable.
                st.image('assets/none.jpeg', width=130, caption=movie_title)


# Button handler: recommendations are produced only when the user asks for them.
if st.sidebar.button('Рекомендации от Рекомендатора'):
    # An empty genre selection is forwarded as None, i.e. "no genre restriction".
    genres_or_none = selected_genres or None
    show_recommendation(recsys, omdbapi, selected_movie, genres_or_none, min_rating)

Part II

In [None]:
from typing import List, Set

import pandas as pd
from .utils import parse
from itertools import chain

class ContentBaseRecSys:
    """Content-based movie recommender built on a precomputed similarity table.

    Attributes:
        distance: table read from ``distance_filepath``; its index and columns are
            cast to integer movie ids. Larger values rank as "more similar".
        movies: movie table read from ``movies_dataset_filepath``, indexed by
            integer movie id.
    """

    def __init__(self, movies_dataset_filepath: str, distance_filepath: str):
        """Load the similarity table and the movie table from CSV files.

        Args:
            movies_dataset_filepath: CSV with a ``movie_id`` column plus movie metadata.
            distance_filepath: CSV with a ``movie_id`` index column and one column per movie id.
        """
        self.distance = pd.read_csv(distance_filepath, index_col='movie_id')
        # CSV headers are read as strings; normalise both axes to int ids so that
        # row and column labels match the movie table's index.
        self.distance.index = self.distance.index.astype(int)
        self.distance.columns = self.distance.columns.astype(int)
        self._init_movies(movies_dataset_filepath)

    def _init_movies(self, movies_dataset_filepath) -> None:
        """Load the movie table and convert its ``genres`` column with ``parse``.

        ``parse`` comes from ``.utils``; the methods below only rely on each parsed
        value being an iterable of genre names that supports ``in``.
        """
        self.movies = pd.read_csv(movies_dataset_filepath, index_col='movie_id')
        self.movies.index = self.movies.index.astype(int)
        self.movies['genres'] = self.movies['genres'].apply(parse)

    def _first_value_by_title(self, column: str) -> dict:
        """Map every title to the value of ``column`` in the first row carrying that title."""
        first_rows = self.movies.drop_duplicates(subset='original_title', keep='first')
        return dict(zip(first_rows['original_title'], first_rows[column]))

    def get_title(self) -> Set[str]:
        """Return the set of all movie titles."""
        return set(self.movies['original_title'].values)

    def get_genres(self) -> Set[str]:
        """Return the set of distinct genres found across all movies."""
        return set(chain.from_iterable(self.movies['genres']))

    def get_years(self) -> Set[str]:
        """Return the set of distinct release years (first four characters of ``release_date``)."""
        return set(self.movies['release_date'].str[:4].dropna())

    def rate_range(self) -> Set[int]:
        """Return the rating scale offered to the user: the integers 0 through 10."""
        return set(range(11))

    def get_similar_movies(self, title: str) -> List[str]:
        """Return all other movie titles ordered from most to least similar to ``title``.

        Args:
            title: value of ``original_title`` identifying the reference movie.

        Returns:
            Titles ranked by descending value in the reference movie's row of
            ``distance``. The reference movie itself is not included.

        Raises:
            ValueError: if ``title`` is not present in the movie table.
        """
        titles = self.movies['original_title']
        if title not in titles.values:
            raise ValueError(f'Фильм "{title}", возможно, еще не сняли!')

        # Several rows may share a title; the first one is used as the reference.
        movie_index = self.movies.index[titles == title][0]

        # Leave the reference movie out of the candidates, otherwise it would be
        # ranked against itself and offered as a recommendation.
        scores = self.distance.loc[movie_index].drop(labels=movie_index, errors='ignore')
        ranked_ids = scores.nlargest(len(scores)).index.tolist()
        return self.movies.loc[ranked_ids, 'original_title'].tolist()

    def filter_movies_by_genre(self, movie_titles: List[str], genres: Set[str]) -> List[str]:
        """Keep the titles that have at least one of the requested genres.

        Args:
            movie_titles: candidate titles, in ranking order.
            genres: genres of interest; an empty or None value disables the filter.

        Returns:
            The matching titles in their original order. Titles that are not in
            the movie table are dropped.
        """
        if not genres:
            return movie_titles

        # One lookup table instead of scanning the whole frame for every title.
        genres_by_title = self._first_value_by_title('genres')
        return [
            title for title in movie_titles
            if any(genre in genres_by_title.get(title, ()) for genre in genres)
        ]

    def filter_movies_by_votes(self, movie_titles: List[str], min_rating: int) -> List[str]:
        """Keep the titles whose weighted rating is at least ``min_rating``.

        The weighted rating follows the IMDb formula
        ``v / (v + m) * R + m / (m + v) * C``, where ``R`` is the movie's
        ``vote_average``, ``v`` its ``vote_count``, ``C`` the mean ``vote_average``
        of the dataset and ``m`` the 10th percentile of ``vote_count``.

        Side effect: the computed rating is stored in ``self.movies['score']``.

        Args:
            movie_titles: candidate titles, in ranking order.
            min_rating: lowest acceptable weighted rating; None disables the filter.

        Returns:
            The qualifying titles in their original order. Titles that are not in
            the movie table are dropped.
        """
        if min_rating is None:
            return movie_titles

        vote_average = self.movies['vote_average']
        vote_count = self.movies['vote_count']
        C = vote_average.mean()
        m = vote_count.quantile(0.1)
        # Column arithmetic: same formula as a per-row computation, evaluated once.
        self.movies['score'] = (
            (vote_count / (vote_count + m) * vote_average) + (m / (m + vote_count) * C)
        )

        score_by_title = self._first_value_by_title('score')
        return [
            title for title in movie_titles
            if title in score_by_title and score_by_title[title] >= min_rating
        ]

    def recommendation(self, omdbapi, selected_movie: str, TOP_K: int, selected_genres:
        Set[str] = None, min_rating: int = None) -> List[tuple]:
        """Recommend up to ``TOP_K`` movies similar to ``selected_movie``.

        Args:
            omdbapi: client exposing ``get_posters(titles)``, expected to return one
                poster (or None) per title in the same order.
            selected_movie: title of the reference movie.
            TOP_K: maximum number of recommendations.
            selected_genres: optional genre restriction; falsy means no restriction.
            min_rating: optional minimum weighted rating; falsy (None or 0) means no restriction.

        Returns:
            A list of ``(title, poster)`` pairs, most similar first.

        Raises:
            ValueError: if ``selected_movie`` is not present in the movie table.
        """
        similar_movies = self.get_similar_movies(selected_movie)

        if selected_genres:
            similar_movies = self.filter_movies_by_genre(similar_movies, selected_genres)

        if min_rating:
            similar_movies = self.filter_movies_by_votes(similar_movies, min_rating)

        recommended_movie_names = similar_movies[:TOP_K]
        recommended_movie_posters = omdbapi.get_posters(recommended_movie_names)
        # Pair each title with its poster by position; a list can be iterated repeatedly.
        return list(zip(recommended_movie_names, recommended_movie_posters))

Part III. Prequel

In [None]:
import pandas as pd


from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Raw TMDB inputs: credits and movie metadata.
df_credits = pd.read_csv('../../datasets/tmdb_5000_credits.csv')
df_movies = pd.read_csv('../../datasets/tmdb_5000_movies.csv')

# The two files are joined on `movie_id`, which the movies file calls `id`.
df_movies = df_movies.rename(columns={'id': 'movie_id'})

df_merged = pd.merge(df_movies, df_credits, on='movie_id')
# Rows missing any of the text fields used below are discarded.
df_merged = df_merged.dropna(subset=['overview', 'genres', 'keywords'])

# Keep released movies only. The explicit copy makes the column assignment below
# act on an independent frame instead of raising SettingWithCopyWarning.
df_filtered = df_merged[df_merged['status'] == 'Released'].copy()

num_movies_remaining = len(df_filtered)

# The text to vectorise is the overview followed directly by the raw keywords field.
df_filtered['overview'] = df_filtered['overview'].fillna('') + df_filtered['keywords']

# Vectorise the text with TF-IDF, ignoring English stop words.
stop_words = list(text.ENGLISH_STOP_WORDS)
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=10000)

tfidf_matrix = tfidf_vectorizer.fit_transform(df_filtered['overview'])

# Pairwise cosine similarity between movies, labelled by movie id on both axes.
df_cosine_sim = pd.DataFrame(
    cosine_similarity(tfidf_matrix),
    index=df_filtered.movie_id,
    columns=df_filtered.movie_id,
)

# Persist the artefacts for the app.
df_cosine_sim.to_csv('../../src/assets/distance.csv')
df_filtered.to_csv('../../src/assets/movies.csv', index=True)


...