In [25]:
import pandas as pd
df_check = pd.read_csv(r"C:\Users\Zilya\Git\recommandation-films-creuse\data\df_ml_ready.csv")
print(df_check.columns.tolist())

['imdb_id', 'title', 'genres', 'overview', 'rating', 'runtime', 'poster_path', 'popularity', 'year', 'numVotes', 'actors', 'producers', 'poster_url', 'genres_text']


In [26]:
import pandas as pd

# 1. Сначала загружаем файл с диска в память
df_ml_ready = pd.read_csv('../data/df_ml_ready.csv')

# 2. Теперь запускаем твою функцию валидации (которую мы написали выше)
df_ml_ready = validate_all_movies(df_ml_ready)

# 3. Проверяем результат
print(f"Данные проверены! В базе: {len(df_ml_ready)} фильмов.")

C:\Users\Zilya\AppData\Local\Temp\ipykernel_8932\2304165042.py:46: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  valid_movies.append(movie_obj.dict())


Данные проверены! В базе: 27109 фильмов.


In [27]:
from pydantic import BaseModel, Field, validator
from typing import Optional, Union
import pandas as pd

class MovieCompleteSchema(BaseModel):
    # Строковые поля
    imdb_id: str
    title: str
    genres: str
    overview: str
    poster_path: Optional[str] = None
    actors: Optional[str] = "Inconnu"
    producers: Optional[str] = "Inconnu"
    poster_url: Optional[str] = None
    genres_text: str
    
    # Числовые поля
    rating: float
    runtime: float
    popularity: float
    year: int
    numVotes: int

    # Валидатор для обработки пустых значений (NaN)
    @validator('overview', 'genres_text', pre=True)
    def handle_empty_strings(cls, v):
        if pd.isna(v) or v == "":
            return "Information non disponible"
        return v

    @validator('rating', 'runtime', 'popularity', 'year', 'numVotes', pre=True)
    def handle_nan_numbers(cls, v):
        if pd.isna(v):
            return 0
        return v

def validate_all_movies(df):
    # Превращаем DataFrame в список словарей
    raw_data = df.to_dict(orient='records')
    valid_movies = []
    
    for item in raw_data:
        try:
            # Проверка каждой строки через Pydantic
            movie_obj = MovieCompleteSchema(**item)
            valid_movies.append(movie_obj.dict())
        except Exception as e:
            # Если данные совсем не подходят, мы просто увидим ошибку, но код не остановится
            continue
            
    return pd.DataFrame(valid_movies)

# Запускаем полную очистку
df_ml_ready = validate_all_movies(df_ml_ready)
print(f"Данные проверены! В базе: {len(df_ml_ready)} фильмов.")

C:\Users\Zilya\AppData\Local\Temp\ipykernel_8932\2304165042.py:25: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  @validator('overview', 'genres_text', pre=True)
C:\Users\Zilya\AppData\Local\Temp\ipykernel_8932\2304165042.py:31: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  @validator('rating', 'runtime', 'popularity', 'year', 'numVotes', pre=True)
C:\Users\Zilya\AppData\Local\Temp\ipykernel_8932\2304165042.py:46: PydanticDepreca

Данные проверены! В базе: 27109 фильмов.


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# 1. Подготовка текста: соединяем жанры и описание
# Мы переводим всё в нижний регистр, чтобы 'Drama' и 'drama' были одним словом
df_ml_ready['metadata'] = (df_ml_ready['genres_text'] + " " + df_ml_ready['overview']).str.lower()

# 2. Настройка TF-IDF
# stop_words='english' уберет мусорные слова (the, a, is), которые не несут смысла
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_ml_ready['metadata'])

# 3. Обучение модели KNN
# Используем 'cosine' (косинусное сходство), так как оно лучше всего работает с текстом
model_knn = NearestNeighbors(n_neighbors=6, metric='euclidean')
model_knn.fit(tfidf_matrix)



0,1,2
,"n_neighbors  n_neighbors: int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries.",6
,"radius  radius: float, default=1.0 Range of parameter space to use by default for :meth:`radius_neighbors` queries.",1.0
,"algorithm  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm  based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.",'auto'
,"leaf_size  leaf_size: int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.",30
,"metric  metric: str or callable, default='minkowski' Metric to use for distance computation. Default is ""minkowski"", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. If metric is ""precomputed"", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only ""nonzero"" elements may be considered neighbors. If metric is a callable function, it takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string.",'euclidean'
,"p  p: float (positive), default=2 Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.",2
,"metric_params  metric_params: dict, default=None Additional keyword arguments for the metric function.",
,"n_jobs  n_jobs: int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",


In [33]:
def recommend_movies(title, df=df_ml_ready, model=model_knn, n_reco=5):
    
    if title not in df['title'].values:
        return f"Le film '{title}' n'est pas dans notre base de données."

    
    idx = df[df['title'] == title].index[0]

    
    distances, indices = model.kneighbors(
        tfidf_matrix[idx], 
        n_neighbors=n_reco + 1
    )

  
    similar_indices = indices[0][1:]

  
    return df.iloc[similar_indices][['title', 'genres_text', 'rating', 'year', 'numVotes']]



In [34]:
# test the function 1
recommend_movies("Ariel")

Unnamed: 0,title,genres_text,rating,year,numVotes
1863,Life,"Comedy, Crime",6.646,1999,60505
19148,They Made Me a Fugitive,"Drama, Thriller, Crime",6.4,1947,2200
21760,And God Said to Cain,"Thriller, Mystery, Western, Action",6.6,1970,1957
8258,25 Years of Innocence,Drama,7.632,2020,1387
25201,Marina,"Romance, Drama, Music",6.9,2013,3976
