In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import warnings
# Install a global "ignore" filter: every warning raised after this point is suppressed.
warnings.filterwarnings('ignore')

In [36]:
# Read the movie dataset CSV into a module-level DataFrame.
# NOTE(review): absolute, machine-specific path — the notebook only runs on a machine that has this file.
data  =  pd.read_csv("C:/Users/remis/Documents/M1/cine5/python/data/DATASETULTIME.csv")


In [37]:
def load_and_prepare_data(filepath):
    """
    Load the movie CSV and clean it for the recommenders.

    Missing cells are replaced by empty strings, then the numeric columns
    are parsed as numbers; anything that cannot be parsed becomes 0.

    Args:
        filepath: Path (or file-like object) of the CSV file

    Returns:
        DataFrame with the cleaned data
    """
    numeric_columns = ('Rating', 'Total_Gross', 'Runtime(Mins)', 'Year')

    # Blank out missing values everywhere first
    movies = pd.read_csv(filepath).fillna('')

    # Then force the numeric columns back to numbers (unparseable -> NaN -> 0)
    for column in numeric_columns:
        movies[column] = pd.to_numeric(movies[column], errors='coerce').fillna(0)

    return movies

In [38]:
def create_bag_of_words(data, include_plot=False, genre_weight=1):
    """
    Build the bag-of-words text for every movie.

    The text is title, director and actors, followed by the main genre
    repeated `genre_weight` times and the side genre repeated
    `max(1, genre_weight - 1)` times. The plot is appended on request.

    Args:
        data: DataFrame of movies
        include_plot: Append the movie plot to the text
        genre_weight: Number of repetitions used to boost the genres

    Returns:
        Series holding the bag of words of each movie
    """
    def as_text(column):
        return data[column].astype(str)

    # Repeating a token raises its term frequency, hence its weight
    main_genre_part = (as_text("main_genre") + " ") * genre_weight
    side_genre_part = (as_text("side_genre") + " ") * max(1, genre_weight - 1)

    bag = as_text("Movie_Title") + " " + as_text("Director") + " " + as_text("Actors") + " "
    bag = bag + main_genre_part + side_genre_part

    if not include_plot:
        return bag

    return bag + " " + as_text("plot")

In [39]:
def normalize_feature(values):
    """
    Min-max normalise a vector of values into the [0, 1] range.

    The input is converted to a float array first, so the result always
    has a float dtype (including the all-equal case) and plain sequences
    are accepted as well as numpy arrays.

    Args:
        values: Array-like of numeric values

    Returns:
        Float numpy array of the same shape. All zeros when every value
        is identical; an empty array when the input is empty.
    """
    arr = np.asarray(values, dtype=float)

    # min()/max() raise on empty arrays; there is nothing to normalise anyway
    if arr.size == 0:
        return arr

    v_min = arr.min()
    span = arr.max() - v_min

    # Constant feature: avoid dividing by zero, it carries no ranking signal
    if span == 0:
        return np.zeros_like(arr)

    return (arr - v_min) / span

In [40]:
def model_content_pure(query_idx, tfidf_matrix, data, top_k=5):
    """
    Model 1: pure content-based (baseline).
    Ranks movies by content similarity only.

    Args:
        query_idx: Index of the query movie
        tfidf_matrix: TF-IDF matrix
        data: DataFrame of movies
        top_k: Number of recommendations

    Returns:
        DataFrame of recommendations with scores
    """
    # Cosine similarity of the query row against every row of the matrix
    similarity_row = cosine_similarity(tfidf_matrix[query_idx], tfidf_matrix)[0]

    # Baseline: the ranking score is the similarity itself
    return _get_top_recommendations(query_idx, similarity_row, similarity_row, data, top_k)

In [41]:
def model_hybrid_rating(query_idx, tfidf_matrix, data, top_k=5, alpha=0.2):
    """
    Model 2: hybrid content + rating.
    Blends content similarity with the normalised rating.

    Args:
        query_idx: Index of the query movie
        tfidf_matrix: TF-IDF matrix
        data: DataFrame of movies
        top_k: Number of recommendations
        alpha: Weight of the rating (0.2 = 20%)

    Returns:
        DataFrame of recommendations with scores
    """
    content_sims = cosine_similarity(tfidf_matrix[query_idx], tfidf_matrix)[0]

    # Ratings rescaled over the whole dataset
    scaled_ratings = normalize_feature(data["Rating"].to_numpy())

    # Weighted blend: similarity gets the remaining (1 - alpha) share
    blended = (1 - alpha) * content_sims + alpha * scaled_ratings

    return _get_top_recommendations(query_idx, blended, content_sims, data, top_k)

In [42]:
def model_hybrid_full(query_idx, tfidf_matrix, data, top_k=5, alpha=0.2, beta=0.1):
    """
    Model 3: hybrid content + rating + box office.
    Blends similarity, rating and popularity (box office).

    Args:
        query_idx: Index of the query movie
        tfidf_matrix: TF-IDF matrix
        data: DataFrame of movies
        top_k: Number of recommendations
        alpha: Weight of the rating (0.2 = 20%)
        beta: Weight of the box office (0.1 = 10%)

    Returns:
        DataFrame of recommendations with scores
    """
    content_sims = cosine_similarity(tfidf_matrix[query_idx], tfidf_matrix)[0]

    # Both auxiliary signals are rescaled over the whole dataset
    scaled_ratings = normalize_feature(data["Rating"].to_numpy())
    scaled_gross = normalize_feature(data["Total_Gross"].to_numpy())

    # Similarity keeps whatever weight is left after rating and box office
    blended = (1 - alpha - beta) * content_sims + alpha * scaled_ratings + beta * scaled_gross

    return _get_top_recommendations(query_idx, blended, content_sims, data, top_k)

In [43]:
def model_knn(query_idx, data, top_k=5):
    """
    Model 4: K-Nearest Neighbors.
    Uses the numeric features plus one-hot encoded main and side genres,
    standardised, with Euclidean distance.

    The query movie is removed from the neighbours by index rather than by
    position, so it is never recommended even when other movies have an
    identical feature vector (distance 0 ties).

    Args:
        query_idx: Positional index of the query movie (negative values
            count from the end)
        data: DataFrame of movies
        top_k: Number of recommendations

    Returns:
        DataFrame of recommendations with scores. The similarity is
        1 - distance / (largest returned distance), so it is relative to
        the returned set and the farthest returned neighbour gets 0.

    Raises:
        IndexError: If query_idx is out of range for `data`.
    """
    result_columns = ["Movie_Title", "Year", "main_genre", "side_genre", "Rating", "Total_Gross"]

    # Resolve negative indices so the comparison with neighbour indices is valid
    query_pos = range(len(data))[query_idx]

    # Numeric features + one-hot encoded genres
    features = pd.concat(
        [
            data[['Year', 'Rating', 'Runtime(Mins)', 'Total_Gross']],
            pd.get_dummies(data['main_genre'], prefix='main'),
            pd.get_dummies(data['side_genre'], prefix='side'),
        ],
        axis=1,
    )

    # Standardise so no single feature dominates the Euclidean distance
    features_scaled = StandardScaler().fit_transform(features)

    # One extra neighbour because the query normally matches itself;
    # capped because NearestNeighbors rejects n_neighbors > n_samples
    n_neighbors = min(top_k + 1, len(data))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(features_scaled)

    distances, indices = knn.kneighbors([features_scaled[query_pos]])
    distances, indices = distances[0], indices[0]

    # Drop the query by index: with ties at distance 0 it is not guaranteed
    # to be the first neighbour returned
    keep = indices != query_pos
    indices = indices[keep][:top_k]
    distances = distances[keep][:top_k]

    # Turn distances into similarity scores (inverse, relative to the max)
    max_dist = distances.max() if distances.size and distances.max() > 0 else 1
    sims = 1 - (distances / max_dist)

    recs = data.iloc[indices][result_columns].copy()
    recs["similarity"] = sims
    recs["score"] = sims

    return recs

In [44]:
def _get_top_recommendations(query_idx, scores, sims, data, top_k):
    """
    Fonction utilitaire pour extraire les top K recommandations.
    
    Args:
        query_idx: Index du film requ√™te
        scores: Scores finaux
        sims: Scores de similarit√©
        data: DataFrame des films
        top_k: Nombre de recommandations
        
    Returns:
        DataFrame des recommandations
    """
    # Trier par score d√©croissant
    sorted_idx = np.argsort(scores)[::-1]
    
    # Filtrer pour exclure le film requ√™te
    filtered_idx = [i for i in sorted_idx if i != query_idx][:top_k]
    
    # Cr√©er le DataFrame de r√©sultats
    recs = data.iloc[filtered_idx][["Movie_Title", "Year", "main_genre", "side_genre", "Rating", "Total_Gross"]].copy()
    recs["similarity"] = sims[filtered_idx]
    recs["score"] = scores[filtered_idx]
    
    return recs

In [45]:
def evaluate_recommendations(recs):
    """
    Score a recommendation list on several metrics.

    Args:
        recs: DataFrame of recommendations

    Returns:
        Dict with the mean rating, mean similarity, mean score, number of
        distinct main genres, standard deviation of the year and mean
        box office of the recommended movies
    """
    return dict(
        avg_rating=recs['Rating'].mean(),
        avg_similarity=recs['similarity'].mean(),
        avg_score=recs['score'].mean(),
        # Count of distinct main genres among the recommendations
        genre_diversity=len(recs['main_genre'].unique()),
        year_std=recs['Year'].std(),
        avg_box_office=recs['Total_Gross'].mean(),
    )

In [46]:
def compare_models(data, test_indices, models_config):
    """
    Compare every configured model on several test movies.

    Each model is called once per test movie. The `query_idx` keyword is
    always set to the current test index, overriding any value stored in
    the model's 'params', so the recommendations match the movie that is
    printed and recorded in the results.

    Args:
        data: DataFrame of movies
        test_indices: List of positional indices of the movies to test
        models_config: Dict mapping a model name to
            {'function': callable, 'params': dict of keyword arguments};
            every function must accept a `query_idx` keyword

    Returns:
        DataFrame with one row of metrics per (test movie, model) pair
    """
    all_results = []
    separator = '=' * 80

    for test_idx in test_indices:
        # Look the test movie up once instead of once per printed field
        film = data.iloc[test_idx]

        print(f"\n{separator}")
        print(f"FILM DE TEST: {film['Movie_Title']} ({film['Year']})")
        print(f"Genre: {film['main_genre']} | Rating: {film['Rating']}")
        print(f"{separator}\n")

        for model_name, config in models_config.items():
            print(f"--- {model_name} ---")

            # Query the model for the movie being tested, not for whatever
            # index happens to be stored in the configuration
            params = {**config['params'], 'query_idx': test_idx}
            recs = config['function'](**params)

            # Evaluate and tag the metrics with their origin
            metrics = evaluate_recommendations(recs)
            metrics['model'] = model_name
            metrics['test_film'] = film['Movie_Title']
            metrics['test_idx'] = test_idx

            all_results.append(metrics)

            # Show the recommendations
            print(recs[['Movie_Title', 'Year', 'main_genre', 'Rating', 'score']].to_string(index=False))
            print(f"\nM√©triques: Rating moy={metrics['avg_rating']:.2f}, "
                  f"Similarit√© moy={metrics['avg_similarity']:.3f}, "
                  f"Diversit√© genres={metrics['genre_diversity']}\n")

    return pd.DataFrame(all_results)

In [47]:
def aggregate_results(results_df):
    """
    Aggregate the results of all tests to compare the models.

    Args:
        results_df: DataFrame of individual results (one row per test/model)

    Returns:
        DataFrame indexed by model with the mean of each metric,
        rounded to 3 decimals
    """
    metric_columns = [
        'avg_rating',
        'avg_similarity',
        'avg_score',
        'genre_diversity',
        'year_std',
        'avg_box_office',
    ]

    # Same aggregation ('mean') for every metric column
    per_model_means = results_df.groupby('model').agg(dict.fromkeys(metric_columns, 'mean'))

    return per_model_means.round(3)

In [48]:
def _build_models_config(test_idx, data, tfidf_simple, tfidf_plot, tfidf_weighted, top_k=5):
    """Build the {model name: {'function', 'params'}} dict for one test movie."""
    def tfidf_params(matrix, **extra):
        return {
            'query_idx': test_idx,
            'tfidf_matrix': matrix,
            'data': data,
            'top_k': top_k,
            **extra,
        }

    return {
        "1_Content_Pure": {
            'function': model_content_pure,
            'params': tfidf_params(tfidf_simple),
        },
        "2_Hybrid_Rating": {
            'function': model_hybrid_rating,
            'params': tfidf_params(tfidf_simple, alpha=0.2),
        },
        "3_Hybrid_Full": {
            'function': model_hybrid_full,
            'params': tfidf_params(tfidf_simple, alpha=0.2, beta=0.1),
        },
        "4_Content_Plot": {
            'function': model_content_pure,
            'params': tfidf_params(tfidf_plot),
        },
        "5_Weighted_Genres": {
            'function': model_hybrid_rating,
            'params': tfidf_params(tfidf_weighted, alpha=0.2),
        },
        "6_KNN": {
            'function': model_knn,
            'params': {'query_idx': test_idx, 'data': data, 'top_k': top_k},
        },
    }


def main(filepath="C:/Users/remis/Documents/M1/cine5/python/data/DATASETULTIME.csv",
         test_indices=(4, 10, 50, 100, 500)):
    """
    Run the complete model comparison.

    Loads the dataset, builds three TF-IDF representations (simple, with
    plot, genre-weighted), runs every model on each test movie, then
    aggregates the metrics per model and writes both tables to CSV files
    in the current working directory.

    The per-movie results are accumulated, so the aggregated table and the
    saved files cover every test movie, not only the last one.

    Args:
        filepath: Path of the dataset CSV
        test_indices: Positional indices of the movies used as queries

    Returns:
        Tuple (results, aggregated): detailed metrics for every
        (test movie, model) pair, and their per-model means

    Raises:
        ValueError: If test_indices is empty.
    """
    test_indices = list(test_indices)
    if len(test_indices) == 0:
        raise ValueError("test_indices must contain at least one index")

    print("Chargement des donn√©es...")
    data = load_and_prepare_data(filepath)

    print(f"Nombre total de films: {len(data)}")
    print(f"Films de test: {len(test_indices)}")

    # Bag of words for the different configurations
    print("\nPr√©paration des bag of words...")
    bow_simple = create_bag_of_words(data, include_plot=False, genre_weight=1)
    bow_with_plot = create_bag_of_words(data, include_plot=True, genre_weight=1)
    bow_weighted = create_bag_of_words(data, include_plot=False, genre_weight=3)

    # TF-IDF vectorisation: one independent vocabulary per configuration
    print("Vectorisation TF-IDF...")

    def vectorize(bag):
        return TfidfVectorizer(max_features=5000, stop_words='english').fit_transform(bag)

    tfidf_simple = vectorize(bow_simple)
    tfidf_plot = vectorize(bow_with_plot)
    tfidf_weighted = vectorize(bow_weighted)

    # One comparison per test movie; keep every result frame
    per_film_results = []
    for test_idx in test_indices:
        models_config = _build_models_config(
            test_idx, data, tfidf_simple, tfidf_plot, tfidf_weighted
        )

        print("\n" + "="*80)
        print("COMPARAISON DES MOD√àLES")
        print("="*80)

        per_film_results.append(compare_models(data, [test_idx], models_config))

    results = pd.concat(per_film_results, ignore_index=True)

    # Aggregate and show the results
    print("\n" + "="*80)
    print("R√âSULTATS AGR√âG√âS (Moyennes sur tous les tests)")
    print("="*80)
    aggregated = aggregate_results(results)
    print(aggregated.to_string())

    # Save the results
    results.to_csv("model_comparison_detailed.csv", index=False)
    aggregated.to_csv("model_comparison_aggregated.csv")

    print("\n‚úÖ Comparaison termin√©e!")
    print("üìä R√©sultats sauvegard√©s dans 'model_comparison_detailed.csv' et 'model_comparison_aggregated.csv'")

    return results, aggregated

In [49]:
# Run the full comparison and bind the detailed and aggregated result DataFrames at notebook scope.
results, aggregated = main()

Chargement des donn√©es...
Nombre total de films: 5540
Films de test: 5

Pr√©paration des bag of words...
Vectorisation TF-IDF...

COMPARAISON DES MOD√àLES

FILM DE TEST: The Lord of the Rings: The Two Towers (2002)
Genre: Action | Rating: 8.8

--- 1_Content_Pure ---
                                      Movie_Title  Year main_genre  Rating    score
    The Lord of the Rings: The Return of the King  2003     Action     9.0 0.922897
The Lord of the Rings: The Fellowship of the Ring  2001     Action     8.8 0.789790
                            The Lord of the Rings  1978  Animation     6.2 0.272297
                                        Apt Pupil  1998      Crime     6.7 0.243883
                                         Daylight  1996     Action     5.9 0.225332

M√©triques: Rating moy=7.32, Similarit√© moy=0.491, Diversit√© genres=3

--- 2_Hybrid_Rating ---
                                      Movie_Title  Year main_genre  Rating    score
    The Lord of the Rings: The Return of the K