In [87]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import save_npz


In [88]:
# NOTE(review): absolute, machine-specific path; consider making the data
# directory configurable so the notebook runs on other machines.
path = 'C:/Users/remis/Documents/M1/cine5/python/data/movies.db'
conn = sqlite3.connect(path)
try:
    # Pull the whole `movies` table into memory.
    data = pd.read_sql_query("SELECT * FROM movies", conn)
finally:
    # Release the database handle even if the query fails.
    conn.close()

In [89]:
# Empty result table with three columns; A, B and C are filled by later cells.
res_data = pd.DataFrame(columns=list("ABC"))


In [90]:

def load_data_from_db(db_path):
    """Load the movie data from a SQLite database.

    Args:
        db_path: Path of the SQLite database file; it must contain a
            ``movies`` table.

    Returns:
        pandas.DataFrame with every row and column of the ``movies`` table.

    Raises:
        Whatever ``sqlite3.connect`` or ``pandas.read_sql_query`` raise, e.g.
        when the table does not exist. The connection is closed in all cases.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query("SELECT * FROM movies", conn)
    finally:
        # Close the handle even when the query fails, so the file is not
        # left open/locked by a dangling connection.
        conn.close()

def create_text_features(df):
    """Pick the text column backing each recommendation feature.

    Args:
        df: DataFrame providing the columns ``Director``, ``Actors``,
            ``main_genre``, ``side_genre`` and ``plot``.

    Returns:
        dict mapping the feature names ``director``, ``actors``, ``genres``,
        ``side_genre`` and ``plot`` to the corresponding column of ``df``.
    """
    # (feature name, source column) pairs, in the order the dict is built.
    # The "title" column is currently not part of the feature set.
    feature_columns = (
        ('director', "Director"),
        ('actors', "Actors"),
        ('genres', "main_genre"),
        ("side_genre", "side_genre"),
        ('plot', "plot"),
    )
    return {feature: df[column] for feature, column in feature_columns}

def create_tfidf_matrices(text_features):
    """Fit one TF-IDF vectorizer per feature and build the matching matrices.

    Args:
        text_features: Mapping that provides an iterable of text documents
            under each of the keys ``director``, ``actors``, ``genres``,
            ``plot`` and ``side_genre``.

    Returns:
        A ``(tfidf_matrices, vectorizers)`` tuple of two dicts keyed by
        feature name: the TF-IDF matrix of each feature and the fitted
        ``TfidfVectorizer`` that produced it.
    """
    feature_names = ('director', 'actors', 'genres', 'plot', 'side_genre')

    # Each feature gets its own vectorizer, so its vocabulary is independent.
    vectorizers = {
        feature_name: TfidfVectorizer(stop_words='english')
        for feature_name in feature_names
    }

    tfidf_matrices = {
        feature_name: vectorizer.fit_transform(text_features[feature_name])
        for feature_name, vectorizer in vectorizers.items()
    }

    # Return the whole mapping of fitted vectorizers. Returning the loop
    # variable would expose only the last one that was fitted.
    return tfidf_matrices, vectorizers


def compute_similarities(tfidf_matrices, film_position):
    """Compute, per feature, the cosine similarity of one film to all films.

    Args:
        tfidf_matrices: Mapping of feature name to a TF-IDF matrix.
        film_position: Index used to select the query film's entry in each
            matrix (presumably a row position; verify against the caller).

    Returns:
        dict mapping each feature name to the first row of
        ``cosine_similarity(matrix[film_position], matrix)``.
    """
    return {
        name: cosine_similarity(matrix[film_position], matrix)[0]
        for name, matrix in tfidf_matrices.items()
    }


def compute_combined_similarity(similarities, weights):
    """Compute the weighted average of the per-feature similarities.

    Args:
        similarities: Mapping with the keys ``director``, ``actors``,
            ``genres``, ``plot`` and ``side_genre``; each value is a
            similarity score (scalar or array).
        weights: Sequence of five weights in the order
            ``(director, actors, genres, side_genre, plot)``.

    Returns:
        The weighted sum of the five similarities divided by the sum of the
        weights.
    """
    w_director, w_actors, w_genres, w_side_genres, w_plot = weights
    total_weight = sum(weights)

    combined = (
        w_director * similarities['director']
        + w_actors * similarities['actors']
        + w_genres * similarities['genres']
        + w_plot * similarities['plot']
        # The side-genre weight scales its similarity like every other
        # feature; it must not be added as a constant offset.
        + w_side_genres * similarities['side_genre']
    ) / total_weight

    return combined


def compute_rating_bonus(ratings, baseline=6.0, scale=4.0):
    """Compute a non-negative bonus from the rating.

    Args:
        ratings: Rating value(s), scalar or array.
        baseline: Rating at or below which the bonus is 0 (default 6.0).
        scale: Rating span above ``baseline`` that corresponds to a bonus of
            1 (default 4.0).

    Returns:
        ``max(0, (ratings - baseline) / scale)``, computed element-wise.
    """
    return np.maximum(0, (ratings - baseline) / scale)


def compute_year_proximity(years, query_year, max_gap=50):
    """Compute how close each year is to the query year.

    Args:
        years: Year value(s), scalar or array.
        query_year: Year to compare against.
        max_gap: Absolute difference at which the proximity reaches 0
            (default 50).

    Returns:
        ``max(0, 1 - |years - query_year| / max_gap)``, computed
        element-wise: 1 for the same year, decreasing linearly to 0.
    """
    year_diff = np.abs(years - query_year)
    return np.maximum(0, 1 - (year_diff / max_gap))


def compute_total_scores(combined_sim, rating_bonus, year_proximity, w_content, w_rating, w_year):
    """Blend the three score components into a total score.

    Each component is multiplied by its own weight and the three products
    are added: content similarity, rating bonus, then year proximity.
    """
    content_part = w_content * combined_sim
    rating_part = w_rating * rating_bonus
    year_part = w_year * year_proximity
    return content_part + rating_part + year_part


def get_top_recommendations(scores, film_position, n=5):
    """Return the positions of the best-scoring films, query film excluded.

    Args:
        scores: Score of every film.
        film_position: Position of the query film, removed from the result.
        n: Number of positions to keep (default 5).

    Returns:
        list of positions ordered by descending score.
    """
    ranking = np.argsort(scores)[::-1]
    # Remove the query film first, then keep the leading n entries.
    without_query = ranking[ranking != film_position]
    return list(without_query[:n])


def create_recommendations_dataframe(df, filtered_idx, scores):
    """Build the recommendations DataFrame.

    Selects the rows of ``df`` at the positions ``filtered_idx`` and adds a
    ``score`` column holding ``scores[filtered_idx]``. ``df`` itself is not
    modified.
    """
    selected_rows = df.iloc[filtered_idx]
    return selected_rows.assign(score=scores[filtered_idx])


def create_score_components_dataframe(df, filtered_idx, scores):
    """Build the score breakdown DataFrame for the selected films.

    Returns a frame with the columns ``film_id`` (index label of each
    selected row), ``Movie_Title`` and ``score_total`` (the entries of
    ``scores`` at ``filtered_idx``).
    """
    selected_rows = df.iloc[filtered_idx]
    breakdown = {
        'film_id': selected_rows.index,
        'Movie_Title': selected_rows['Movie_Title'].values,
        'score_total': scores[filtered_idx],
    }
    return pd.DataFrame(breakdown)

# Recommendation entry point working on a pre-loaded DataFrame and
# pre-computed TF-IDF matrices.
def recommendation_optimized(film_idx, df, tfidf_matrices, w_director=3.0, w_actors=1.5, 
                            w_genres=5.0, w_side_genres=1.0, w_plot=2.5, 
                            w_content=0.88, w_rating=0.08, w_year=0.04, n=5):
    """Recommend one film per query film without reloading the data.

    For every query film the total score of all films is computed and the
    best-scoring film that is neither a query film nor already recommended
    is selected.

    Args:
        film_idx: Iterable of row positions of the query films. Positions
            refer to rows of ``df`` and of the TF-IDF matrices.
        df: Movie DataFrame with ``rating`` and ``year`` columns.
        tfidf_matrices: Mapping of feature name to TF-IDF matrix, passed on
            to ``compute_similarities``.
        w_director, w_actors, w_genres, w_side_genres, w_plot: Weights of
            the per-feature content similarities.
        w_content, w_rating, w_year: Weights of content similarity, rating
            bonus and year proximity in the total score.
        n: Accepted for interface compatibility; currently has no effect
            (exactly one recommendation is produced per query film).

    Returns:
        The rows of ``df`` for the selected films, in query order.
    """
    content_weights = (w_director, w_actors, w_genres, w_side_genres, w_plot)
    rating_bonus = compute_rating_bonus(df["rating"].to_numpy())
    years = df["year"].to_numpy()

    # Everything below is positional: the TF-IDF rows, the score arrays and
    # the final `iloc` all use row positions, so the exclusion set and the
    # year lookup do as well. Using labels instead would be wrong for any
    # DataFrame whose index is not 0..N-1.
    excluded_positions = {int(position) for position in film_idx}
    best_positions = []

    for position in film_idx:
        similarities = compute_similarities(tfidf_matrices, position)
        combined_sim = compute_combined_similarity(similarities, content_weights)
        year_proximity = compute_year_proximity(years, years[position])
        score = compute_total_scores(combined_sim, rating_bonus, year_proximity, 
                                     w_content, w_rating, w_year)

        # Walk the candidates from best to worst and keep the first one
        # that has not been used yet.
        for candidate in np.argsort(score)[::-1]:
            candidate = int(candidate)
            if candidate not in excluded_positions:
                excluded_positions.add(candidate)
                best_positions.append(candidate)
                break

    return df.iloc[best_positions]

        

In [91]:
# Column A holds the id of every movie; these are the starting films.
res_data['A'] = data['id']
# Display the frame (B and C have not been filled yet).
res_data

Unnamed: 0,A,B,C
0,0,,
1,1,,
2,2,,
3,3,,
4,4,,
...,...,...,...
5223,5223,,
5224,5224,,
5225,5225,,
5226,5226,,


In [92]:
# Load the movie table, select the text features and build one TF-IDF
# matrix per feature; these objects are reused by every recommendation call.
df = load_data_from_db(path)
text_features = create_text_features(df)
# The second return value of create_tfidf_matrices is not needed here.
tfidf_matrices, _ = create_tfidf_matrices(text_features)



In [93]:
# Column B: index of the top recommendation for each film listed in column A.
recommendations = [
    recommendation_optimized([film], df, tfidf_matrices).index[0]
    for film in res_data['A']
]

res_data['B'] = recommendations

In [94]:
# Display the frame after column B has been filled.
res_data

Unnamed: 0,A,B,C
0,0,2,
1,1,45,
2,2,4,
3,3,45,
4,4,2,
...,...,...,...
5223,5223,4898,
5224,5224,4681,
5225,5225,3341,
5226,5226,5040,


In [95]:
# Column C: index of the top recommendation for each film listed in column B.
recommendations = [
    recommendation_optimized([film], df, tfidf_matrices).index[0]
    for film in res_data['B']
]

res_data['C'] = recommendations

In [96]:
# Display the frame after column C has been filled.
res_data

Unnamed: 0,A,B,C
0,0,2,4
1,1,45,1
2,2,4,2
3,3,45,1
4,4,2,4
...,...,...,...
5223,5223,4898,5045
5224,5224,4681,4928
5225,5225,3341,3351
5226,5226,5040,5226


In [97]:
# Count the rows where the recommendation of the recommendation (column C)
# is not the starting film (column A).
is_different = res_data['A'].ne(res_data['C'])
nombre_differences = is_different.sum()
print(f"Nombre de différences : {nombre_differences}")

Nombre de différences : 2794
