In [56]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import save_npz


# Load the whole `movies` table into a DataFrame for interactive exploration.
# The try/finally guarantees the SQLite connection is released even when the
# query fails (e.g. the table is missing).
conn = sqlite3.connect("/home/tuanh/M1/projet/cine5_a2ia/python/data/movies.db")
try:
    data = pd.read_sql_query("SELECT * FROM movies", conn)
finally:
    conn.close()
print(data.columns)

Index(['id', 'title', 'year', 'Director', 'Actors', 'main_genre', 'side_genre',
       'rating', 'plot', 'image_filename'],
      dtype='object')


In [57]:
# data = pd.read_csv("/home/tuanh/M1/projet/cine5_a2ia/python/data/movies.csv")
# data["index"] = data.index
# data.to_csv('/home/tuanh/M1/projet/cine5_a2ia/python/data/movies.csv', index=False) 



In [58]:

def load_data_from_db(db_path):
    """Load every row of the ``movies`` table from a SQLite database.

    Args:
        db_path: Path of the SQLite database file to open.

    Returns:
        pandas.DataFrame holding the result of ``SELECT * FROM movies``.

    Raises:
        Whatever ``pandas.read_sql_query`` raises when the query fails; the
        connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query("SELECT * FROM movies", conn)
    finally:
        # Always release the connection, including on query failure.
        conn.close()

def create_text_features(df):
    """Map each content feature name to the DataFrame column that feeds it.

    Args:
        df: DataFrame with the columns "Director", "Actors", "main_genre",
            "side_genre" and "plot".

    Returns:
        dict whose keys are 'director', 'actors', 'genres', 'side_genre' and
        'plot', each holding the corresponding column of ``df``.
    """
    # (feature name, source column) pairs; the title is not part of this set.
    feature_columns = (
        ('director', "Director"),
        ('actors', "Actors"),
        ('genres', "main_genre"),
        ("side_genre", "side_genre"),
        ('plot', "plot"),
    )
    return {feature: df[column] for feature, column in feature_columns}

def create_tfidf_matrices(text_features):
    """Fit one TF-IDF vectorizer per feature and vectorize that feature's texts.

    Args:
        text_features: Mapping from feature name to the texts of that feature.
            Must contain 'director', 'actors', 'genres', 'plot' and
            'side_genre'.

    Returns:
        Tuple ``(tfidf_matrices, vectorizers)``; both are dicts keyed by
        feature name, holding the fitted matrix and the fitted vectorizer.
    """
    feature_names = ('director', 'actors', 'genres', 'plot', 'side_genre')
    vectorizers = {
        name: TfidfVectorizer(stop_words='english') for name in feature_names
    }

    tfidf_matrices = {
        name: vectorizer.fit_transform(text_features[name])
        for name, vectorizer in vectorizers.items()
    }

    # Return the full dict of fitted vectorizers; the previous code returned
    # only the last loop variable, i.e. a single vectorizer.
    return tfidf_matrices, vectorizers


def compute_similarities(tfidf_matrices, film_position):
    """Compute, per feature, the cosine similarity of one film to every film.

    Args:
        tfidf_matrices: Mapping from feature name to a TF-IDF matrix
            (presumably one row per film - confirm with the caller).
        film_position: Row position of the reference film in each matrix.

    Returns:
        dict mapping each feature name to the first row of the
        ``cosine_similarity`` result between the reference row and the matrix.
    """
    return {
        feature_name: cosine_similarity(matrix[film_position], matrix)[0]
        for feature_name, matrix in tfidf_matrices.items()
    }


def compute_combined_similarity(similarities, weights):
    """Compute the weighted average of the per-feature similarities.

    Args:
        similarities: Mapping with the keys 'director', 'actors', 'genres',
            'side_genre' and 'plot'; each value is a similarity (scalar or
            array) for that feature.
        weights: Five weights in the order
            (director, actors, genres, side_genre, plot).

    Returns:
        ``sum(weight_f * similarity_f) / sum(weights)``.
    """
    w_director, w_actors, w_genres, w_side_genres, w_plot = weights
    total_weight = sum(weights)

    combined = (
        w_director * similarities['director'] +
        w_actors * similarities['actors'] +
        w_genres * similarities['genres'] +
        w_plot * similarities['plot'] +
        # The side-genre weight must scale its similarity like every other
        # term; it was previously added to it instead.
        w_side_genres * similarities['side_genre']
    ) / total_weight

    return combined


def compute_rating_bonus(ratings, threshold=6.0, scale=4.0):
    """Compute a non-negative bonus that grows linearly with the rating.

    The bonus is 0 for ratings at or below ``threshold`` and reaches 1.0 at
    ``threshold + scale``; it is not capped above that.

    Args:
        ratings: Rating value(s), scalar or NumPy array.
        threshold: Rating at or below which the bonus is 0 (default 6.0).
        scale: Rating span over which the bonus grows from 0 to 1
            (default 4.0). Must be positive.

    Returns:
        ``max(0, (ratings - threshold) / scale)``, element-wise.

    Raises:
        ValueError: If ``scale`` is not positive.
    """
    if scale <= 0:
        raise ValueError("scale must be positive")
    return np.maximum(0, (ratings - threshold) / scale)


def compute_year_proximity(years, query_year, max_gap=50):
    """Score how close each year is to a reference year.

    Args:
        years: Year value(s), scalar or NumPy array.
        query_year: Reference year to compare against.
        max_gap: Absolute difference in years at which the score reaches 0
            (default 50). Must be positive.

    Returns:
        ``max(0, 1 - |years - query_year| / max_gap)``, element-wise: 1.0 for
        the same year, decreasing linearly to 0 at ``max_gap`` years apart.

    Raises:
        ValueError: If ``max_gap`` is not positive.
    """
    if max_gap <= 0:
        raise ValueError("max_gap must be positive")
    year_diff = np.abs(years - query_year)
    return np.maximum(0, 1 - (year_diff / max_gap))


def compute_total_scores(combined_sim, rating_bonus, year_proximity, w_content, w_rating, w_year):
    """Blend the three score components into a single score.

    Each component is multiplied by its weight and the three products are
    summed: content similarity, rating bonus, then year proximity.
    """
    content_term = w_content * combined_sim
    rating_term = w_rating * rating_bonus
    year_term = w_year * year_proximity
    return content_term + rating_term + year_term


def get_top_recommendations(scores, film_position, n=5):
    """Return the positions of the best-scoring films, excluding the reference.

    Positions are ordered by descending score; ``film_position`` is dropped
    and the first ``n`` remaining positions are returned as a list.
    """
    ranking = np.argsort(scores)[::-1]
    without_reference = ranking[ranking != film_position]
    return list(without_reference[:n])


def create_recommendations_dataframe(df, filtered_idx, scores):
    """Build the recommendations table.

    Selects the rows of ``df`` at the positions in ``filtered_idx`` and
    returns them as a new DataFrame with an added "score" column taken from
    ``scores`` at those same positions. ``df`` itself is not modified.
    """
    selected_rows = df.iloc[filtered_idx]
    return selected_rows.assign(score=scores[filtered_idx])


def create_score_components_dataframe(df, filtered_idx, scores):
    """Build a table listing the total score of each selected film.

    Args:
        df: Films DataFrame; the title is read from the 'Movie_Title' column
            when present, otherwise from 'title'.
        filtered_idx: Row positions of the selected films.
        scores: Array of scores indexed by row position.

    Returns:
        DataFrame with the columns 'film_id' (index labels of the selected
        rows), 'Movie_Title' and 'score_total'.
    """
    selected = df.iloc[filtered_idx]
    # The movies table names this column 'title'; 'Movie_Title' is still
    # accepted so frames using that name keep working.
    title_column = 'Movie_Title' if 'Movie_Title' in selected.columns else 'title'
    return pd.DataFrame({
        'film_id': selected.index,
        'Movie_Title': selected[title_column].values,
        'score_total': scores[filtered_idx],
    })

def recommendation(film_idx,db_path, w_director=3.0, w_actors=1.5, w_genres=5.0,w_side_genres=1.0, 
                   w_plot=2.5, w_content=0.88, w_rating=0.08, w_year=0.04, n=5):
    """
    Recommend one film for each reference film.

    For every reference film, all films are scored with a weighted blend of
    content similarity (TF-IDF cosine similarity per feature), rating bonus
    and release-year proximity. The best-scoring film that is neither a
    reference film nor already recommended is selected.

    Args:
        film_idx: Iterable of row positions of the reference films in the
            ``movies`` table as loaded from ``db_path``.
        db_path: Path of the SQLite database holding the ``movies`` table.
        w_director, w_actors, w_genres, w_side_genres, w_plot: Weights of the
            content features inside the combined similarity.
        w_content, w_rating, w_year: Weights of the combined similarity, the
            rating bonus and the year proximity in the final score.
        n: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame with the recommended rows, in the order of ``film_idx``
        (at most one row per reference film).
    """
    # Preparation
    df = load_data_from_db(db_path)
    text_features = create_text_features(df)
    tfidf_matrices, _ = create_tfidf_matrices(text_features)
    content_weights = (w_director, w_actors, w_genres, w_side_genres, w_plot)

    # These do not depend on the reference film, so compute them once.
    rating_bonus = compute_rating_bonus(df["rating"].to_numpy())
    years = df["year"].to_numpy()

    # Everything below works on row positions, matching the TF-IDF row order,
    # instead of mixing positions with DataFrame index labels.
    film_positions = [int(position) for position in film_idx]
    excluded_positions = set(film_positions)
    best_idx = []

    for position in film_positions:
        similarities = compute_similarities(tfidf_matrices, position)
        combined_sim = compute_combined_similarity(similarities, content_weights)
        year_proximity = compute_year_proximity(years, years[position])

        # Total score
        score = compute_total_scores(combined_sim, rating_bonus, year_proximity,
                                     w_content, w_rating, w_year)

        # Walk candidates from best to worst and keep the first new one.
        for candidate in np.argsort(score)[::-1]:
            candidate = int(candidate)
            if candidate in excluded_positions:
                continue
            excluded_positions.add(candidate)
            best_idx.append(candidate)
            break

    return df.iloc[best_idx]


        

In [59]:
# Row positions of the reference films used by the following cells.
m = [0,4,1287,23,1111]

In [60]:
# Preview the main descriptive columns of the reference films listed in `m`.
print(data.iloc[m][["title","main_genre","side_genre","year","Director","Actors"]])

                                      title main_genre           side_genre  \
0                                   Kantara     Action    Adventure,  Drama   
4     The Lord of the Rings: The Two Towers     Action    Adventure,  Drama   
1287                Jurassic World Dominion     Action   Adventure,  Sci-Fi   
23                         K.G.F: Chapter 2     Action        Crime,  Drama   
1111                             Undisputed     Action        Crime,  Drama   

      year         Director                                             Actors  
0     2022    Rishab Shetty  Rishab Shetty, Sapthami Gowda, Kishore Kumar G...  
4     2002    Peter Jackson  Elijah Wood, Ian McKellen, Viggo Mortensen, Or...  
1287  2022  Colin Trevorrow  Chris Pratt, Bryce Dallas Howard, Laura Dern, ...  
23    2022   Prashanth Neel  Yash, Sanjay Dutt, Raveena Tandon, Srinidhi Sh...  
1111  2002      Walter Hill  Wesley Snipes, Ving Rhames, Peter Falk, Michae...  


In [61]:
# SQLite database from which `recommendation` reloads the movies table.
path = "/home/tuanh/M1/projet/cine5_a2ia/python/data/movies.db"

# Reference films: the row positions chosen in `m`.
films_idx = m

# Last expression of the cell, so the returned DataFrame is displayed
# (at most one recommended film per reference film).
recommendation(films_idx,path) 

Unnamed: 0,id,title,year,Director,Actors,main_genre,side_genre,rating,plot,image_filename
2,2,The Lord of the Rings: The Return of the King,2003,Peter Jackson,"Elijah Wood, Viggo Mortensen, Ian McKellen, Or...",Action,"Adventure, Drama",9.0,Sauron's forces have laid siege to Minas Tirit...,img_2.jpg
5,5,The Lord of the Rings: The Fellowship of the Ring,2001,Peter Jackson,"Elijah Wood, Ian McKellen, Orlando Bloom, Sean...",Action,"Adventure, Drama",8.8,This movie is adapted from the novel of the sa...,img_5.jpg
535,535,Jurassic World,2015,Colin Trevorrow,"Chris Pratt, Bryce Dallas Howard, Ty Simpkins,...",Action,"Adventure, Sci-Fi",6.9,"Jurassic World, InGen's safe and fully operati...",img_565.jpg
37,37,K.G.F: Chapter 1,2018,Prashanth Neel,"Yash, Srinidhi Shetty, Ramachandra Raju, Archa...",Action,"Crime, Drama",8.2,Anand Ingalpai's book El Dorado has been banne...,img_38.jpg
895,895,Last Man Standing,1996,Walter Hill,"Bruce Willis, Bruce Dern, William Sanderson, C...",Action,"Crime, Drama",6.4,John Smith is an amoral gunslinger in the days...,img_940.jpg


In [62]:
def create_text_features(df):
    """Map each content feature name to the DataFrame column that feeds it.

    Redefines the earlier ``create_text_features`` of this notebook and adds
    the 'title' feature.

    Args:
        df: DataFrame with the columns "Director", "Actors", "main_genre",
            "side_genre", "plot" and "title".

    Returns:
        dict whose keys are 'director', 'actors', 'genres', 'side_genre',
        'plot' and 'title', each holding the corresponding column of ``df``.
    """
    feature_columns = (
        ('director', "Director"),
        ('actors', "Actors"),
        ('genres', "main_genre"),
        ("side_genre", "side_genre"),
        ('plot', "plot"),
        ('title', "title"),
    )
    return {feature: df[column] for feature, column in feature_columns}

def create_tfidf_matrices(text_features):
    """Fit one TF-IDF vectorizer per feature and vectorize that feature's texts.

    Redefines the earlier ``create_tfidf_matrices`` of this notebook: a
    'title' feature is added, vectorized on word n-grams of length 3 to 5.

    Args:
        text_features: Mapping from feature name to the texts of that feature.
            Must contain 'director', 'actors', 'genres', 'side_genre', 'plot'
            and 'title'.

    Returns:
        Tuple ``(tfidf_matrices, vectorizers)``; both are dicts keyed by
        feature name.
    """
    plain_features = ('director', 'actors', 'genres', 'side_genre', "plot")
    vectorizers = {
        name: TfidfVectorizer(stop_words='english') for name in plain_features
    }
    # Only the title uses a non-default n-gram range.
    vectorizers['title'] = TfidfVectorizer(stop_words='english', ngram_range=(3, 5))

    tfidf_matrices = {
        name: fitted.fit_transform(text_features[name])
        for name, fitted in vectorizers.items()
    }
    return tfidf_matrices, vectorizers

In [63]:
from scipy.sparse import save_npz
import pickle
import os

# Directory that receives the fitted model artifacts; created if missing.
save_dir = "/home/tuanh/M1/projet/cine5_a2ia/python/train_model"
os.makedirs(save_dir, exist_ok=True)

# Build the TF-IDF matrices and their fitted vectorizers from the full dataset.
# NOTE: despite its singular name, `tfidf_matrix` is a dict of matrices keyed
# by feature name (it is iterated with .items() below).
tfidf_matrix, vectorizers = create_tfidf_matrices(create_text_features(data))
# Bundle matrices and vectorizers so both are stored in a single pickle file.
model_data = {
    "tfidf_matrices": tfidf_matrix,
    "vectorizers": vectorizers
}
# Sanity check: print the shape of each feature matrix.
for feature_name, matrix in tfidf_matrix.items():
    print(feature_name, matrix.shape)
# Persist the bundle to <save_dir>/recommender_model.pkl.
with open(f"{save_dir}/recommender_model.pkl", "wb") as f:
    pickle.dump(model_data, f)




director (5228, 3048)
actors (5228, 8962)
genres (5228, 14)
side_genre (5228, 24)
plot (5228, 37442)
title (5228, 2232)


In [64]:
# Reload the bundle written to <save_dir>/recommender_model.pkl.
# SECURITY: pickle.load can execute arbitrary code - only load files you trust.
with open(f"{save_dir}/recommender_model.pkl", "rb") as f:
    model_data = pickle.load(f)

# Unpack the two dicts stored in the bundle (both keyed by feature name).
tfidf_matrices = model_data["tfidf_matrices"]
vectorizers = model_data["vectorizers"]

# Sanity check: print the shape of each reloaded feature matrix.
for feature_name, matrix in tfidf_matrices.items():
    print(feature_name, matrix.shape)


director (5228, 3048)
actors (5228, 8962)
genres (5228, 14)
side_genre (5228, 24)
plot (5228, 37442)
title (5228, 2232)


In [65]:
def df_to_json(df):
    """Convert a DataFrame into a list of row dictionaries.

    Despite the name, no JSON string is produced: the result is a Python list
    with one dict per row, keyed by column name.
    """
    return df.to_dict(orient="records")


In [78]:
from rapidfuzz import process, fuzz
# Candidate pool: here every row position of `data`.
K = np.arange(len(data))
# Titles of the candidate rows, in the same order as K.
titles_K = data["title"].iloc[K].tolist()
# Fuzzy-match the query against the candidate titles, keeping up to 40 results.
# NOTE(review): each result is indexed below as (match, score, index), which
# is what rapidfuzz documents for list inputs - confirm for the installed version.
fuzzy_matches = process.extract(
            "The Terminator",
            titles_K,
            scorer=fuzz.partial_ratio,
            limit=40
        )

# Keep the positions (relative to titles_K) of matches scoring at least 70.
# The comprehension variable `m` is local to the comprehension and does not
# overwrite the notebook-level list `m`.
cand_rel_idx = [m[2] for m in fuzzy_matches if m[1] >= 70]
# Translate the relative positions back into positions within `data`.
print(K[cand_rel_idx])
print(data.iloc[cand_rel_idx]["title"])

[  66 3754    8  795  940  963  983 1774 4764  645 1223 1233 1336 2296
 3218  281  843 1436 3022 4356 4515  526  937 1231 1375 1393 1686 2492
 2794 3145 3198 3220 3270 3643 3809 4115  501  680  322  541]
66                          The Terminator
3754                          The Terminal
8               Terminator 2: Judgment Day
795                   Terminator Salvation
940                     Terminator Genisys
963     Terminator 3: Rise of the Machines
983                  Terminator: Dark Fate
1774                                The 33
4764                              The Jerk
645                             The A-Team
1223                               The One
1233                               The Net
1336                               The Meg
2296                              The Town
3218                               The Eye
281                                  Thief
843                              The Rover
1436                            The Tuxedo
3022                  