In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
from sklearn.impute import SimpleImputer
import pickle
import os
from sklearn.linear_model import LogisticRegression




**Fonction : Charger et préparer les données**


In [2]:
def load_and_prepare_data(file_path1, file_path2):
    df1 = pd.read_csv(file_path1, low_memory=False)
    df2 = pd.read_csv(file_path2, low_memory=False)

    df = pd.concat([df1, df2], axis=0, ignore_index=True)

    # Prendre 10% des donnnées pour moins de temps de calcul

    # df = df.sample(frac=0.1, random_state=42)

    def clean_categories(df):
        # Supprimer les valeurs nulles ou NaN
        df['categories'] = df['categories'].fillna('Unknown')
        # Forcer toutes les valeurs de 'categories' en type string
        df['categories'] = df['categories'].astype(str).str.strip()
        return df
    # Nettoyer la colonne 'categories'
    df = clean_categories(df)

    print("Unique values in 'categories':", df['categories'].unique())

    embedding_cols = [col for col in df.columns if col.startswith('desc_embed_')]
    X = df[embedding_cols].values
    y = df['categories']

    imputer = SimpleImputer(strategy="mean")
    X = imputer.fit_transform(X)

    # Encoder les étiquettes
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Enregistrer l'encodeur pour une utilisation ultérieure
    encoder_filename = "../models/label_encoder.pkl"
    with open(encoder_filename, "wb") as f:
        pickle.dump(label_encoder, f)

    # Supprimer les classes rares
    def remove_rare_classes(X, y, min_samples=2):
        """Supprime les classes ayant moins de min_samples."""
        class_counts = pd.Series(y).value_counts()
        rare_classes = class_counts[class_counts < min_samples].index
        mask = ~pd.Series(y).isin(rare_classes)
        return X[mask], y[mask]

    X, y_encoded = remove_rare_classes(X, y_encoded)

    # Réindexer les étiquettes pour les rendre continues
    unique_classes = np.unique(y_encoded)
    remap_classes = {old: new for new, old in enumerate(unique_classes)}
    y_encoded = np.array([remap_classes[label] for label in y_encoded])

    # Vérifier les nouvelles classes
    print("Remaining classes after reindexing:", np.unique(y_encoded))

    return X, y_encoded, label_encoder


**Fonction : Entraîner le modèle de prédiction des genres**


In [3]:
def train_genre_predictor(X, y_encoded, model_choice):
    # Diviser les données
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
    # Choisir un modèle
    if model_choice == "Logistic Reg":
        model = LogisticRegression(
            max_iter=1000, 
            multi_class='ovr'
        )
    else:
        raise ValueError(f"Invalid model choice: {model_choice}")

    print("Fitting the model...")
    model.fit(X_train, y_train)

    # Sauvegarder le modèle
    model_filename = f"../models/{model_choice}.pkl"
    with open(model_filename, "wb") as f:
        pickle.dump(model, f)
    print(f"Model saved as {model_filename}")

    # Évaluer les performances
    y_pred = model.predict(X_test)
    print(f"Classification report:\n{classification_report(y_test, y_pred)}")

    return model


**Fonction : Prédire le genre d'un livre à partir de son synopsis**


In [4]:
def predict_genre(model, label_encoder, synopsis, sentence_transformer):
    """
    Predict the genre of a new book based on its synopsis
    """
    # Generate embedding for the new synopsis
    synopsis_embedding = sentence_transformer.encode([synopsis])

    # Make prediction
    genre_encoded = model.predict(synopsis_embedding)
    genre = label_encoder.inverse_transform(genre_encoded)

    # Get prediction probabilities
    proba = model.predict_proba(synopsis_embedding)
    top_genres_idx = np.argsort(proba[0])[-3:][::-1]  # Get top 3 genres
    top_genres = label_encoder.inverse_transform(top_genres_idx)
    top_probas = proba[0][top_genres_idx]

    return genre[0], list(zip(top_genres, top_probas))


**Fonction : Charger le modèle de prédiction des genres**


In [5]:
def load_genre_predictor(model_save_path):
    # model_save_path est le chemin direct vers le pkl du modèle
    model_file = model_save_path
    encoder_file = "../models/label_encoder.pkl"
    
    if not (os.path.exists(model_file) and os.path.exists(encoder_file)):
        raise FileNotFoundError("Model files not found. Please train the model first.")

    with open(model_file, 'rb') as f:
        model = pickle.load(f)
    with open(encoder_file, 'rb') as f:
        label_encoder = pickle.load(f)

    return model, label_encoder

**Programme principale**

In [6]:
# Charger les données
file_path1 = '../data/final_data_part1.csv'
file_path2 = '../data/final_data_part2.csv'

# Charger les données
print("Loading data...")
X, y_encoded, label_encoder = load_and_prepare_data(file_path1, file_path2)

Loading data...
Unique values in 'categories': ['fiction' 'english fiction' 'juvenile fiction' 'gambling'
 'biography & autobiography' 'animals, mythical' 'england'
 'young adult fiction' 'mental illness' 'comics & graphic novels'
 'epidemics' "alzheimer's disease" 'dangerously mentally ill' 'history'
 'horror tales' 'book burning' 'health & fitness' 'china'
 'performing arts' 'dent, arthur (fictitious character)'
 'american fiction' 'chocolate' 'adultery' 'british' 'allegories'
 'education' 'experimental fiction' 'true crime' 'airplane crash survival'
 'social science' 'blind' 'arthurian romances' 'provence (france)'
 'juvenile nonfiction' 'humor' 'brothers' 'boston (mass.)' 'death'
 'ryan, jack (fictitious character)' 'vampires' 'domestic fiction'
 'male friendship' 'art' "children's stories" 'dystopias' 'drama'
 'alienation (social psychology)' 'study aids' 'businessmen' 'religion'
 'mentally ill' 'college attendance' 'adventure stories' 'murder'
 'escapes' 'castle rock (me. : imagi

In [7]:
if __name__ == "__main__":

    # Choix du model avec une variable
    model_choice = "Logistic Reg"
    
    # Vérification du modèle après entraînement
    print("Training model...")
    model= train_genre_predictor(X, y_encoded, model_choice)
    print("Model trained successfully.")



Training model...
Fitting the model...
Model saved as ../models/Logistic Reg.pkl
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       229
           1       0.00      0.00      0.00        16
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00        64
           7       0.00      0.00      0.00       239
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00        17
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00        33
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00        29
          15       0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
model_save_path = f"../models/{model_choice}.pkl"
model, label_encoder = load_genre_predictor(model_save_path)

# Chargez et testez avec le synopsis d'exemple
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
test_synopsis = "In a world where magic is forbidden, a young girl discovers she has extraordinary powers. She must learn to control her abilities while hiding from those who would persecute her for her gifts."

main_genre, top_predictions = predict_genre(model, label_encoder, test_synopsis, sentence_transformer)

print(f"Genre principal: {main_genre}")
print("Top 3 genres probables :")
for genre, prob in top_predictions:
    print(f"  - Genre: {genre} | Probabilité: {prob:.2f}")



Genre principal: essentialism (philosophy)
Top 3 genres probables :
  - Genre: essentialism (philosophy) | Probabilité: 0.58
  - Genre: health & fitness | Probabilité: 0.05
  - Genre: belgians | Probabilité: 0.03


  attn_output = torch.nn.functional.scaled_dot_product_attention(
