In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
import joblib
import os

def load_and_prepare_data(file_path1, file_path2):
    """
    Load the two CSV parts of the dataset and split them into features and labels.

    Args:
        file_path1 (str): Path to the first CSV part.
        file_path2 (str): Path to the second CSV part.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the array built from every column
        whose name starts with ``'desc_embed_'`` and ``y`` is the
        ``'categories'`` column as stripped strings (missing values are
        replaced by ``'Unknown'``).

    Raises:
        ValueError: If the combined data has no ``'desc_embed_'`` column.
        KeyError: If the ``'categories'`` column is missing.
    """
    # low_memory=False lets pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which avoids mixed-type warnings.
    df1 = pd.read_csv(file_path1, low_memory=False)
    df2 = pd.read_csv(file_path2, low_memory=False)

    # Stack the two parts on top of each other with a fresh index
    df = pd.concat([df1, df2], axis=0, ignore_index=True)

    # The embedding columns are recognised by their 'desc_embed_' prefix
    embedding_cols = [col for col in df.columns if col.startswith('desc_embed_')]
    if not embedding_cols:
        raise ValueError("No 'desc_embed_' columns found in the input files.")

    X = df[embedding_cols].values

    # Missing labels become 'Unknown' and everything is cast to str so the
    # label encoder downstream receives uniformly typed values.
    y = df['categories'].fillna('Unknown').astype(str).str.strip()

    return X, y

def train_genre_predictor(model_save_path='../models',
                          file_path1='../data/final_data_part1.csv',
                          file_path2='../data/final_data_part2.csv'):
    """
    Train a model to predict book genres based on synopsis embeddings and save it

    Args:
        model_save_path (str): Directory to save the model files
        file_path1 (str): Path to the first CSV part of the dataset
        file_path2 (str): Path to the second CSV part of the dataset

    Returns:
        tuple: Trained model and label encoder
    """
    # Create models directory if it doesn't exist
    os.makedirs(model_save_path, exist_ok=True)

    # Load and prepare the data
    X, y = load_and_prepare_data(file_path1, file_path2)

    # Encode the genre labels as integers
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # The split is not stratified, so a class may end up only in the test
    # set; model.classes_ can therefore be a subset of the encoder's classes.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    # Train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model; zero_division=0 reports 0.0 for classes that were
    # never predicted instead of emitting one warning per metric.
    y_pred = model.predict(X_test)
    print("\nModel Performance:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Save the model and label encoder side by side
    model_file = os.path.join(model_save_path, 'genre_predictor_model.joblib')
    encoder_file = os.path.join(model_save_path, 'label_encoder.joblib')

    joblib.dump(model, model_file)
    joblib.dump(label_encoder, encoder_file)

    print(f"\nModel saved to: {model_file}")
    print(f"Label encoder saved to: {encoder_file}")

    return model, label_encoder

def load_genre_predictor(model_save_path='../models'):
    """
    Restore a previously saved genre model together with its label encoder.

    Args:
        model_save_path (str): Directory holding the two ``.joblib`` files

    Returns:
        tuple: Loaded model and label encoder

    Raises:
        FileNotFoundError: If either of the two files is absent
    """
    artifact_paths = [
        os.path.join(model_save_path, file_name)
        for file_name in ('genre_predictor_model.joblib', 'label_encoder.joblib')
    ]

    # Both artifacts are required; bail out before attempting to load anything
    if not all(os.path.exists(path) for path in artifact_paths):
        raise FileNotFoundError("Model files not found. Please train the model first.")

    model, label_encoder = [joblib.load(path) for path in artifact_paths]
    return model, label_encoder

def predict_genre(model, label_encoder, synopsis, sentence_transformer, top_k=3):
    """
    Predict the genre of a new book based on its synopsis

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``
        label_encoder: Encoder whose ``inverse_transform`` maps encoded
            labels back to genre names
        synopsis (str): Text of the synopsis
        sentence_transformer: Object whose ``encode`` turns a list of texts
            into embeddings
        top_k (int): Number of most probable genres to return (default 3)

    Returns:
        tuple: The primary predicted genre, and a list of
        ``(genre, probability)`` pairs sorted by decreasing probability
    """
    # Generate embedding for the new synopsis (a batch of one)
    synopsis_embedding = sentence_transformer.encode([synopsis])

    # Make prediction
    genre_encoded = model.predict(synopsis_embedding)
    genre = label_encoder.inverse_transform(genre_encoded)

    # Rank the probability columns from most to least likely
    proba = model.predict_proba(synopsis_embedding)[0]
    top_columns = np.argsort(proba)[::-1][:max(top_k, 0)]

    # predict_proba columns are ordered like model.classes_, which is not the
    # same as the encoded label values when some classes never reached the
    # training set. Translate column positions to labels before decoding.
    classes = getattr(model, 'classes_', None)
    if classes is None:
        top_labels = top_columns
    else:
        top_labels = np.asarray(classes)[top_columns]

    top_genres = label_encoder.inverse_transform(top_labels)
    top_probas = proba[top_columns]

    return genre[0], list(zip(top_genres, top_probas))

# Example usage: train the classifier, then classify one sample synopsis.
# Every name bound below becomes a global that later notebook cells reuse.
if __name__ == "__main__":
    # Train and save the model (uses the default '../models' directory)
    print("Training model...")
    model, label_encoder = train_genre_predictor()

    # Alternative: Load previously trained model
    # model, label_encoder = load_genre_predictor()

    # Load the sentence transformer used to embed the synopsis
    # NOTE(review): presumably the same encoder that produced the
    # 'desc_embed_' columns of the training data -- confirm.
    sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

    # Example prediction
    test_synopsis = """
    In a world where magic is forbidden, a young girl discovers she has
    extraordinary powers. She must learn to control her abilities while
    hiding from those who would persecute her for her gifts.
    """

    predicted_genre, top_predictions = predict_genre(
        model, label_encoder, test_synopsis, sentence_transformer
    )

    # Report the primary genre, then each (genre, probability) pair as a percentage
    print("\nPrediction Results:")
    print(f"Primary predicted genre: {predicted_genre}")
    print("\nTop 3 genre predictions with probabilities:")
    for genre, probability in top_predictions:
        print(f"{genre}: {probability:.2%}")


Training model...


  df2 = pd.read_csv(file_path2)



Model Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00        63
           7       0.00      0.00      0.00       251
           8       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        11
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00        29
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00        21
          15       0.00      0.00      0.00        21
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00        53
          18       0.00      0.00      0.00         8
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Prediction Results:
Primary predicted genre: fiction

Top 3 genre predictions with probabilities:
female friendship: 62.25%
indic fiction (english): 6.17%
biography & autobiography: 2.45%


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
# Reload the saved model and label encoder from '../models' instead of retraining
model, label_encoder = load_genre_predictor('../models')
# Sentence encoder used to embed the synopsis before classification
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
# Free-form sample synopsis
test_synopsis = """
In a world where magic is forbidden
Chris is a young lad who has always been fascinated by magic. He has always wanted to be a magician but his parents have always discouraged him from doing so. They believe that magic is evil and that it is not something that should be practiced. Chris has always been
"""
# Returns the primary genre and the ranked (genre, probability) pairs
predicted_genre, top_predictions = predict_genre(model, label_encoder, test_synopsis, sentence_transformer)



In [None]:
predicted_genre

'fiction'

In [None]:
top_predictions

[('female friendship', 0.6255455368019699),
 ('indic fiction (english)', 0.05977310694324284),
 ('biography & autobiography', 0.021851509435945705)]

**Modèle XGBoostClassifier**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
import pickle
import os

In [None]:
!pip install xgboost
!pip install sentence_transformers




[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


**Fonction : Charger et préparer les données**


In [None]:
def clean_categories(df):
    """
    Normalise the 'categories' column of ``df`` in place.

    Missing values are replaced by the placeholder 'Unknown', every value is
    converted to ``str`` and surrounding whitespace is stripped.

    Args:
        df (pandas.DataFrame): Frame containing a 'categories' column

    Returns:
        pandas.DataFrame: The same ``df`` object, with the column rewritten
    """
    # Fill the gaps first so that NaN does not turn into the string 'nan'
    normalised = df['categories'].fillna('Unknown')
    normalised = normalised.astype(str).str.strip()

    df['categories'] = normalised
    return df


In [None]:
def load_and_prepare_data(file_path1='../data/final_data_part1.csv', file_path2='../data/final_data_part2.csv'):
    """
    Read both CSV parts, clean the labels and return features and labels.

    Args:
        file_path1 (str): Path to the first CSV part
        file_path2 (str): Path to the second CSV part

    Returns:
        tuple: ``(X, y)`` -- the values of every 'desc_embed_' column, and the
        cleaned 'categories' column
    """
    parts = [pd.read_csv(path, low_memory=False) for path in (file_path1, file_path2)]

    # Stack the parts with a fresh index, then clean the 'categories' column
    df = clean_categories(pd.concat(parts, axis=0, ignore_index=True))

    print("Unique values in 'categories':", df['categories'].unique())

    # Feature columns are recognised by their 'desc_embed_' prefix
    feature_names = [name for name in df.columns if name.startswith('desc_embed_')]

    return df[feature_names].values, df['categories']


**Fonction : Entraîner le modèle de prédiction des genres**


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

def train_genre_predictor(model_save_path=None):
    """
    Train an XGBoost genre classifier on the synopsis embeddings.

    Classes with fewer than two samples are dropped BEFORE the labels are
    encoded. The returned label encoder therefore knows exactly the classes
    the model was trained on, and decoding a model prediction with
    ``label_encoder.inverse_transform`` yields the matching genre name.

    Args:
        model_save_path (str | None): When given, the model and the label
            encoder are pickled into this directory as
            'xgb_genre_predictor_model.pkl' and 'label_encoder.pkl'.
            When None (default) nothing is written to disk.

    Returns:
        tuple: Trained model and label encoder
    """
    # Load the data
    print("Loading data...")
    X, y = load_and_prepare_data('../data/final_data_part1.csv', '../data/final_data_part2.csv')

    def remove_rare_classes(X, y, min_samples=2):
        """Drop the samples whose class has fewer than min_samples members."""
        labels = pd.Series(np.asarray(y))
        class_counts = labels.value_counts()
        keep = labels.map(class_counts).ge(min_samples).to_numpy()
        return X[keep], labels[keep].to_numpy()

    # The stratified split below needs at least two samples per class
    X, y = remove_rare_classes(X, y)

    # Encode only the remaining labels: the codes are contiguous (0..k-1) and
    # the encoder stays consistent with what the model predicts.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Check the resulting classes
    print("Remaining classes after reindexing:", np.unique(y_encoded))

    # Split the data, keeping the class proportions in both subsets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    # Initialise and train the model
    model = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        objective='multi:softmax',
        num_class=len(label_encoder.classes_)  # matches the real number of classes
    )

    print("Fitting the model...")
    model.fit(X_train, y_train)

    # Evaluate; zero_division=0 reports 0.0 for classes that were never
    # predicted instead of emitting one warning per metric.
    y_pred = model.predict(X_test)
    print(f"Classification report:\n{classification_report(y_test, y_pred, zero_division=0)}")

    # Optionally persist both artifacts under the file names that
    # load_genre_predictor looks for.
    if model_save_path is not None:
        import os
        import pickle

        os.makedirs(model_save_path, exist_ok=True)
        model_file = os.path.join(model_save_path, 'xgb_genre_predictor_model.pkl')
        encoder_file = os.path.join(model_save_path, 'label_encoder.pkl')
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
        with open(encoder_file, 'wb') as f:
            pickle.dump(label_encoder, f)

    return model, label_encoder


**Fonction : Charger le modèle de prédiction des genres**


In [None]:
def load_genre_predictor(model_save_path='../models'):
    """
    Load the pickled XGBoost genre model and its label encoder.

    Args:
        model_save_path (str): Directory holding the two ``.pkl`` files

    Returns:
        tuple: Loaded model and label encoder

    Raises:
        FileNotFoundError: If either of the two files is absent
    """
    model_file, encoder_file = (
        os.path.join(model_save_path, file_name)
        for file_name in ('xgb_genre_predictor_model.pkl', 'label_encoder.pkl')
    )

    if not os.path.exists(model_file) or not os.path.exists(encoder_file):
        raise FileNotFoundError("Model files not found. Please train the model first.")

    # NOTE: unpickling executes code stored in the file; only load files
    # that come from a trusted source.
    loaded = []
    for path in (model_file, encoder_file):
        with open(path, 'rb') as handle:
            loaded.append(pickle.load(handle))

    model, label_encoder = loaded
    return model, label_encoder

**Fonction : Prédire le genre d'un livre à partir de son synopsis**


In [None]:
def predict_genre(model, label_encoder, synopsis, sentence_transformer, top_k=3):
    """
    Predict the genre of a new book based on its synopsis

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``
        label_encoder: Encoder whose ``inverse_transform`` maps encoded
            labels back to genre names
        synopsis (str): Text of the synopsis
        sentence_transformer: Object whose ``encode`` turns a list of texts
            into embeddings
        top_k (int): Number of most probable genres to return (default 3)

    Returns:
        tuple: The primary predicted genre, and a list of
        ``(genre, probability)`` pairs sorted by decreasing probability
    """
    # Generate the embedding for the given synopsis (a batch of one)
    synopsis_embedding = sentence_transformer.encode([synopsis])

    # Predict the primary genre
    genre_encoded = model.predict(synopsis_embedding)
    genre = label_encoder.inverse_transform(genre_encoded)

    # Rank the probability columns from most to least likely
    proba = model.predict_proba(synopsis_embedding)[0]
    top_columns = np.argsort(proba)[::-1][:max(top_k, 0)]

    # predict_proba columns are ordered like model.classes_, which need not
    # coincide with the encoded label values. Translate column positions to
    # labels before decoding them.
    classes = getattr(model, 'classes_', None)
    if classes is None:
        top_labels = top_columns
    else:
        top_labels = np.asarray(classes)[top_columns]

    top_genres = label_encoder.inverse_transform(top_labels)
    top_probas = proba[top_columns]

    return genre[0], list(zip(top_genres, top_probas))


**Programme principal**

In [None]:
# Entry point: train the XGBoost classifier, then classify one sample synopsis.
if __name__ == "__main__":

    # Train the model and confirm that training finished
    print("Training model...")
    model, label_encoder = train_genre_predictor()
    print("Model trained successfully.")

    # Load the sentence encoder and test with the example synopsis
    sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
    test_synopsis = "In a world where magic is forbidden, a young girl discovers her extraordinary powers."
    predicted_genre, top_predictions = predict_genre(model, label_encoder, test_synopsis, sentence_transformer)

    # Show the primary genre and the ranked (genre, probability) pairs
    print("Predicted genre:", predicted_genre)
    print("Top predictions:", top_predictions)


Loading and preparing data...
Unique values in 'categories': ['fiction' 'english fiction' 'juvenile fiction' 'gambling'
 'biography & autobiography' 'animals, mythical' 'england'
 'young adult fiction' 'mental illness' 'comics & graphic novels'
 'epidemics' "alzheimer's disease" 'dangerously mentally ill' 'history'
 'horror tales' 'book burning' 'health & fitness' 'china'
 'performing arts' 'dent, arthur (fictitious character)'
 'american fiction' 'chocolate' 'adultery' 'british' 'allegories'
 'education' 'experimental fiction' 'true crime' 'airplane crash survival'
 'social science' 'blind' 'arthurian romances' 'provence (france)'
 'juvenile nonfiction' 'humor' 'brothers' 'boston (mass.)' 'death'
 'ryan, jack (fictitious character)' 'vampires' 'domestic fiction'
 'male friendship' 'art' "children's stories" 'dystopias' 'drama'
 'alienation (social psychology)' 'study aids' 'businessmen' 'religion'
 'mentally ill' 'college attendance' 'adventure stories' 'murder'
 'escapes' 'castle roc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       229
           1       0.00      0.00      0.00        16
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00        64
           7       0.00      0.00      0.00       239
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00        17
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00        33
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00        29
          15       0.00      0.00      0.00        16
          16       0.00      0.00      0.00         3
    



Predicted genre: essentialism (philosophy)
Top predictions: [('essentialism (philosophy)', 0.75125855), ('health & fitness', 0.063524604), ('black humor (literature)', 0.025025956)]
