In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import joblib

In [4]:
try:
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('punkt')
except:
    pass



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [5]:
# Load data
books = pd.read_csv("books_cleaned.csv")
print("Category distribution:")
print(books["categories"].value_counts().head(12))

Category distribution:
categories
Fiction                      2487
Juvenile Fiction              526
Biography & Autobiography     382
History                       252
Literary Criticism            158
Comics & Graphic Novels       150
Philosophy                    147
Religion                      135
Drama                         120
Juvenile Nonfiction           111
Poetry                         71
Literary Collections           68
Name: count, dtype: int64


In [19]:
books.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'title_and_subtitle', 'tagged_description',
       'cleaned_description', 'word_count', 'avg_word_length',
       'unique_words_ratio', 'sentence_count', 'predicted_category',
       'category_source'],
      dtype='object')

In [34]:
category_mapping = {'Fiction' : "Fiction",
 'Juvenile Fiction': "Children's Fiction",
 'Biography & Autobiography': "Nonfiction",
 'History': "Nonfiction",
 'Literary Criticism': "Nonfiction",
 'Philosophy': "Nonfiction",
 'Religion': "Nonfiction",
 'Comics & Graphic Novels': "Fiction",
 'Drama': "Fiction",
 'Juvenile Nonfiction': "Children's Nonfiction",
 'Science': "Nonfiction",
 'Poetry': "Fiction"}

books["simple_categories"] = books["categories"].map(category_mapping)

In [6]:
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize and remove stopwords
        tokens = text.split()
        tokens = [token for token in tokens if token not in self.stop_words and len(token) > 2]

        # Lemmatize tokens
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        return ' '.join(tokens)

    def extract_features(self, text):
        """Extract stylistic features"""
        if pd.isna(text):
            return {
                'word_count': 0,
                'avg_word_length': 0,
                'unique_words_ratio': 0,
                'sentence_count': 0
            }

        text = str(text)
        words = text.split()
        sentences = text.split('.')

        features = {
            'word_count': len(words),
            'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
            'unique_words_ratio': len(set(words)) / len(words) if words else 0,
            'sentence_count': len([s for s in sentences if len(s.strip()) > 0])
        }

        return features

# Initialize the preprocessor
preprocessor = TextPreprocessor()

# Apply cleaning
print("Cleaning descriptions...")
books['cleaned_description'] = books['description'].apply(preprocessor.clean_text)

# Extract stylistic features
print("Extracting stylistic features...")
style_features = books['description'].apply(preprocessor.extract_features).apply(pd.Series)
books = pd.concat([books, style_features], axis=1)

Cleaning descriptions...
Extracting stylistic features...


In [14]:
class AdvancedFeatureEngineer:
    def __init__(self):
        self.tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
        self.genre_keywords = {
            'fiction': ['novel', 'story', 'tale', 'character', 'plot', 'narrative'],
            'nonfiction': ['history', 'biography', 'research', 'study', 'analysis', 'facts'],
            'science': ['scientific', 'research', 'experiment', 'theory', 'data', 'study'],
            'philosophy': ['philosophy', 'thought', 'theory', 'existential', 'moral', 'ethics'],
            'religion': ['god', 'religious', 'faith', 'spiritual', 'bible', 'prayer'],
            'poetry': ['poem', 'verse', 'rhyme', 'poetic', 'stanza', 'meter'],
            'drama': ['play', 'drama', 'theater', 'act', 'scene', 'dialogue']
        }

    def extract_tfidf_features(self, texts):
        return self.tfidf.fit_transform(texts)

    def extract_keyword_features(self, text):
        """Features based on the presence of genre-specific keywords"""
        text_lower = text.lower()
        features = {}

        for genre, keywords in self.genre_keywords.items():
            features[f'kw_{genre}'] = sum(1 for keyword in keywords if keyword in text_lower)

        return features

    def extract_metadata_features(self, df):
        """Extract metadata-based features"""
        features = pd.DataFrame()

        # Publication year features
        current_year = pd.Timestamp.now().year
        features['years_since_pub'] = current_year - df['published_year']
        features['is_recent'] = (features['years_since_pub'] <= 10).astype(int)

        # Popularity features
        features['rating_scaled'] = df['average_rating'] / 5.0
        features['popularity'] = np.log1p(df['ratings_count'])

        # Length features
        features['pages_scaled'] = df['num_pages'] / df['num_pages'].max()

        return features


# Feature Engineering
print("Feature Engineering...")
feature_engineer = AdvancedFeatureEngineer()

# TF-IDF features
tfidf_features = feature_engineer.extract_tfidf_features(books['cleaned_description'])

# Keyword features
keyword_features = books['cleaned_description'].apply(
    feature_engineer.extract_keyword_features
).apply(pd.Series)

# Metadata features
metadata_features = feature_engineer.extract_metadata_features(books)

# Stylistic features
style_features = books['description'].apply(preprocessor.extract_features).apply(pd.Series)

# Combine all features
all_features = pd.concat([
    pd.DataFrame(tfidf_features.toarray()),
    keyword_features,
    metadata_features,
    style_features
], axis=1)

# 🔥 FIX: Convert all column names to strings
all_features.columns = all_features.columns.astype(str)

# Handle NaN values
all_features = all_features.fillna(0)

print(f"✅ Features shape: {all_features.shape}")
print(f"✅ Column name types: {set(type(col) for col in all_features.columns)}")


Feature Engineering...
✅ Features shape: (6374, 5016)
✅ Column name types: {<class 'str'>}


In [25]:
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from collections import Counter

class BookClassifier:
    def __init__(self):
        self.models = {
            'random_forest': RandomForestClassifier(
                n_estimators=100,
                max_depth=20,
                random_state=42,
                class_weight='balanced'
            ),
            'logistic_regression': LogisticRegression(
                max_iter=1000,
                random_state=42,
                class_weight='balanced'
            ),
            'lightgbm': lgb.LGBMClassifier(
                n_estimators=100,
                max_depth=10,
                learning_rate=0.1,
                num_leaves=31,
                min_child_samples=20,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.1,
                reg_lambda=0.1,
                random_state=42,
                class_weight='balanced',
                n_jobs=-1,
                verbose=-1
            )
        }
        self.label_encoder = LabelEncoder()
        self.feature_selector = None

    def prepare_categories(self, categories_series):
        """Prepare categories with smart grouping and return valid indices."""
        category_mapping = {
            'Fiction': 'Fiction',
            'Juvenile Fiction': "Children's Fiction",
            'Comics & Graphic Novels': 'Fiction',
            'Drama': 'Fiction',
            'Poetry': 'Fiction',
            'Biography & Autobiography': 'Nonfiction',
            'History': 'Nonfiction',
            'Literary Criticism': 'Nonfiction',
            'Philosophy': 'Nonfiction',
            'Religion': 'Nonfiction',
            'Juvenile Nonfiction': "Children's Nonfiction",
            'Science': 'Nonfiction'
        }

        # Apply mapping
        mapped_categories = categories_series.map(category_mapping)

        # Keep only valid (non-NaN) mapped categories and their indices
        mapped_categories_clean = mapped_categories.dropna()
        valid_indices = mapped_categories_clean.index

        # Encode labels (keeps order consistent with mapped_categories_clean)
        encoded_labels = self.label_encoder.fit_transform(mapped_categories_clean)

        return mapped_categories_clean, encoded_labels, valid_indices

    def train_ensemble(self, X, y):
        """Train an ensemble of models"""
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Train all models
        trained_models = {}
        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            trained_models[name] = model

            # Evaluate model
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            print(f"Accuracy of {name}: {accuracy:.4f}")

        self.trained_models = trained_models
        return X_train, X_test, y_train, y_test

    def predict_ensemble(self, X):
        """Predict using majority voting across models"""
        predictions = []

        for name, model in self.trained_models.items():
            pred = model.predict(X)
            predictions.append(pred)

        # Majority vote
        ensemble_pred = np.apply_along_axis(
            lambda x: np.bincount(x).argmax(),
            axis=0,
            arr=np.array(predictions)
        )

        return self.label_encoder.inverse_transform(ensemble_pred)


# Data preparation for training
print("Preparing data...")
classifier = BookClassifier()

# Filter books with known categories
known_categories = books[books['categories'].notna()]

# 🔥 NEW VERSION: Retrieve the 3 returned values
mapped_categories_clean, encoded_labels, valid_indices = classifier.prepare_categories(known_categories['categories'])

# 🔥 FIX: Use valid_indices for the features
X_known = all_features.loc[valid_indices]
y_known = encoded_labels

print(f"✅ Number of training examples: {len(X_known)}")
print(f"✅ Class distribution: {Counter(mapped_categories_clean)}")
print(f"✅ Check - X_known: {len(X_known)}, y_known: {len(y_known)}")

# Ensure the sizes match
assert len(X_known) == len(y_known), f"Inconsistent sizes: X={len(X_known)}, y={len(y_known)}"

Preparing data...
✅ Number of training examples: 4606
✅ Class distribution: Counter({'Fiction': 2828, 'Nonfiction': 1141, "Children's Fiction": 526, "Children's Nonfiction": 111})
✅ Check - X_known: 4606, y_known: 4606


In [26]:
# Train the ensemble model
print("Training the ensemble model...")
X_train, X_test, y_train, y_test = classifier.train_ensemble(X_known, y_known)

# Detailed evaluation
from sklearn.metrics import classification_report

y_pred_ensemble = classifier.predict_ensemble(X_test)
print("\nDetailed classification report:")
print(classification_report(
    classifier.label_encoder.inverse_transform(y_test),
    y_pred_ensemble
))


Training the ensemble model...
Training random_forest...
Accuracy of random_forest: 0.7408
Training logistic_regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of logistic_regression: 0.7223
Training lightgbm...
Accuracy of lightgbm: 0.7549

Detailed classification report:
                       precision    recall  f1-score   support

   Children's Fiction       0.54      0.69      0.60       105
Children's Nonfiction       0.31      0.18      0.23        22
              Fiction       0.83      0.80      0.81       566
           Nonfiction       0.69      0.69      0.69       229

             accuracy                           0.75       922
            macro avg       0.59      0.59      0.58       922
         weighted avg       0.75      0.75      0.75       922



In [27]:
# Prediction for books without categories
unknown_categories = books[books['categories'].isna()]
if len(unknown_categories) > 0:
    print(f"Predicting categories for {len(unknown_categories)} books...")

    # Features for books without categories
    X_unknown = all_features.loc[unknown_categories.index]

    # Prediction
    predicted_categories = classifier.predict_ensemble(X_unknown)

    # Add predictions to the dataset
    books.loc[unknown_categories.index, 'predicted_category'] = predicted_categories
    books.loc[unknown_categories.index, 'category_source'] = 'predicted'
else:
    books['predicted_category'] = None
    books['category_source'] = 'original'

# Create the final category column
# 🔥 CORRECTION : Create the final category column correctly

# 1. First, apply the same category mapping we used in prepare_categories
category_mapping = {
    'Fiction': 'Fiction',
    'Juvenile Fiction': "Children's Fiction",
    'Comics & Graphic Novels': 'Fiction',
    'Drama': 'Fiction',
    'Poetry': 'Fiction',
    'Biography & Autobiography': 'Nonfiction',
    'History': 'Nonfiction',
    'Literary Criticism': 'Nonfiction',
    'Philosophy': 'Nonfiction',
    'Religion': 'Nonfiction',
    'Juvenile Nonfiction': "Children's Nonfiction",
    'Science': 'Nonfiction'
}

# 2. Map the original categories using our mapping dictionary
books['mapped_original_categories'] = books['categories'].map(category_mapping)

# 3. Create final category: use mapped original categories where available, otherwise use predictions
books['final_category'] = books['mapped_original_categories'].fillna(books['predicted_category'])

# 4. Handle any remaining NaN values (safety net)
books['final_category'] = books['final_category'].fillna('Unknown')

# 5. Clean up temporary column
books = books.drop('mapped_original_categories', axis=1)

print("✅ Final category distribution:")
print(books['final_category'].value_counts())

print("\n✅ Category sources:")
print(books['category_source'].value_counts())

Predicting categories for 33 books...
✅ Final category distribution:
final_category
Fiction                  2844
Unknown                  1735
Nonfiction               1151
Children's Fiction        531
Children's Nonfiction     113
Name: count, dtype: int64

✅ Category sources:
category_source
predicted    33
Name: count, dtype: int64


In [28]:
# Save the models
joblib.dump(classifier, 'book_classifier_ensemble.pkl')
joblib.dump(feature_engineer, 'feature_engineer.pkl')

# Save the enriched dataset
output_columns = [
    'isbn13', 'title', 'authors', 'categories', 'final_category',
    'category_source', 'description', 'published_year', 'average_rating',
    'num_pages', 'ratings_count', 'thumbnail'
]

books[output_columns].to_csv("books_classified_enhanced.csv", index=False)

print("✅ Classification completed!")
print(f"📊 Books classified: {len(books)}")


✅ Classification completed!
📊 Books classified: 6374


In [30]:
# Colonnes à inclure dans le CSV final
output_columns = books.columns.tolist()  # Toutes les colonnes originales
if 'final_category' not in output_columns:
    output_columns.append('final_category')  # Juste au cas où

# Sauvegarde du nouveau dataset
books[output_columns].to_csv("books_with_final_category.csv", index=False)

print("✅ New CSV created: 'books_with_final_category.csv'")


✅ New CSV created: 'books_with_final_category.csv'


In [32]:
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,...,title_and_subtitle,tagged_description,cleaned_description,word_count,avg_word_length,unique_words_ratio,sentence_count,predicted_category,category_source,final_category
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,...,Gilead,9780002005883 A NOVEL THAT READERS and critics...,novel reader critic eagerly anticipating decad...,199.0,4.80402,0.633166,7.0,,,Fiction
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,...,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,new christie christmas fulllength novel adapte...,205.0,4.858537,0.682927,8.0,,,Unknown
2,9780006163831,6163831,The One Tree,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,...,The One Tree,9780006163831 Volume Two of Stephen Donaldson'...,volume two stephen donaldsons acclaimed second...,14.0,6.857143,1.0,1.0,,,Unknown
3,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,...,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",memorable mesmerizing heroine jennifer brillia...,57.0,5.315789,0.877193,2.0,,,Fiction
4,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,...,The Four Loves,9780006280897 Lewis' work on the nature of lov...,lewis work nature love divide love four catego...,45.0,5.577778,0.866667,3.0,,,Unknown


In [33]:
category_percent = books['final_category'].value_counts(normalize=True) * 100

print("✅ Distribution of final categories (%):")
print(category_percent)


✅ Distribution of final categories (%):
final_category
Fiction                  44.618764
Unknown                  27.219956
Nonfiction               18.057735
Children's Fiction        8.330719
Children's Nonfiction     1.772827
Name: proportion, dtype: float64
