In [1]:
import pandas as pd

# Charger les datasets
df1 = pd.read_csv('books_dataset.csv')
df2 = pd.read_csv('Users.csv')
df3 = pd.read_csv('Ratings.csv')



In [2]:
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
import re

# Convertir en minuscules
df1['Book-Title'] = df1['Book-Title'].str.lower()
df1['description'] = df1['description'].str.lower()
df1['categories'] = df1['categories'].str.lower()
df1['Book-Author'] = df1['Book-Author'].str.lower()

df1['description'] = df1['description'].fillna('')
# Supprimer tous les caractères non alphabétiques et non numériques
df1['description'] = df1['description'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
# Supprimer les stopwords
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])

df1['description'] = df1['description'].apply(remove_stopwords)

# Appliquer le stemming
stemmer = PorterStemmer()
def stem_text(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df1['description'] = df1['description'].apply(stem_text)

In [3]:
print(df1.head())

         ISBN              Book-Title       Book-Author  Year-Of-Publication  \
0  0399135782  the kitchen god's wife           amy tan                 1991   
1  0440234743           the testament      john grisham                 1999   
2  0345402871                airframe  michael crichton                 1997   
3  055321215X     pride and prejudice       jane austen                 1983   
4  055321215X     pride and prejudice       jane austen                 1983   

          Publisher         categories  \
0  Putnam Pub Group            fiction   
1              Dell            fiction   
2  Ballantine Books  adventure stories   
3            Bantam          courtship   
4            Bantam            fiction   

                                         description  
0  winni louie age chines woman convinc die soon ...  
1  suicid billionair burntout washington litig wo...  
2  twin jet plane en rout denver hong kong mere g...  
3                                       publis

In [4]:
mean_age = df2['Age'].mean()
df2['Age'] = df2['Age'].fillna(mean_age)
# Nettoyage de la colonne 'Location'
df2['Location'] = df2['Location'].fillna('')  # Remplacer les valeurs manquantes par une chaîne vide
df2['Location'] = df2['Location'].apply(lambda x: re.sub(r'[^a-zA-Z0-9,\s]', '', x))  # Supprimer la ponctuation
df2['Location'] = df2['Location'].str.lower()  # Convertir en minuscules
df2['Location'] = df2['Location'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())  # Supprimer les espaces multiples
df2['Location'] = df2['Location'].apply(lambda x: [loc.strip() for loc in x.split(',')])
df2['Location'] = df2['Location'].apply(lambda x: ' '.join(x))


In [5]:
print(df2.head())

   User-ID                          Location        Age
0        1                  nyc new york usa  34.751434
1        2           stockton california usa  18.000000
2        3     moscow yukon territory russia  34.751434
3        4             porto vngaia portugal  17.000000
4        5  farnborough hants united kingdom  34.751434


In [6]:
data = pd.merge(df3, df1, on='ISBN')  # Fusionner ratings et books
data = pd.merge(data, df2, on='User-ID')  # Fusionner avec users
data = data.dropna(subset=['categories'])

In [7]:
print(data.head())

   User-ID        ISBN  Book-Rating       Book-Title     Book-Author  \
0   276744  038550120X            7  a painted house    john grisham   
1   276746  0425115801            0        lightning  dean r. koontz   
2   276746  0786013990            0      at the edge       david dun   
3   276762  0451167317            0    the dark half    stephen king   
4   276786  8437606322            8    anna karenina     leo tolstoy   

   Year-Of-Publication                 Publisher       categories  \
0                 2001                 Doubleday          fiction   
1                 1996  Berkley Publishing Group          fiction   
2                 2002            Pinnacle Books          fiction   
3                 1994               Signet Book  english fiction   
4                 1999    Ediciones Catedra S.A.          fiction   

                                         description  \
0  racial tension forbidden love affair murder se...   
1  aid crise life miracul intervent stra

In [8]:
!pip install transformers sentence-transformers



In [9]:
!pip install tf-keras



In [10]:
from sentence_transformers import SentenceTransformer

# Charger un modèle pré-entraîné (par exemple, 'all-MiniLM-L6-v2')
model = SentenceTransformer('all-MiniLM-L6-v2')




In [11]:
descriptions = data['description'].tolist()
description_embeddings = model.encode(descriptions)
description_embeddings_df = pd.DataFrame(description_embeddings)
description_embeddings_df.columns = [f'desc_embed_{i}' for i in range(description_embeddings.shape[1])]

# One-Hot Encoding pour les auteurs
author_onehot = pd.get_dummies(data['Book-Author'], prefix='author')

# One-Hot Encoding pour les catégories
categories_onehot = pd.get_dummies(data['categories'], prefix='category')

# One-Hot Encoding pour les éditeurs (optionnel)
publisher_onehot = pd.get_dummies(data['Publisher'], prefix='publisher')

# Combiner les nouvelles colonnes avec le tableau existant
final_data = pd.concat([data, description_embeddings_df, author_onehot, categories_onehot, publisher_onehot], axis=1)

# Sauvegarder le résultat (optionnel)
final_data.to_csv('final_data_with_embeddings.csv', index=False)

# Afficher le résultat
print(final_data.head())

    User-ID        ISBN  Book-Rating       Book-Title     Book-Author  \
0  276744.0  038550120X          7.0  a painted house    john grisham   
1  276746.0  0425115801          0.0        lightning  dean r. koontz   
2  276746.0  0786013990          0.0      at the edge       david dun   
3  276762.0  0451167317          0.0    the dark half    stephen king   
4  276786.0  8437606322          8.0    anna karenina     leo tolstoy   

   Year-Of-Publication                 Publisher       categories  \
0               2001.0                 Doubleday          fiction   
1               1996.0  Berkley Publishing Group          fiction   
2               2002.0            Pinnacle Books          fiction   
3               1994.0               Signet Book  english fiction   
4               1999.0    Ediciones Catedra S.A.          fiction   

                                         description  \
0  racial tension forbidden love affair murder se...   
1  aid crise life miracul interven

In [12]:
print(final_data[[f'desc_embed_{i}' for i in range(10)]].head())  # Affiche les 10 premières colonnes d'embeddings

   desc_embed_0  desc_embed_1  desc_embed_2  desc_embed_3  desc_embed_4  \
0     -0.081907      0.056832     -0.072894      0.003609      0.008474   
1     -0.104678      0.041777      0.026682     -0.006573      0.042539   
2     -0.080322     -0.018269      0.027509     -0.034089     -0.000206   
3     -0.051448     -0.028218     -0.029023      0.067669     -0.075008   
4     -0.033280     -0.009284     -0.039122      0.059777     -0.123746   

   desc_embed_5  desc_embed_6  desc_embed_7  desc_embed_8  desc_embed_9  
0      0.079933      0.002951     -0.071217      0.011958      0.054024  
1      0.113003      0.004491     -0.004804     -0.039285      0.044989  
2      0.032274      0.082414     -0.074306     -0.023927     -0.000848  
3      0.068725      0.036782     -0.020997     -0.045844     -0.016568  
4      0.040082      0.060801      0.062259      0.008706      0.010951  


In [19]:
print(categories_onehot.columns)


Index(['category_abused wives', 'category_accidents', 'category_actors',
       'category_actresses', 'category_adolescence',
       'category_adult education', 'category_adultery',
       'category_adventure stories', 'category_aeneas (legendary character)',
       'category_african american families',
       ...
       'category_television', 'category_travel', 'category_true crime',
       'category_united states', 'category_vampires', 'category_war',
       'category_women terrorists', 'category_world war, 1914-1918',
       'category_young adult fiction', 'category_zero (the number)'],
      dtype='object', length=314)


In [22]:
import pickle

with open('description_embeddings.pkl', 'wb') as f:
    pickle.dump(model, f)
