#1. Nettoyage des données

---



In [4]:
import pandas as pd
import os

# Check if the file exists
file_path = "hotels.xlsx"
if not os.path.exists(file_path):
    print(f"Error: File '{file_path}' not found.")
    df_hotels = None
else:
    try:
        xls = pd.ExcelFile(file_path)
        df_hotels = xls.parse('Hotels')
        df_comments = xls.parse('Commentaires')
        df_questions = xls.parse('QuestionReponse')
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        df_hotels = None

In [5]:
# Check data types
print(df_hotels.dtypes)

# Check for missing values
missing_values = df_hotels.isnull().sum()
missing_percentage = (missing_values / len(df_hotels)) * 100
print("\nMissing Values:\n", missing_values)
print("\nMissing Value Percentage:\n", missing_percentage)

Lieu                          object
Nom HOTEL                     object
adresse                       object
Etoile                        object
Prix                           int64
Rate nominal                  object
Rate ordinal                  object
Expériences vécues            object
points fort                   object
Lieux à proximité             object
Restaurants et cafés          object
Plages à proximité            object
Transports en commun          object
Aéroports les plus proches    object
a savoir                      object
Enfants et lits               object
Arrive                        object
depart                        object
restriction d'age             object
Animaux domestiques           object
dtype: object

Missing Values:
 Lieu                          0
Nom HOTEL                     0
adresse                       0
Etoile                        0
Prix                          0
Rate nominal                  0
Rate ordinal                  0
Expé

In [6]:
# Check data types
print(df_comments.dtypes)

# Check for missing values
missing_values = df_comments.isnull().sum()
missing_percentage = (missing_values / len(df_comments)) * 100
print("\nMissing Values:\n", missing_values)
print("\nMissing Value Percentage:\n", missing_percentage)

nom hotel           object
Note                object
Titre               object
Commentaire         object
Date commentaire    object
dtype: object

Missing Values:
 nom hotel            0
Note                17
Titre                0
Commentaire          0
Date commentaire    17
dtype: int64

Missing Value Percentage:
 nom hotel           0.000000
Note                0.458468
Titre               0.000000
Commentaire         0.000000
Date commentaire    0.458468
dtype: float64


In [7]:
# Check data types
print(df_questions.dtypes)

# Check for missing values
missing_values = df_questions.isnull().sum()
missing_percentage = (missing_values / len(df_questions)) * 100
print("\nMissing Values:\n", missing_values)
print("\nMissing Value Percentage:\n", missing_percentage)

nom hotel      object
question       object
answer_text    object
dtype: object

Missing Values:
 nom hotel      0
question       0
answer_text    0
dtype: int64

Missing Value Percentage:
 nom hotel      0.0
question       0.0
answer_text    0.0
dtype: float64


In [8]:
import re
# Function to clean text (remove special characters, accents)
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[éèêë]', 'e', text)
        text = re.sub(r'[àâä]', 'a', text)
        text = re.sub(r'[îï]', 'i', text)
        text = re.sub(r'[ôö]', 'o', text)
        text = re.sub(r'[ùûü]', 'u', text)
        text = re.sub(r'[^a-zA-Z0-9\s,]', '', text)  # Remove special characters
    return text

# Cleaning Hotels sheet
df_hotels = df_hotels.rename(columns=lambda x: clean_text(x))  # Normalize column names
df_hotels['nom hotel'] = df_hotels['nom hotel'].apply(clean_text)
df_hotels['etoile'] = pd.to_numeric(df_hotels['etoile'], errors='coerce')  # Convert stars to numeric
df_hotels['prix'] = pd.to_numeric(df_hotels['prix'], errors='coerce')  # Convert price to numeric

# Cleaning Commentaires sheet
df_comments = df_comments.rename(columns=lambda x: clean_text(x))
df_comments['nom hotel'] = df_comments['nom hotel'].apply(clean_text)
df_comments['note'] = df_comments['note'].astype(str).str.replace(',', '.').astype(float)  # Convert ratings to float

# Cleaning QuestionReponse sheet
df_questions = df_questions.rename(columns=lambda x: clean_text(x))
df_questions['nom hotel'] = df_questions['nom hotel'].apply(clean_text)


#3. Préparation des données pour le ChatBot

##3.1 Convertion vers Json

In [9]:
import json
import numpy as np
# Assign unique hotel IDs
df_hotels['hotel_id'] = np.arange(1, len(df_hotels) + 1)

# Merge comments and questions with hotel data
hotels_json = []
for _, hotel in df_hotels.iterrows():
    hotel_id = hotel['hotel_id']
    hotel_name = hotel['nom hotel']

    # Extract comments for the current hotel
    comments = df_comments[df_comments['nom hotel'] == hotel_name][['titre', 'commentaire', 'note', 'date commentaire']].to_dict(orient='records')

    # Extract questions & answers for the current hotel
    questions = df_questions[df_questions['nom hotel'] == hotel_name][['question', 'answertext']].to_dict(orient='records')

    # Construct hotel JSON object
    hotel_data = {
        "hotel_id": int(hotel_id),
        "name": hotel['nom hotel'],
        "location": hotel['lieu'],
        "address": hotel['adresse'],
        "stars": hotel['etoile'],
        "price": hotel['prix'],
        "rating": hotel['rate nominal'],
        "features": hotel['points fort'],
        "nearby_places": hotel['lieux a proximite'],
        "nearby_beaches": hotel['plages a proximite'],
        "transport": hotel['transports en commun'],
        "airports": hotel['aeroports les plus proches'],
        "policies": {
            "checkin": hotel['arrive'],
            "checkout": hotel['depart'],
            "age_restriction": hotel['restriction dage'],
            "pets": hotel['animaux domestiques'],
            "children_beds": hotel['enfants et lits']
        },
        "additional_info": hotel['a savoir'],
        "comments": comments,
        "faq": questions
    }

    hotels_json.append(hotel_data)

# Save to JSON file
json_output_path = "hotels_data.json"
with open(json_output_path, "w", encoding="utf-8") as json_file:
    json.dump(hotels_json, json_file, indent=4, ensure_ascii=False)

print(f"JSON file saved at: {json_output_path}")

JSON file saved at: hotels_data.json


In [10]:
# Installation des bibliothèques nécessaires
!pip install langchain langchain-community pypdf chromadb -q
!pip install langchain_groq -q
!pip install -U langchain-huggingface -q
!pip install -U langchain-chroma -q
!pip install gradio -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m20.0 MB/s[0m eta [36m0:00

In [11]:
# Importations
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema import Document

import os
import gradio as gr
import json

In [12]:
import json
import chromadb
from langchain_community.embeddings import OpenAIEmbeddings

# Load JSON File (Make sure you have hotels.json)
with open("hotels_data.json", "r", encoding="utf-8") as file:
    hotels_json = json.load(file)

# Initialize ChromaDB Client
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection("hotel_reviews")

# Convert JSON Data into Text for Indexing
documents = []
metadata_list = []

for hotel in hotels_json:
    hotel_name = hotel.get("name", "Unknown Hotel")

    # Process Reviews
    for comment in hotel.get("comments", []):
        doc_text = f"Hotel: {hotel_name}, Review: {comment['commentaire']}"
        documents.append(doc_text)
        metadata_list.append({"hotel": hotel_name, "type": "review"})

    # Process FAQ
    for faq in hotel.get("faq", []):
        doc_text = f"Hotel: {hotel_name}, Question: {faq['question']}, Answer: {faq['answertext']}"
        documents.append(doc_text)
        metadata_list.append({"hotel": hotel_name, "type": "faq"})

# Add Documents to ChromaDB
for i, (doc, metadata) in enumerate(zip(documents, metadata_list)):
    collection.add(documents=[doc], ids=[str(i)], metadatas=[metadata])

print("✅ Data indexed successfully in ChromaDB")

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 35.5MiB/s]


✅ Data indexed successfully in ChromaDB


In [13]:
# Chemin où sauvegarder la base de données
persist_directory = "./chroma_db"

# Chargement du modèle d'embeddings open-source
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Vérifier si une base Chroma existe déjà
if os.path.exists(persist_directory):
    print("Chargement de la base Chroma existante...")
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
else:
    print("Création d'une nouvelle base Chroma...")
    vectorstore = Chroma.from_documents(docs, embedding_function, persist_directory=persist_directory)
    #vectorstore.persist()  # Sauvegarde de la base

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Chargement de la base Chroma existante...


  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)


In [14]:
import google.generativeai as genai

# Configurer le LLM avec l'API Key de Groq
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key="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")

# INITIALISATION DE GEN AI
genai.configure(api_key="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")

def generate_gemini_response(prompt):
    model = genai.GenerativeModel("gemini-pro")
    response = model.generate_content(prompt)
    return response.text if response else "Erreur avec Google, Merci de contacter le support"

In [19]:
# Exemple de requête
query = "Trouvez-moi un hôtel"
results = vectorstore.similarity_search(query)

print(results)

[]


In [16]:
# TEMPLATE DE PROMPT PERSONNALISÉ
prompt_template = PromptTemplate(
    template=(
        "vous etes un assistant expert en tourisme et hôtels en particulier en tunisie"
        #resumer moi l'hotel par les avis
        "Répondez avec des informations précises et pertinentes en vous basant uniquement sur le contexte fourni.\n\n"
        "Contexte: {context}\n"
        "Question du client: {question}\n\n"
        "Réponse détaillée:"
    ),
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)


In [17]:
# INTERFACE GRADIO
def chatbot(question, use_gemini):
    return qa_chain.run(question)

with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Chatbot RAG - Hôtels 🌍")
    with gr.Row():
        input_text = gr.Textbox(label="Posez votre question")
        use_gemini = gr.Checkbox(label="Utiliser Google Gemini ?", value=False)
    output_text = gr.Textbox(label="Réponse")
    submit = gr.Button("Envoyer")
    submit.click(fn=chatbot, inputs=[input_text, use_gemini], outputs=output_text)

demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://36ef00e1799dbc8c6a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


