# 01 - Data Exploration: Customer Support Dataset

**Objectif** : explorer et préparer le dataset pour le fine-tuning

In [None]:
# Install dependencies if needed (scikit-learn is required by the train/val/test split cell)
# %pip install datasets pandas scikit-learn

In [1]:
from datasets import load_dataset
import pandas as pd
import json
from pathlib import Path
from collections import Counter

# Root data directory, given relative to the working directory;
# the processed splits are written under DATA_DIR / 'processed'.
DATA_DIR = Path('../data')

## 1. Charger le Dataset Bitext Customer Support

In [2]:
# Bitext customer-support dataset (intent detection), loaded through `datasets.load_dataset`
dataset = load_dataset('bitext/Bitext-customer-support-llm-chatbot-training-dataset')

# Confirmation line followed by the DatasetDict summary (splits, features, row counts)
print('Dataset loaded!', dataset, sep='\n')

README.md: 0.00B [00:00, ?B/s]

Bitext_Sample_Customer_Support_Training_(…):   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Dataset loaded!
DatasetDict({
    train: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response'],
        num_rows: 26872
    })
})


In [3]:
# Convert the train split to a pandas DataFrame for exploration
df = pd.DataFrame(dataset['train'])
print(
    f'Total samples: {len(df)}',
    f'\nColumns: {df.columns.tolist()}',
    sep='\n',
)
# Last expression of the cell: rich display of the first rows
df.head()

Total samples: 26872

Columns: ['flags', 'instruction', 'category', 'intent', 'response']


Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


## 2. Analyser les données

In [4]:
# Number of samples per intent label, most frequent first
intent_counts = df['intent'].value_counts()
print('Intent categories:')
print(intent_counts)

Intent categories:
intent
check_invoice               1000
complaint                   1000
contact_customer_service    1000
edit_account                1000
switch_account              1000
check_payment_methods        999
contact_human_agent          999
delivery_period              999
get_invoice                  999
newsletter_subscription      999
payment_issue                999
registration_problems        999
cancel_order                 998
place_order                  998
track_refund                 998
change_order                 997
check_refund_policy          997
create_account               997
get_refund                   997
review                       997
set_up_shipping_address      997
delete_account               995
delivery_options             995
recover_password             995
track_order                  995
change_shipping_address      973
check_cancellation_fee       950
Name: count, dtype: int64


In [5]:
# Show the first sample of each of the first five intents (in order of appearance)
print('\nExemples par intent:')
for intent_name in df['intent'].unique()[:5]:
    first_match = df.loc[df['intent'] == intent_name].iloc[0]
    user_text = first_match['instruction']
    bot_text = first_match['response']
    print(f'\n--- {intent_name} ---')
    # Texts are cut to 100 characters; the trailing '...' is always printed
    print(f'User: {user_text[:100]}...')
    print(f'Bot: {bot_text[:100]}...')


Exemples par intent:

--- cancel_order ---
User: question about cancelling order {{Order Number}}...
Bot: I've understood you have a question regarding canceling order {{Order Number}}, and I'm here to prov...

--- change_order ---
User: I try to change several bloody items of order {{Order Number}}...
Bot: We understand that you would like to make changes to several items in order number {{Order Number}}....

--- change_shipping_address ---
User: give me information about a delivery address modification...
Bot: I'm happy to help! If you need information on modifying your delivery address, we are here to assist...

--- check_cancellation_fee ---
User: I can't ifnd the bloody termination charge, I need help...
Bot: No worries! I'll assist you in finding the termination charge. Please provide me with your account d...

--- check_invoice ---
User: show me invoice{{Invoice Number}}...
Bot: I understand your need to locate your bill with invoice number {{Invoice Number}}. Let me assist you

In [6]:
# Character length of customer messages and bot responses,
# stored as new '<column>_len' columns on df
for column in ('instruction', 'response'):
    df[f'{column}_len'] = df[column].str.len()

instruction_stats = df['instruction_len'].describe()
response_stats = df['response_len'].describe()

print(
    'Instruction length stats:',
    instruction_stats,
    '\nResponse length stats:',
    response_stats,
    sep='\n',
)

Instruction length stats:
count    26872.000000
mean        46.889513
std         10.897578
min          6.000000
25%         40.000000
50%         48.000000
75%         55.000000
max         92.000000
Name: instruction_len, dtype: float64

Response length stats:
count    26872.000000
mean       634.104495
std        331.593822
min         57.000000
25%        427.000000
50%        540.000000
75%        753.000000
max       2472.000000
Name: response_len, dtype: float64


## 3. Préparer le format d'entraînement

In [7]:
# Conversation format for Mistral
def format_conversation(row):
    """Build one fine-tuning sample from a dataset row.

    Args:
        row: Subscriptable object (e.g. a DataFrame row or a dict) providing
            the keys 'instruction' (customer message) and 'response' (reply).

    Returns:
        dict with a single 'text' key. The text holds the fixed system prompt
        and the customer message wrapped in ``<s>[INST] ... [/INST]``,
        followed by the ``Assistant:`` reply and the closing ``</s>``.
    """
    customer_message = row['instruction']
    assistant_reply = row['response']
    # Adjacent literals are concatenated; "\n\n" reproduces the blank lines
    # that separate the system prompt, the customer turn and the reply.
    conversation = (
        "<s>[INST] You are a helpful customer support assistant. "
        "Answer the customer's question professionally and helpfully.\n\n"
        f"Customer: {customer_message} [/INST]\n\n"
        f"Assistant: {assistant_reply}</s>"
    )
    return {'text': conversation}

# Apply the format to every sample.
# to_dict('records') yields one plain dict per row, which avoids the per-row
# Series objects that DataFrame.iterrows() builds; only the two columns the
# formatter reads are converted.
formatted_data = [
    format_conversation(record)
    for record in df[['instruction', 'response']].to_dict('records')
]

# Example: first 500 characters of the first formatted sample
print('Exemple de format:')
print(formatted_data[0]['text'][:500])

Exemple de format:
<s>[INST] You are a helpful customer support assistant. Answer the customer's question professionally and helpfully.

Customer: question about cancelling order {{Order Number}} [/INST]

Assistant: I've understood you have a question regarding canceling order {{Order Number}}, and I'm here to provide you with the information you need. Please go ahead and ask your question, and I'll do my best to assist you.</s>


In [8]:
# Train / validation / test split (80% / 10% / 10%)
from sklearn.model_selection import train_test_split

# First hold out 20% of the samples, then cut that hold-out in half for
# validation and test. The fixed random_state keeps the split reproducible.
train_data, temp_data = train_test_split(formatted_data, random_state=42, test_size=0.2)
val_data, test_data = train_test_split(temp_data, random_state=42, test_size=0.5)

print('\n'.join([
    f'Train: {len(train_data)}',
    f'Val: {len(val_data)}',
    f'Test: {len(test_data)}',
]))

Train: 21497
Val: 2687
Test: 2688


In [9]:
# Save the three splits as JSON files under DATA_DIR / 'processed'
processed_dir = DATA_DIR / 'processed'
# parents=True also creates DATA_DIR when it does not exist yet (e.g. on a
# fresh checkout); without it mkdir raises FileNotFoundError.
processed_dir.mkdir(parents=True, exist_ok=True)

splits = {'train': train_data, 'val': val_data, 'test': test_data}
for split_name, split_samples in splits.items():
    # Explicit UTF-8 so the files do not depend on the platform default encoding
    with open(processed_dir / f'{split_name}.json', 'w', encoding='utf-8') as f:
        json.dump(split_samples, f, indent=2)

print('Data saved!')
for split_name in splits:
    print(f'  - {processed_dir}/{split_name}.json')

Data saved!
  - ../data/processed/train.json
  - ../data/processed/val.json
  - ../data/processed/test.json


## 4. Stats finales

In [10]:
# Final summary: dataset size, label count, average lengths and split sizes
banner = '=' * 60
total_formatted = len(formatted_data)

print(banner)
print('DATASET SUMMARY')
print(banner)
print(f'Total samples: {len(df)}')
print(f'Intent categories: {df["intent"].nunique()}')
print(f'Avg instruction length: {df["instruction_len"].mean():.0f} chars')
print(f'Avg response length: {df["response_len"].mean():.0f} chars')
print('\nSplit:')
# Each split is reported with its share of all formatted samples
for label, split in (('Train', train_data), ('Val', val_data), ('Test', test_data)):
    print(f'  {label}: {len(split)} ({len(split) / total_formatted * 100:.0f}%)')

DATASET SUMMARY
Total samples: 26872
Intent categories: 27
Avg instruction length: 47 chars
Avg response length: 634 chars

Split:
  Train: 21497 (80%)
  Val: 2687 (10%)
  Test: 2688 (10%)
