<a href="https://colab.research.google.com/github/zizoupavon/Deep-Learning---LLM-vs-Fine-Tuned-SLM-with-LoRA/blob/main/Project_Intent_Classification_(2)_With_refinement_to_X_Y_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Fetch the latest version of the Bitext customer-support dataset via kagglehub.
dataset_handle = "bitext/bitext-gen-ai-chatbot-customer-support-dataset"
path = kagglehub.dataset_download(dataset_handle)

# Report the local directory so later cells can locate the files.
print(f"Path to dataset files: {path}")

Using Colab cache for faster access to the 'bitext-gen-ai-chatbot-customer-support-dataset' dataset.
Path to dataset files: /kaggle/input/bitext-gen-ai-chatbot-customer-support-dataset


## Summary:

### Q&A
Yes, the dataset has been successfully loaded into the `df` DataFrame.

### Data Analysis Key Findings
*   The primary CSV file, `Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv`, was successfully identified and loaded into a pandas DataFrame named `df`.
*   The DataFrame `df` contains columns such as 'flags', 'instruction', 'category', 'intent', and 'response', as verified by displaying its head.

### Insights or Next Steps
*   The loaded DataFrame `df` is now ready for further data exploration, cleaning, and analysis to understand the customer support interactions.
*   A logical next step would be to perform a quick data overview, including checking data types, missing values, and unique values in categorical columns.


In [2]:
import pandas as pd
import os

# List files in the dataset directory.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# choice of CSV deterministic across runs and platforms.
files_in_path = sorted(os.listdir(path))
print(f"Files available in the dataset directory: {files_in_path}")

# Pick the first CSV file (extension matched case-insensitively).
csv_file_name = next(
    (file for file in files_in_path if file.lower().endswith('.csv')),
    None,
)

if csv_file_name is None:
    # There is no sensible fallback filename: the directory listing is the
    # only source of truth, so fail loudly with an actionable message rather
    # than guessing a name and letting read_csv raise an opaque error.
    raise FileNotFoundError(
        f"No CSV file found in the dataset directory '{path}'. "
        f"Files present: {files_in_path}"
    )

csv_file_path = os.path.join(path, csv_file_name)

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

print(f"Successfully loaded '{csv_file_name}' into DataFrame 'df'.")

# Display the head of the DataFrame to verify
df.head()

Files available in the dataset directory: ['Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv']
Successfully loaded 'Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv' into DataFrame 'df'.


Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [3]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB


In [4]:
import re

# Regex capturing the name inside a double-brace placeholder such as {{Order Number}}
placeholder_pattern = r"\{\{([^}]+)\}\}"

# Distinct placeholder names seen across all instructions
unique_placeholders = set()

if 'instruction' not in df.columns:
    print("The 'instruction' column does not exist in the DataFrame.")
else:
    # Collect every captured name (whitespace-trimmed) from every instruction
    unique_placeholders.update(
        found_name.strip()
        for instruction_text in df['instruction'].astype(str)
        for found_name in re.findall(placeholder_pattern, instruction_text)
    )

    print("Unique placeholders found:")
    for placeholder in sorted(unique_placeholders):
        print(placeholder)

Unique placeholders found:
Account Category
Account Type
Currency Symbol
Delivery City
Delivery Country
Invoice Number
Order Number
Person Name
Refund Amount


In [5]:
import re

# Synthetic stand-ins for each {{placeholder}} name, given as (placeholder, value) pairs.
synthetic_values = dict((
    ("Order Number", "ORD-2025-12345"),
    ("Person Name", "Maria Garcia"),
    # The next three names are assumed placeholders (not confirmed by the discovery step)
    ("email", "alex.smith@example.com"),
    ("product_name", "Wireless Mouse"),
    ("date", "05/09/2025"),
    # The remaining names come from the placeholders found in the 'instruction' column
    ("Refund Amount", "$50.00"),
    ("Account Category", "Savings"),
    ("Account Type", "Checking"),
    ("Currency Symbol", "$"),
    ("Delivery City", "New York"),
    ("Delivery Country", "USA"),
    ("Invoice Number", "INV-2025-54321"),
))

# Work on a copy so the originally loaded frame `df` is left untouched
df_synthetic = df.copy()

# Function to replace placeholders
def replace_placeholders(text, values=None):
    """Replace ``{{Placeholder}}`` tokens in ``text`` with synthetic values.

    Matching is case-insensitive and tolerates whitespace padding inside the
    braces (``{{ Order Number }}``). Placeholders with no entry in the mapping
    are left untouched.

    Args:
        text (str): The instruction text that may contain placeholders.
        values (dict | None): Optional mapping of placeholder name to
            replacement string. Defaults to the module-level
            ``synthetic_values`` mapping.

    Returns:
        str: ``text`` with every known placeholder substituted.
    """
    mapping = synthetic_values if values is None else values
    # Normalise keys once so lookups are case- and padding-insensitive.
    lookup = {name.strip().lower(): replacement for name, replacement in mapping.items()}

    def _substitute(match):
        # A callable replacement is inserted literally, so backslashes or
        # group references inside a value are never interpreted by `re`.
        return lookup.get(match.group(1).strip().lower(), match.group(0))

    # One pass over the text handles all placeholders at once.
    return re.sub(r'\{\{\s*([^{}]+?)\s*\}\}', _substitute, text)

# Build the placeholder-filled column, or report that the source column is missing.
if 'instruction' not in df_synthetic.columns:
    print("The 'instruction' column does not exist in the DataFrame.")
else:
    df_synthetic['instruction_synthetic'] = df_synthetic['instruction'].apply(replace_placeholders)
    print("Instructions with synthetic values generated successfully. Displaying original and synthetic instructions for verification:")
    # Show original and rewritten text side by side for a quick visual check
    display(df_synthetic[['instruction', 'instruction_synthetic']].head())

Instructions with synthetic values generated successfully. Displaying original and synthetic instructions for verification:


Unnamed: 0,instruction,instruction_synthetic
0,question about cancelling order {{Order Number}},question about cancelling order ORD-2025-12345
1,i have a question about cancelling oorder {{Or...,i have a question about cancelling oorder ORD-...
2,i need help cancelling puchase {{Order Number}},i need help cancelling puchase ORD-2025-12345
3,I need to cancel purchase {{Order Number}},I need to cancel purchase ORD-2025-12345
4,"I cannot afford this order, cancel purchase {{...","I cannot afford this order, cancel purchase OR..."


In [6]:
# Summarize the working copy; it should now include the 'instruction_synthetic' column.
df_synthetic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   flags                  26872 non-null  object
 1   instruction            26872 non-null  object
 2   category               26872 non-null  object
 3   intent                 26872 non-null  object
 4   response               26872 non-null  object
 5   instruction_synthetic  26872 non-null  object
dtypes: object(6)
memory usage: 1.2+ MB


In [7]:
# Spot-check a seeded random sample of original vs. placeholder-filled instructions.
if 'instruction_synthetic' not in df_synthetic.columns:
    print("The 'instruction_synthetic' column does not exist in the DataFrame. Please ensure the placeholder replacement step was executed correctly.")
else:
    print("Sampling 20 processed rows for verification:")
    display(
        df_synthetic[['instruction', 'instruction_synthetic']].sample(n=20, random_state=42)
    )

Sampling 20 processed rows for verification:


Unnamed: 0,instruction,instruction_synthetic
9329,I can't talk with a human agent,I can't talk with a human agent
4160,I have got to locate hte bills from {{Person N...,I have got to locate hte bills from Maria Garcia
18500,"I cannot pay, help me to inform of a problem w...","I cannot pay, help me to inform of a problem w..."
8840,I want help speaking to customer service,I want help speaking to customer service
5098,I try to see th accepted payment options,I try to see th accepted payment options
17250,where to sign up to the company nmewsletter,where to sign up to the company nmewsletter
3589,I'd like to see the withdrwaal fee how can i d...,I'd like to see the withdrwaal fee how can i d...
9043,I want to speak with someone,I want to speak with someone
15800,can you help me getting bill #85632?,can you help me getting bill #85632?
4384,I don't know how to take a quick look at invoi...,I don't know how to take a quick look at invoi...


In [12]:
from sklearn.model_selection import train_test_split

# Keep only the placeholder-filled text; the raw 'instruction' column is dropped if present.
if 'instruction' in df_synthetic.columns:
    df_synthetic = df_synthetic.drop(columns=['instruction'])

# Features (X): every remaining column except the target and the response text.
X = df_synthetic.drop(columns=['response', 'intent'])
# Target (y): the intent label.
y = df_synthetic['intent']

# Stage 1: hold out 20% of the rows, stratified by intent.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: split the held-out 20% in half, giving roughly 10% validation
# and 10% test relative to the full dataset.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Report the size of each partition.
for split_name, split_features in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set size: {len(split_features)} samples")

# Show a few training rows as an example of the feature layout.
print("\nHead of X_train:")
display(X_train.head())

Training set size: 21497 samples
Validation set size: 2687 samples
Test set size: 2688 samples

Head of X_train:


Unnamed: 0,flags,category,instruction_synthetic
18407,BL,PAYMENT,I want assistance informing of a trouble with ...
4878,BKL,INVOICE,see bill #12588
26858,BCL,REFUND,"I want to see my refund current status, help me"
25464,BKL,ORDER,see current status of order ORD-2025-12345
12237,BCIL,DELIVERY,"I need to check the options for shipping, how ..."


In [10]:
# The 'combined_instruction' column is not used for the current zero-shot LLM inference,
# and to avoid any potential confusion or perceived data leakage, this step is skipped.
# If a combined instruction is needed for future steps (e.g., fine-tuning transformers),
# it can be re-introduced with careful consideration of its components.
# For the current zero-shot LLM approach, we directly use 'category' and 'instruction_synthetic'.

Project goal:
Build, evaluate, and compare two modeling pathways for intent classification —
LLM-based zero/few-shot inference and fine-tuned transformers; both using uniform
evaluation metrics (Accuracy, Macro-F1, and Cost).
Your goal is to determine which pathway offers the best balance of performance, scalability,
and business value.

Approach 1: Zero- or Few-Shot via Large Language Models

Since your API key is already securely set up in secrets, we can proceed directly with sampling the data. I'll sample up to 10 examples from each unique intent class from your X_test and y_test datasets. This sampled data will be stored in X_test_sampled and y_test_sampled for all your LLM experiments.

Sample Test Data for LLM Experiments: Sample up to 10 examples from each unique intent class from the X_test and y_test datasets, storing them in X_test_sampled and y_test_sampled respectively. This ensures a controlled and representative dataset for LLM experiments while managing API costs.

Define Zero-Shot Prompt Structure: Create a Python function to construct the zero-shot prompt for the LLM. This prompt will use the `category` and `instruction_synthetic` fields from the sampled data (the earlier `combined_instruction` column is no longer built) and explicitly instruct the LLM to classify the intent, listing the available intent classes to guide its response.

Implement LLM Inference Function: Provide a placeholder or example function that demonstrates how to make an LLM API call (e.g., using OpenAI's API client). This function will take a prompt and return the LLM's response. It will also include guidance for the user to insert their actual API key and handle the specific API integration for their chosen LLM, emphasizing careful usage to manage costs.

Run LLM Inference and Collect Predictions: Iterate through the sampled test data, generate a zero-shot prompt for each entry, call the LLM inference function to get a prediction, and collect all predicted intents. This step will also include parsing the LLM's response to extract the intent label.

Evaluate LLM Performance: Calculate and display the Accuracy and Macro-F1 scores by comparing the collected LLM predictions against the true intent labels from the sampled test data. This will provide the initial performance metrics for the zero-shot approach.

Final Task: Summarize the results of the zero-shot LLM intent classification and prepare for the next steps, potentially exploring few-shot examples or moving to the fine-tuned transformer approach.



# Task
Prepare the LLM environment, sample 10 examples from each unique intent class from `X_test` and `y_test` for LLM experiments, and then implement a zero-shot intent classification workflow. This workflow should include defining a prompt structure using `category` and `instruction_synthetic`, developing a function for LLM inference, running the inference to collect predictions, and finally evaluating the LLM's performance using Accuracy and Macro-F1 scores.

## Prepare LLM Environment and Sample Data

### Subtask:
Guide the user to set up their chosen LLM API key (e.g., as an environment variable) and then sample 10 examples from each unique intent class from the `X_test` and `y_test` datasets. This sampled data will be used for all LLM experiments to control costs.


### Set up LLM API Key

Before proceeding with LLM experiments, you'll need to set up your chosen LLM API key. This is typically done by setting it as an environment variable to keep your sensitive information secure and out of your code.

**Instructions:**

1.  **For local development or shell environment:** You can set an environment variable using `export` in your terminal:
    ```bash
    export OPENAI_API_KEY='your_openai_api_key_here'
    # Or for other models, e.g., Google's PaLM:
    export GOOGLE_API_KEY='your_google_api_key_here'
    ```
    Replace `'your_openai_api_key_here'` or `'your_google_api_key_here'` with your actual API key.

2.  **Within a Python script or Colab notebook:** You can set the environment variable programmatically using the `os` module. It's good practice to load this from a secret management service or, for simpler cases, directly within your code (though be cautious not to commit your actual keys).
    ```python
    import os
    # Example for OpenAI
    os.environ['OPENAI_API_KEY'] = 'your_openai_api_key_here'

    # Example for Google's PaLM
    # os.environ['GOOGLE_API_KEY'] = 'your_google_api_key_here'
    ```
    Remember to replace the placeholder with your actual API key.

**Note:** If you are using a service like Google Colab, you might use the 'Secrets' tab to securely store and access your API keys, rather than hardcoding them in the notebook.

**Reasoning**:
Now that the instructions for setting up the API key are provided, the next step is to implement the sampling logic. This involves identifying unique intents, sampling up to 10 examples per intent from the test sets, and combining them into new sampled DataFrames for features and labels.



# Task
```python
import pandas as pd

# Ensure 'intent' column is available in X_test for sampling
if 'intent' not in X_test.columns:
    print("Error: 'intent' column not found in X_test. Please ensure it's present for stratified sampling.")
else:
    # Get unique intent classes
    unique_intents = X_test['intent'].unique()
    print(f"Found {len(unique_intents)} unique intent classes.")

    X_test_sampled_list = []
    y_test_sampled_list = []

    # Iterate through each unique intent to sample examples
    for intent_class in unique_intents:
        # Filter X_test and y_test for the current intent class
        X_intent = X_test[X_test['intent'] == intent_class]
        y_intent = y_test[X_test['intent'] == intent_class] # y_test should align with X_test indices

        # Sample up to 10 examples from the current intent class
        # Use min(len(X_intent), 10) to handle classes with fewer than 10 examples
        if not X_intent.empty:
            sampled_indices = X_intent.sample(n=min(len(X_intent), 10), random_state=42).index
            X_test_sampled_list.append(X_test.loc[sampled_indices])
            y_test_sampled_list.append(y_test.loc[sampled_indices])

    # Concatenate all sampled data into new DataFrames
    X_test_sampled = pd.concat(X_test_sampled_list)
    y_test_sampled = pd.concat(y_test_sampled_list)

    print(f"\nTotal sampled examples: {len(X_test_sampled)}")
    print(f"Number of unique intents in sampled data: {X_test_sampled['intent'].nunique()}")

    print("\nHead of X_test_sampled:")
    display(X_test_sampled.head())

    print("\nValue counts for 'intent' in X_test_sampled to verify distribution:")
    display(X_test_sampled['intent'].value_counts())

    print("\nHead of y_test_sampled:")
    display(y_test_sampled.head())
```

Sample Test Data for LLM Experiments: Sample up to 10 examples from each unique intent class from the X_test and y_test datasets, storing them in X_test_sampled and y_test_sampled respectively. This ensures a controlled and representative dataset for LLM experiments while managing API costs.

We removed 'intent' and 'response' from X. Should 'category' also be removed, or would keeping it be data leakage? What is data leakage? Data leakage occurs when information from outside the training dataset is used to create the model, leading to overly optimistic performance estimates. In classification tasks, this often happens when a feature directly or indirectly reveals the target variable.

The 'category' Column in our X Features: Currently, the X features include flags, category, and instruction_synthetic; of these, category and instruction_synthetic are inserted into the LLM prompt. So, yes, the category column is being fed to the LLM as part of the input prompt.

Does 'category' cause leakage here? It could lead to a form of leakage if the 'category' is a direct, one-to-one mapping with the 'intent'. For example, if there's only one intent per category, then knowing the category directly tells you the intent.

However, in many real-world scenarios (and seemingly in this dataset based on the examples), 'category' provides broader context (e.g., 'ORDER' category can contain intents like cancel_order, track_order, change_order). In such cases, category acts as a helpful contextual feature that allows the LLM to narrow down the possibilities without explicitly revealing the exact intent. When building a prompt for an LLM, providing relevant contextual information like the category is often beneficial for its reasoning.

For the purpose of evaluating the zero-shot LLM's ability to classify intent given this context, including 'category' in the prompt is a reasonable approach to provide the LLM with sufficient information to make its prediction. If we were building a traditional machine learning model from scratch and trying to predict intent purely from the instruction_synthetic without any pre-categorization, we would need to carefully analyze the correlation between 'category' and 'intent' to decide if 'category' should be an independent feature or if it leads to excessive leakage.

For now, we'll continue using category as a contextual element in the LLM prompt, as it is a common practice to provide LLMs with as much relevant input as possible to guide their zero-shot reasoning.



In [16]:
import pandas as pd

# 'intent' lives in y_test (the target), not in X_test, so rows are selected
# through y_test and the chosen index labels are then used to slice X_test.

# Distinct intent labels present in the test target
unique_intents = y_test.unique()
print(f"Found {len(unique_intents)} unique intent classes.")

X_test_sampled_list = []
y_test_sampled_list = []

# Draw up to 10 rows per intent class
for intent_class in unique_intents:
    # Index labels of the test rows belonging to this class
    intent_indices = y_test.loc[y_test == intent_class].index
    if intent_indices.empty:
        continue

    # Never request more rows than the class actually has
    rows_to_draw = min(len(intent_indices), 10)
    sampled_indices = intent_indices.to_series().sample(n=rows_to_draw, random_state=42).index

    X_test_sampled_list.append(X_test.loc[sampled_indices])
    y_test_sampled_list.append(y_test.loc[sampled_indices])

# Stitch the per-class samples back together
X_test_sampled = pd.concat(X_test_sampled_list)
y_test_sampled = pd.concat(y_test_sampled_list)

print(f"\nTotal sampled examples: {len(X_test_sampled)}")
print(f"Number of unique intents in sampled data: {y_test_sampled.nunique()}")

print("\nHead of X_test_sampled:")
display(X_test_sampled.head())

print("\nValue counts for 'intent' in y_test_sampled to verify distribution:")
display(y_test_sampled.value_counts())

print("\nHead of y_test_sampled:")
display(y_test_sampled.head())

Found 27 unique intent classes.

Total sampled examples: 270
Number of unique intents in sampled data: 27

Head of X_test_sampled:


Unnamed: 0,flags,category,instruction_synthetic
1267,BLQZ,ORDER,i have got to swap an item of order ORD-2025-...
1179,BIL,ORDER,how do I correct purchase ORD-2025-12345?
1310,BILZ,ORDER,canm you help me to update purchase ORD-2025-1...
1048,BLMQ,ORDER,i try to switch several items of order ORD-202...
1950,BL,ORDER,I need information about correcting purchase O...



Value counts for 'intent' in y_test_sampled to verify distribution:


Unnamed: 0_level_0,count
intent,Unnamed: 1_level_1
change_order,10
newsletter_subscription,10
review,10
track_refund,10
delivery_options,10
create_account,10
delivery_period,10
contact_human_agent,10
payment_issue,10
complaint,10



Head of y_test_sampled:


Unnamed: 0,intent
1267,change_order
1179,change_order
1310,change_order
1048,change_order
1950,change_order


Unique Intents Found: We identified all 27 unique intent classes from your y_test dataset.
Total Sampled Examples: A total of 270 examples have been sampled.
Stratified Sampling: As intended, the value_counts for y_test_sampled confirms that we have exactly 10 samples for each of the 27 unique intent classes, ensuring a balanced representation for LLM evaluation.
This means our sampled dataset is now perfectly prepared for the next step: defining the zero-shot prompt structure for the LLM!


Let's redefine the zero-shot prompt structure. This updated prompt will explicitly instruct the LLM to identify the intent from the customer query and provide the list of possible intent categories it should choose from. After the modification, I will show an example of the generated prompt.

In [17]:
import pandas as pd

# Collect every intent label that occurs in the full dataset, in alphabetical order
all_unique_intents = df_synthetic['intent'].unique()
all_unique_intents_sorted = sorted(all_unique_intents)

# Render the labels as a "- label" bullet list, one per line (with a leading newline)
bullet_prefix = "\n- "
all_unique_intents_bulleted = bullet_prefix + bullet_prefix.join(all_unique_intents_sorted)

print(f"All unique intents available (formatted for prompt): {all_unique_intents_bulleted}\n")

def create_zero_shot_prompt(category: str, instruction_synthetic: str, available_intents_bulleted: str) -> str:
    """Build the zero-shot intent-classification prompt for a single customer query.

    Args:
        category (str): Category associated with the query.
        instruction_synthetic (str): Customer instruction with placeholders already filled in.
        available_intents_bulleted (str): Pre-formatted bullet list of the allowed intent labels.

    Returns:
        str: The full prompt text, ending with "Predicted Intent:" and no trailing newline.
    """
    # Assemble the prompt line by line; empty strings produce the blank separator lines.
    prompt_lines = [
        "You are an intent classification system.",
        "Your task is to classify the intent of a customer query.",
        "",
        "Customer Query Details:",
        f"- Category: {category}",
        f"- User Instruction: {instruction_synthetic}",
        "",
        "You MUST choose one intent ONLY from the following list. Do NOT generate any other text, explanation, or new intents.",
        "The output should be ONLY the intent label.",
        "",
        "Available Intents:",
        f"{available_intents_bulleted}",
        "",
        "Predicted Intent:",
    ]
    return "\n".join(prompt_lines)

# Demonstrate the prompt builder on the first sampled test row, if there is one
if X_test_sampled.empty:
    print("X_test_sampled is empty. Cannot generate an example prompt.")
else:
    sample_category = X_test_sampled['category'].iloc[0]
    sample_instruction_synthetic = X_test_sampled['instruction_synthetic'].iloc[0]
    example_prompt = create_zero_shot_prompt(
        sample_category,
        sample_instruction_synthetic,
        all_unique_intents_bulleted,
    )
    print("Example Zero-Shot Prompt (improved):\n")
    print(example_prompt)

All unique intents available (formatted for prompt): 
- cancel_order
- change_order
- change_shipping_address
- check_cancellation_fee
- check_invoice
- check_payment_methods
- check_refund_policy
- complaint
- contact_customer_service
- contact_human_agent
- create_account
- delete_account
- delivery_options
- delivery_period
- edit_account
- get_invoice
- get_refund
- newsletter_subscription
- payment_issue
- place_order
- recover_password
- registration_problems
- review
- set_up_shipping_address
- switch_account
- track_order
- track_refund

Example Zero-Shot Prompt (improved):

You are an intent classification system.
Your task is to classify the intent of a customer query.

Customer Query Details:
- Category: ORDER
- User Instruction: i have got to swap  an item of order ORD-2025-12345

You MUST choose one intent ONLY from the following list. Do NOT generate any other text, explanation, or new intents.
The output should be ONLY the intent label.

Available Intents:

- cancel_orde

## Implement LLM Inference Function

### Subtask:
Provide a placeholder or example function that demonstrates how to make an LLM API call. This function will take a prompt and return the LLM's response. It will also include guidance for the user to insert their actual API key and handle the specific API integration for their chosen LLM, emphasizing careful usage to manage costs.

In [18]:
import os
import google.generativeai as genai
from google.colab import userdata
import time
from google.api_core import exceptions as core_exceptions # Import specific exceptions

# Set up the Gemini client using the key stored in Colab secrets.
# (A different provider, e.g. OpenAI GPT, would need its own client library imported here.)
try:
    # Look up GOOGLE_API_KEY in the Colab secrets store
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    if not GOOGLE_API_KEY:
        print("Warning: GOOGLE_API_KEY not found. Please ensure it's set in Colab secrets with the name 'GOOGLE_API_KEY'.")
    else:
        genai.configure(api_key=GOOGLE_API_KEY)
        print("Gemini API configured successfully.")
except Exception as e:
    # Report the failure instead of aborting the cell
    print(f"Error configuring Gemini API: {e}")

def get_llm_prediction(prompt: str, model_name: str = 'gemini-2.0-flash', timeout_seconds: int = 60, max_retries: int = 3, initial_delay: float = 1.0) -> str:
    """Send a prompt to the chosen LLM and return its text reply.

    Only model names starting with 'gemini' are handled. Rate-limit errors
    (``ResourceExhausted``) are retried with exponential backoff; any other
    exception ends the call immediately.

    Args:
        prompt (str): Prompt text to send to the model.
        model_name (str): Model identifier (e.g., 'gemini-2.0-flash').
        timeout_seconds (int): Per-request timeout passed to the API call.
        max_retries (int): Maximum number of attempts.
        initial_delay (float): Base delay in seconds, doubled after each rate-limited attempt.

    Returns:
        str: The whitespace-stripped response text, or a string describing the error.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            # Support for other providers (e.g. OpenAI GPT, Anthropic Claude) would be added here.
            if not model_name.startswith('gemini'):
                return "Error: Unsupported model_name. Please update the inference function."

            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt, request_options={'timeout': timeout_seconds})

            # Only accept a reply whose first part carries text
            has_text = response.parts and response.parts[0].text
            if not has_text:
                return "LLM Inference Error: No text found in response."
            return response.text.strip()
        except core_exceptions.ResourceExhausted as e:
            # Rate limited: give up on the final attempt, otherwise back off and retry
            if attempt >= max_retries - 1:
                return f"LLM Inference Error after {max_retries} retries (Rate Limit): {e}"
            delay = initial_delay * (2 ** attempt)
            print(f"Rate limit hit. Retrying in {delay:.2f} seconds... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(delay)
        except Exception as e:
            return f"LLM Inference Error: {e}"
        attempt += 1

    # Reached only when no attempt was made or every attempt was rate limited
    return "LLM Inference Error: Max retries exceeded without successful response."

# Example of how to use the function (will be executed in the next step)
# You would typically iterate through your X_test_sampled here.
# test_prompt = create_zero_shot_prompt("ORDER", "I want to cancel my order", all_unique_intents_bulleted)
# prediction = get_llm_prediction(test_prompt)
# print(f"\nTest Prediction: {prediction}")

Gemini API configured successfully.


## Run LLM Inference and Collect Predictions

### Subtask:
Iterate through the sampled test data, generate a zero-shot prompt for each entry, call the LLM inference function to get a prediction, and collect all predicted intents. This step will also include parsing the LLM's response to extract the intent label.

In [22]:
import os
import google.generativeai as genai
from google.colab import userdata
import time
from google.api_core import exceptions as core_exceptions
import json

# Set up the Gemini client using the key stored in Colab secrets
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    if not GOOGLE_API_KEY:
        print("Warning: GOOGLE_API_KEY not found. Please ensure it's set in Colab secrets with the name 'GOOGLE_API_KEY'.")
    else:
        genai.configure(api_key=GOOGLE_API_KEY)
        print("Gemini API configured successfully.")
except Exception as e:
    # Report the failure instead of aborting the cell
    print(f"Error configuring Gemini API: {e}")

# --- Pricing for Gemini models (Adjust these values based on actual current rates) ---
# These are example prices and should be updated with actual current pricing from Google AI Studio.
# For demonstration purposes, using placeholder values for 'gemini-2.0-flash-lite'
# Units: USD per 1,000 tokens (the cost formula divides token counts by 1000 before multiplying).
GEMINI_FLASH_INPUT_PRICE_PER_1K_TOKENS = 0.00005 # i.e., $0.00005 per 1,000 input tokens (= $0.05 per 1M input tokens)
GEMINI_FLASH_OUTPUT_PRICE_PER_1K_TOKENS = 0.00015 # i.e., $0.00015 per 1,000 output tokens (= $0.15 per 1M output tokens)

def _extract_retry_delay(err: str, default: float = 2) -> float:
    """Best-effort read of the server-suggested retry delay (seconds) from an error message.

    Tries, in order:
    1. a JSON payload starting at the first "{" of the message, shaped like
       ``{"retry_delay": {"seconds": N}}``;
    2. a protobuf-text style fragment ``retry_delay { seconds: N }``.

    Returns ``default`` whenever neither source yields a finite, non-negative number.
    """
    import re  # local import: only this helper needs it

    candidate = None
    try:
        payload = json.loads(err[err.index("{"):])
        candidate = payload.get("retry_delay", {}).get("seconds", default)
    except (ValueError, AttributeError, TypeError):
        # ValueError: no "{" in the message, or the tail is not valid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        # AttributeError/TypeError: JSON parsed but is not the expected nested dict.
        # NOTE(review): assumes the API may render the hint in protobuf text form —
        # confirm against real 429 messages.
        found = re.search(r"retry_delay\s*\{\s*seconds:\s*(\d+(?:\.\d+)?)", err)
        if found:
            candidate = found.group(1)

    try:
        delay = float(candidate)
    except (TypeError, ValueError):
        # Nothing found, or a value that is not a number.
        return default

    # Rejects negatives, NaN and infinity, none of which time.sleep can use sensibly.
    if 0 <= delay < float("inf"):
        return delay
    return default


def safe_llm_predict(prompt: str, model_name: str = 'gemini-2.0-flash-lite', timeout: int = 60, max_retries: int = 8) -> tuple:
    """
    Robust Gemini prediction wrapper.

    - Retries when the error message contains "429" (rate limit), up to ``max_retries`` attempts.
    - Waits ``retry_delay * 1.4 ** attempt`` seconds between attempts, where ``retry_delay``
      is taken from the error message when it can be parsed and defaults to 2 seconds.
    - Does not wait after the final failed attempt, since no retry follows it.

    Args:
        prompt: Full prompt text sent to the model.
        model_name: Name passed to ``genai.GenerativeModel``.
        timeout: Per-request timeout forwarded via ``request_options``.
        max_retries: Maximum number of attempts.

    Returns:
        A 5-tuple ``(prediction, inference_time, input_tokens, output_tokens, cost)``.
        On success ``prediction`` is the stripped response text and ``cost`` is computed
        from the module-level per-1K-token prices. On a non-429 error ``prediction`` is
        ``"LLM Error: <exception>"``; when every attempt was rate limited it is
        ``"LLM Error: Max retries exceeded"``. In both failure cases tokens and cost are 0,
        and ``inference_time`` is the duration of the last attempt only.
    """
    inference_time = 0.0

    for attempt in range(max_retries):
        start_time = time.time()
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt, request_options={"timeout": timeout})
            inference_time = time.time() - start_time

            predicted_text = response.text.strip()

            usage = response.usage_metadata
            if usage:
                # `or 0` keeps the cost arithmetic valid should a count come back as None.
                input_tokens = usage.prompt_token_count or 0
                output_tokens = usage.candidates_token_count or 0
            else:
                # Fallback if usage_metadata is not available: assume 0 tokens for this call.
                print("Warning: usage_metadata not found in response. Assuming 0 tokens for this call.")
                input_tokens = 0
                output_tokens = 0

            cost = (input_tokens / 1000 * GEMINI_FLASH_INPUT_PRICE_PER_1K_TOKENS) + \
                   (output_tokens / 1000 * GEMINI_FLASH_OUTPUT_PRICE_PER_1K_TOKENS)

            return predicted_text, inference_time, input_tokens, output_tokens, cost

        except Exception as e:
            err = str(e)
            inference_time = time.time() - start_time

            if "429" not in err:
                # Non-rate-limit errors are not retried: report them with 0 tokens and cost.
                return f"LLM Error: {e}", inference_time, 0, 0, 0.0

            print(f"[429] Rate limit hit (attempt {attempt+1}/{max_retries})")
            if attempt + 1 >= max_retries:
                # Retry budget exhausted: sleeping here would only delay the error result.
                break

            sleep_time = _extract_retry_delay(err) * (1.4 ** attempt)
            print(f"Waiting {sleep_time:.2f}s before retrying...")
            time.sleep(sleep_time)

    # Every attempt was rate limited (or max_retries was 0).
    return "LLM Error: Max retries exceeded", inference_time, 0, 0, 0.0

Gemini API configured successfully.


Hit rate limits many times.

Error 429



Add exponential backoff + rate-limit wait inside your eval loop

Here is a ready-to-paste wrapper that:

Detects 429

Extracts retry_delay

Sleeps automatically

Prevents your entire prediction column from filling with errors

In [23]:
import time
import pandas as pd

# Per-sample collectors; the evaluation cell reads these lists by name.
llm_predictions, inference_times = [], []
input_token_counts, output_token_counts, api_costs = [], [], []

print(f"Starting LLM inference for {len(X_test_sampled)} samples...\n")

# Walk the two prompt columns in row order (the same order as X_test_sampled.index).
for category, instruction_synthetic in zip(
    X_test_sampled["category"], X_test_sampled["instruction_synthetic"]
):
    prompt = create_zero_shot_prompt(category, instruction_synthetic, all_unique_intents_bulleted)

    # Rate-limit-aware call; the lite model is used to avoid quota exhaustion.
    prediction, elapsed, tokens_in, tokens_out, cost_usd = safe_llm_predict(
        prompt, model_name="gemini-2.0-flash-lite", timeout=60
    )

    llm_predictions.append(prediction)
    inference_times.append(elapsed)
    input_token_counts.append(tokens_in)
    output_token_counts.append(tokens_out)
    api_costs.append(cost_usd)

    # Proactive throttle between calls (~3 requests/sec; adjust as needed).
    time.sleep(0.35)

print("\nLLM inference completed.\n")

# One-column frame sharing the test sample's index so it can be aligned with the labels.
predictions_df = pd.DataFrame(
    {"llm_predicted_intent": llm_predictions}, index=X_test_sampled.index
)

print("Head of LLM Predictions:")
display(predictions_df.head())

print("\nValue counts of LLM Predictions:")
display(predictions_df["llm_predicted_intent"].value_counts())

Starting LLM inference for 270 samples...


LLM inference completed.

Head of LLM Predictions:


Unnamed: 0,llm_predicted_intent
1267,change_order
1179,change_order
1310,change_order
1048,change_order
1950,change_order



Value counts of LLM Predictions:


Unnamed: 0_level_0,count
llm_predicted_intent,Unnamed: 1_level_1
get_invoice,19
complaint,16
track_order,15
edit_account,13
get_refund,13
contact_human_agent,12
place_order,11
create_account,11
payment_issue,10
newsletter_subscription,10


Here's a breakdown of the output:

"Starting LLM inference for 270 samples... LLM inference completed.": This confirms that the loop processed all 270 sampled test cases, generating a prediction for each one using the gemini-2.0-flash-lite model.

"Head of LLM Predictions:"

This displays the first few rows of the predictions_df DataFrame. You can see the original index of the sampled X_test examples and the corresponding llm_predicted_intent. For instance, you can see several change_order predictions at the top.
"Value counts of LLM Predictions:"

This is a crucial summary, showing the frequency of each intent predicted by the LLM across all 270 samples. Ideally, if the LLM was perfect and predicted correctly for all 10 samples of each of the 27 intents, we would see each unique intent listed with a count of 10. This output helps us quickly identify which intents the LLM is predicting more often, or if there are any unexpected predictions or errors.
This output now gives us the LLM's zero-shot predictions, which we can compare against the true y_test_sampled labels to evaluate its performance!



In [24]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Align the ground-truth labels with the predictions through their shared index.
y_true = y_test_sampled.loc[predictions_df.index]
y_pred = predictions_df['llm_predicted_intent']

# Classification quality: overall accuracy and unweighted mean of per-class F1.
accuracy = accuracy_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')

print("Zero-Shot LLM Performance:\n")
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro-F1 Score: {f1_macro:.4f}")

# Operational metrics aggregated from the per-call lists filled by the inference loop.
n_samples = len(X_test_sampled)

total_inference_time = sum(inference_times)
average_time_per_sample = total_inference_time / n_samples
estimated_time_per_1000_samples = average_time_per_sample * 1000

total_input_tokens = sum(input_token_counts)
total_output_tokens = sum(output_token_counts)
total_api_cost = sum(api_costs)

metrics_report = [
    "\n--- LLM Inference Metrics ---",
    f"Total samples processed: {n_samples}",
    f"Total inference time: {total_inference_time:.2f} seconds",
    f"Average inference time per sample: {average_time_per_sample:.4f} seconds",
    f"Estimated inference time per 1000 samples: {estimated_time_per_1000_samples:.2f} seconds",
    f"Total input tokens consumed: {total_input_tokens}",
    f"Total output tokens consumed: {total_output_tokens}",
    f"Approximate API Cost for {n_samples} samples: ${total_api_cost:.6f}",
    f"Estimated API Cost per 1000 samples: ${total_api_cost / n_samples * 1000:.6f}",
]
print("\n".join(metrics_report))

Zero-Shot LLM Performance:

Accuracy: 0.8815
Macro-F1 Score: 0.8700

--- LLM Inference Metrics ---
Total samples processed: 270
Total inference time: 169.85 seconds
Average inference time per sample: 0.6291 seconds
Estimated inference time per 1000 samples: 629.07 seconds
Total input tokens consumed: 63898
Total output tokens consumed: 1196
Approximate API Cost for 270 samples: $0.003374
Estimated API Cost per 1000 samples: $0.012497


Let's compare the llm_predicted_intent in predictions_df with the true intents in y_test_sampled. Calculate the Accuracy and Macro-F1 scores to evaluate the zero-shot LLM's performance.

The evaluation results for the Zero-Shot LLM are in! Here's what we found:

Accuracy: 0.8815 (or 88.15%) This means that the LLM correctly predicted the intent for approximately 88.15% of the sampled customer queries. Accuracy is a straightforward measure of overall correctness.

Macro-F1 Score: 0.8700 (or 87.00%) The Macro-F1 score is a more robust metric, especially useful when dealing with imbalanced datasets (though our sampled dataset is balanced for intents). It calculates the F1 score for each individual intent class and then averages them. This ensures that the performance on all classes, including less frequent ones, is equally weighted. An 87.00% Macro-F1 score indicates a strong balance between precision and recall across all the different intent categories.

These are quite impressive results for a zero-shot approach, demonstrating the LLM's capability to understand and classify intents without any specific training examples for this task. This sets a strong baseline for intent classification!

Zero-Shot LLM Performance:

Accuracy: 0.8815 (or 88.15%): This indicates that the LLM correctly predicted the intent for approximately 88.15% of the sampled customer queries.
Macro-F1 Score: 0.8700 (or 87.00%): This metric shows a strong balance between precision and recall across all the different intent categories, which is particularly good for classification tasks with multiple classes.

--- LLM Inference Metrics --- These metrics provide insights into the operational aspects of running the LLM:

Total samples processed: 270: Confirms that all the samples in X_test_sampled were processed.

Total inference time: 169.85 seconds: The total time taken for all 270 API calls.
Average inference time per sample: 0.6291 seconds: On average, each query took about 0.63 seconds for the LLM to process.
Estimated inference time per 1000 samples: 629.07 seconds: This projection helps understand the scalability for larger datasets.

Total input tokens consumed: 63898: The total number of tokens sent to the LLM in the prompts.

Total output tokens consumed: 1196: The total number of tokens received as predictions from the LLM.

Approximate API Cost for 270 samples: \$0.003374: The estimated cost for processing these 270 samples based on the placeholder rates.

Estimated API Cost per 1000 samples: \$0.012497: A projection of the cost for processing 1000 samples.

Summary: The Zero-Shot LLM (Gemini 2.0 Flash Lite) demonstrates impressive performance with an Accuracy of 88.15% and a Macro-F1 score of 87.00%. The inference is relatively fast, averaging less than a second per sample, and the cost is very low, making it a viable option for intent classification, especially given that it requires no fine-tuning. This robust set of metrics provides a solid baseline for comparing against other approaches, such as fine-tuned transformers.

We've successfully completed the zero-shot LLM intent classification!

## Approach 1: Few-Shot via Large Language Models

To improve the LLM's performance further, we can provide it with a few examples of customer queries and their corresponding intents. This is known as **few-shot learning**. By seeing a few examples, the LLM can better understand the patterns and expected output format, leading to more accurate predictions.

We will define a new prompt structure that includes a set of curated examples. These examples should be representative of the different intent classes and demonstrate the desired classification behavior.

In [None]:
import pandas as pd

# Get the list of all unique intent classes from the full dataset (already available)
# all_unique_intents_bulleted = "\n- " + "\n- ".join(sorted(df_synthetic['intent'].unique()))

# Curate a few diverse examples from the training set for few-shot prompting
# These examples are manually selected to cover different intents and demonstrate the expected format.
# We will use the original instruction to ensure diversity and avoid using 'instruction_synthetic' here
# to keep the examples clear and direct.

# Hand-written demonstrations for the few-shot prompt, listed as
# (category, instruction, intent) and expanded into one dict per example.
few_shot_examples = [
    {"category": category, "instruction": instruction, "intent": intent}
    for category, instruction, intent in (
        ("ORDER", "I want to change my order", "change_order"),
        ("ACCOUNT", "I need help creating a new account", "create_account"),
        ("PAYMENT", "What payment methods do you accept?", "check_payment_methods"),
        ("REFUND", "How can I track my refund?", "track_refund"),
        ("DELIVERY", "When will my package arrive?", "delivery_period"),
    )
]

def create_few_shot_prompt(category: str, instruction_synthetic: str, available_intents_bulleted: str, examples: list) -> str:
    """Build the few-shot intent-classification prompt.

    The prompt consists of a short system preamble, one worked demonstration
    per entry of ``examples``, the query to classify, the output constraints
    with the list of allowed intents, and a trailing "Predicted Intent:" cue.

    Args:
        category (str): Category of the query being classified.
        instruction_synthetic (str): Customer instruction text being classified.
        available_intents_bulleted (str): Pre-formatted bulleted list of allowed intents.
        examples (list): Dicts with 'category', 'instruction' and 'intent' keys,
            rendered in the given order as demonstrations.

    Returns:
        str: All prompt lines joined with newlines.
    """
    preamble = [
        "You are an intent classification system.",
        "Your task is to classify the intent of a customer query based on the provided examples.",
        "\n--- Examples ---\n",
    ]

    # Four lines per demonstration; the trailing newline on the last one
    # leaves a blank line between consecutive examples.
    demonstrations = [
        line
        for shot in examples
        for line in (
            "Customer Query Details:",
            f"- Category: {shot['category']}",
            f"- User Instruction: {shot['instruction']}",
            f"Predicted Intent: {shot['intent']}\n",
        )
    ]

    query_and_rules = [
        "--- End Examples ---\n",
        "Now, classify the following query:",
        "Customer Query Details:",
        f"- Category: {category}",
        f"- User Instruction: {instruction_synthetic}",
        "\nYou MUST choose one intent ONLY from the following list. Do NOT generate any other text, explanation, or new intents.",
        "The output should be ONLY the intent label.",
        f"\nAvailable Intents:\n{available_intents_bulleted}\n",
        "Predicted Intent:",
    ]

    return "\n".join(preamble + demonstrations + query_and_rules)

# Preview the few-shot prompt built for the first sampled test row (when there is one).
if X_test_sampled.empty:
    print("X_test_sampled is empty. Cannot generate an example few-shot prompt.")
else:
    sample_category = X_test_sampled['category'].iloc[0]
    sample_instruction_synthetic = X_test_sampled['instruction_synthetic'].iloc[0]
    example_few_shot_prompt = create_few_shot_prompt(
        sample_category,
        sample_instruction_synthetic,
        all_unique_intents_bulleted,
        few_shot_examples,
    )
    print("Example Few-Shot Prompt:\n")
    print(example_few_shot_prompt)