In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("bitext/bitext-gen-ai-chatbot-customer-support-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/bitext-gen-ai-chatbot-customer-support-dataset


## Summary:

### Q&A
Yes, the dataset has been successfully loaded into the `df` DataFrame.

### Data Analysis Key Findings
*   The primary CSV file, `Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv`, was successfully identified and loaded into a pandas DataFrame named `df`.
*   The DataFrame `df` contains columns such as 'flags', 'instruction', 'category', 'intent', and 'response', as verified by displaying its head.

### Insights or Next Steps
*   The loaded DataFrame `df` is now ready for further data exploration, cleaning, and analysis to understand the customer support interactions.
*   A logical next step would be to perform a quick data overview, including checking data types, missing values, and unique values in categorical columns.


In [2]:
import pandas as pd
import os

# List files in the dataset directory
files_in_path = os.listdir(path)
print(f"Files available in the dataset directory: {files_in_path}")

# Identify the main CSV file. Based on previous runs or common dataset structures,
# it's often 'Bitext_Sample_-_es_to_en_-_Sheet1.csv'
# Let's try to find a .csv file or assume the given name.
csv_file_name = None
for file in files_in_path:
    if file.endswith('.csv'):
        csv_file_name = file
        break

if csv_file_name is None:
    print("No CSV file found in the dataset directory. Attempting to use a known filename.")
    # Fallback to a known filename if no CSV is automatically detected
    csv_file_name = 'Bitext_Sample_-_es_to_en_-_Sheet1.csv'

csv_file_path = os.path.join(path, csv_file_name)

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

print(f"Successfully loaded '{csv_file_name}' into DataFrame 'df'.")

# Display the head of the DataFrame to verify
df.head()

Files available in the dataset directory: ['Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv']
Successfully loaded 'Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv' into DataFrame 'df'.


Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB


In [4]:
import re
import random



# Define the regex pattern for placeholders
placeholder_pattern = r"\{\{([^}]+)\}\}"

# Initialize a set to store unique placeholders
unique_placeholders = set()

# Iterate through the 'instruction' column and extract placeholders
if 'instruction' in df.columns:
    for instruction_text in df['instruction'].astype(str):
        matches = re.findall(placeholder_pattern, instruction_text)
        for match in matches:
            unique_placeholders.add(match.strip())

    print("Unique placeholders found:")
    for placeholder in sorted(list(unique_placeholders)):
        print(placeholder)
else:
    print("The 'instruction' column does not exist in the DataFrame.")

Unique placeholders found:
Account Category
Account Type
Currency Symbol
Delivery City
Delivery Country
Invoice Number
Order Number
Person Name
Refund Amount


In [5]:
import re

# Define the mapping of placeholders to synthetic values
synthetic_values = {
    "Order Number": "ORD-2025-12345",
    "Person Name": "Maria Garcia",
    "email": "alex.smith@example.com", # Assuming 'email' is a placeholder
    "product_name": "Wireless Mouse", # Assuming 'product_name' is a placeholder
    "date": "05/09/2025", # Assuming 'date' is a placeholder
    "Refund Amount": "$50.00", # Based on previously found placeholders
    "Account Category": "Savings", # Based on previously found placeholders
    "Account Type": "Checking", # Based on previously found placeholders
    "Currency Symbol": "$", # Based on previously found placeholders
    "Delivery City": "New York", # Based on previously found placeholders
    "Delivery Country": "USA", # Based on previously found placeholders
    "Invoice Number": "INV-2025-54321", # Based on previously found placeholders
}

# Create a copy of the DataFrame to store the modified instructions
df_synthetic = df.copy()

# Function to replace placeholders
def replace_placeholders(text):
    for placeholder, value in synthetic_values.items():
        # Use re.escape to handle special characters in placeholder names
        text = re.sub(r'\{\{' + re.escape(placeholder) + r'\}\}', value, text, flags=re.IGNORECASE)
    return text

# Apply the replacement function to the 'instruction' column
if 'instruction' in df_synthetic.columns:
    df_synthetic['instruction_synthetic'] = df_synthetic['instruction'].apply(replace_placeholders)
    print("Instructions with synthetic values generated successfully. Displaying original and synthetic instructions for verification:")
    display(df_synthetic[['instruction', 'instruction_synthetic']].head())
else:
    print("The 'instruction' column does not exist in the DataFrame.")

Instructions with synthetic values generated successfully. Displaying original and synthetic instructions for verification:


Unnamed: 0,instruction,instruction_synthetic
0,question about cancelling order {{Order Number}},question about cancelling order ORD-2025-12345
1,i have a question about cancelling oorder {{Or...,i have a question about cancelling oorder ORD-...
2,i need help cancelling puchase {{Order Number}},i need help cancelling puchase ORD-2025-12345
3,I need to cancel purchase {{Order Number}},I need to cancel purchase ORD-2025-12345
4,"I cannot afford this order, cancel purchase {{...","I cannot afford this order, cancel purchase OR..."


In [6]:
df_synthetic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   flags                  26872 non-null  object
 1   instruction            26872 non-null  object
 2   category               26872 non-null  object
 3   intent                 26872 non-null  object
 4   response               26872 non-null  object
 5   instruction_synthetic  26872 non-null  object
dtypes: object(6)
memory usage: 1.2+ MB


In [7]:
if 'instruction_synthetic' in df_synthetic.columns:
    print("Sampling 20 processed rows for verification:")
    display(df_synthetic[['instruction', 'instruction_synthetic']].sample(n=20, random_state=42))
else:
    print("The 'instruction_synthetic' column does not exist in the DataFrame. Please ensure the placeholder replacement step was executed correctly.")

Sampling 20 processed rows for verification:


Unnamed: 0,instruction,instruction_synthetic
9329,I can't talk with a human agent,I can't talk with a human agent
4160,I have got to locate hte bills from {{Person N...,I have got to locate hte bills from Maria Garcia
18500,"I cannot pay, help me to inform of a problem w...","I cannot pay, help me to inform of a problem w..."
8840,I want help speaking to customer service,I want help speaking to customer service
5098,I try to see th accepted payment options,I try to see th accepted payment options
17250,where to sign up to the company nmewsletter,where to sign up to the company nmewsletter
3589,I'd like to see the withdrwaal fee how can i d...,I'd like to see the withdrwaal fee how can i d...
9043,I want to speak with someone,I want to speak with someone
15800,can you help me getting bill #85632?,can you help me getting bill #85632?
4384,I don't know how to take a quick look at invoi...,I don't know how to take a quick look at invoi...


In [9]:
from sklearn.model_selection import train_test_split

# Drop the original 'instruction' column and keep 'instruction_synthetic'
if 'instruction' in df_synthetic.columns:
    df_synthetic = df_synthetic.drop(columns=['instruction'])

# Define the features (X) and target (y) for intent classification
X = df_synthetic.drop(columns=['response', 'intent']) # Features should not include the target or the response
y = df_synthetic['intent'] # Target is the 'intent' column

# First split: 80% train, 20% (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Second split: 50% validation, 50% test from the 20% temporary set
# This results in 10% validation and 10% test from the original dataset
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")
print(f"Test set size: {len(X_test)} samples")

# Display the head of the training set features as an example
print("\nHead of X_train:")
display(X_train.head())

Training set size: 21497 samples
Validation set size: 2687 samples
Test set size: 2688 samples

Head of X_train:


Unnamed: 0,flags,category,instruction_synthetic
18407,BL,PAYMENT,I want assistance informing of a trouble with ...
4878,BKL,INVOICE,see bill #12588
26858,BCL,REFUND,"I want to see my refund current status, help me"
25464,BKL,ORDER,see current status of order ORD-2025-12345
12237,BCIL,DELIVERY,"I need to check the options for shipping, how ..."


In [10]:
import pandas as pd

# The 'intent' column is the target variable y_test, not in X_test.
# We need to sample based on y_test and then use the sampled indices to get corresponding X_test rows.

# Get unique intent classes from y_test
unique_intents = y_test.unique()
print(f"Found {len(unique_intents)} unique intent classes.")

X_test_sampled_list = []
y_test_sampled_list = []

# Iterate through each unique intent to sample examples
for intent_class in unique_intents:
    # Filter y_test for the current intent class to get its indices
    intent_indices = y_test[y_test == intent_class].index

    # Sample up to 10 indices from the current intent class
    # Use min(len(intent_indices), 10) to handle classes with fewer than 10 examples
    if not intent_indices.empty:
        # Sample indices directly from the filtered indices
        sampled_indices = intent_indices.to_series().sample(n=min(len(intent_indices), 10), random_state=42).index
        X_test_sampled_list.append(X_test.loc[sampled_indices])
        y_test_sampled_list.append(y_test.loc[sampled_indices])

# Concatenate all sampled data into new DataFrames
X_test_sampled = pd.concat(X_test_sampled_list)
y_test_sampled = pd.concat(y_test_sampled_list)

print(f"\nTotal sampled examples: {len(X_test_sampled)}")
print(f"Number of unique intents in sampled data: {y_test_sampled.nunique()}") # Check unique intents in y_test_sampled

print("\nHead of X_test_sampled:")
display(X_test_sampled.head())

print("\nValue counts for 'intent' in y_test_sampled to verify distribution:")
display(y_test_sampled.value_counts()) # Display value counts from y_test_sampled

print("\nHead of y_test_sampled:")
display(y_test_sampled.head())

Found 27 unique intent classes.

Total sampled examples: 270
Number of unique intents in sampled data: 27

Head of X_test_sampled:


Unnamed: 0,flags,category,instruction_synthetic
1267,BLQZ,ORDER,i have got to swap an item of order ORD-2025-...
1179,BIL,ORDER,how do I correct purchase ORD-2025-12345?
1310,BILZ,ORDER,canm you help me to update purchase ORD-2025-1...
1048,BLMQ,ORDER,i try to switch several items of order ORD-202...
1950,BL,ORDER,I need information about correcting purchase O...



Value counts for 'intent' in y_test_sampled to verify distribution:


intent
change_order                10
newsletter_subscription     10
review                      10
track_refund                10
delivery_options            10
create_account              10
delivery_period             10
contact_human_agent         10
payment_issue               10
complaint                   10
check_cancellation_fee      10
set_up_shipping_address     10
track_order                 10
cancel_order                10
edit_account                10
recover_password            10
get_invoice                 10
switch_account              10
registration_problems       10
contact_customer_service    10
place_order                 10
change_shipping_address     10
check_invoice               10
get_refund                  10
check_refund_policy         10
check_payment_methods       10
delete_account              10
Name: count, dtype: int64


Head of y_test_sampled:


1267    change_order
1179    change_order
1310    change_order
1048    change_order
1950    change_order
Name: intent, dtype: object

Unique Intents Found: We identified all 27 unique intent classes from your y_test dataset.
Total Sampled Examples: A total of 270 examples have been sampled.
Stratified Sampling: As intended, the value_counts for y_test_sampled confirms that we have exactly 10 samples for each of the 27 unique intent classes, ensuring a balanced representation for LLM evaluation.
This means our sampled dataset is now perfectly prepared for the next step: defining the zero-shot prompt structure for the LLM!


Let's redefine the zero-shot prompt structure. This updated prompt will explicitly instruct the LLM to identify the intent from the customer query and provide the list of possible intent categories it should choose from. After the modification, I will show an example of the generated prompt.

In [43]:
import pandas as pd

# Get the list of all unique intent classes from the full dataset
all_unique_intents = df_synthetic['intent'].unique()
all_unique_intents_sorted = sorted(all_unique_intents)
# Format as a bulleted list for better LLM parsing
all_unique_intents_bulleted = "\n- " + "\n- ".join(all_unique_intents_sorted)

print(f"All unique intents available (formatted for prompt): {all_unique_intents_bulleted}\n")

def create_zero_shot_prompt(category: str, instruction_synthetic: str, available_intents_bulleted: str) -> str:
    """
    Constructs a zero-shot prompt for intent classification with controlled response format.
    """

    prompt = f'''You are an intent classification system.
Your task is to classify the intent of a customer query.

Specific guidelines (but not rules)

Customers stating that they are expecting a refund are tracking a refund, and should be "track_refund".
Customers asking to check when their order will arrive are asking about the delivery period - unless an order number is a part of the instruction or it sounds like the order has been made, in which case they are tracking the order.
Customers asking about the possibility of orders are asking about delivery options, including where they are ordering from.
Customers asking setting up a new or secondary address are not editing their address, but setting up an address.
Customers looking to use a specific account (such as chequing and savings) or profile, are attempting to switch accounts.
Customers modifying or updating account details are edit_account, unless these details are password or PIN related. These are recover password asks.
Customers may also ask about updating their account to delete the account. This is delete_account.
Customers looking to download an invoice want to get the invoice. Customers looking to see an invoice want to check it.
Customers looking specifically customer assistance aren't inherently looking for human assistance, and are more likely to be looking for customer service. 
If they want to chat or speak with customer service, they are looking for customer service, not human assistance.
If an invoice has already been issued, and the customer is looking for it, that is check_invoice. 
Refund policy questions relate to check_refund_policy. Note that if they aren't asking for specifics on the policy (such as the time for reimbursements), they are likely just wanting to get a refund.
Customers wanting an article under the order category, are trying to place an order.

Customer Query Details:
- Category: {category}
- User Instruction: {instruction_synthetic}

You MUST choose exactly one intent ONLY from the following list:

Available Intents:
{available_intents_bulleted}

### Response Format (follow EXACTLY):
1. Provide an explanation (1-2 paragraphs) of why you selected the intent. Be sure to consider two-to-three other intents before settling on a final choice.
2. Then output the final predicted intent on a new line in the following format:

Final Choice: <intent_label>

Do NOT invent new intents.
Do NOT output anything other than the two required parts.

Begin.

Explanation:'''
    return prompt


# Example usage with a sample from X_test_sampled
if not X_test_sampled.empty:
    sample_category = X_test_sampled['category'].iloc[0]
    sample_instruction_synthetic = X_test_sampled['instruction_synthetic'].iloc[0]
    example_prompt = create_zero_shot_prompt(sample_category, sample_instruction_synthetic, all_unique_intents_bulleted)
    print("Example Zero-Shot Prompt (improved):\n")
    print(example_prompt)
else:
    print("X_test_sampled is empty. Cannot generate an example prompt.")

All unique intents available (formatted for prompt): 
- cancel_order
- change_order
- change_shipping_address
- check_cancellation_fee
- check_invoice
- check_payment_methods
- check_refund_policy
- complaint
- contact_customer_service
- contact_human_agent
- create_account
- delete_account
- delivery_options
- delivery_period
- edit_account
- get_invoice
- get_refund
- newsletter_subscription
- payment_issue
- place_order
- recover_password
- registration_problems
- review
- set_up_shipping_address
- switch_account
- track_order
- track_refund

Example Zero-Shot Prompt (improved):

You are an intent classification system.
Your task is to classify the intent of a customer query.

Specific guidelines (but not rules)

Customers stating that they are expecting a refund are tracking a refund, and should be "track_refund".
Customers asking to check when their order will arrive are asking about the delivery period - unless an order number is a part of the instruction or it sounds like the or

## Implement LLM Inference Function

### Subtask:
Provide a placeholder or example function that demonstrates how to make an LLM API call. This function will take a prompt and return the LLM's response. It will also include guidance for the user to insert their actual API key and handle the specific API integration for their chosen LLM, emphasizing careful usage to manage costs.

In [44]:
import os
import time
import re
from openai import OpenAI, APIError, RateLimitError
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
OPENAI_API_KEY = user_secrets.get_secret("OPENAI_API_KEY")

client = OpenAI(api_key=OPENAI_API_KEY)

def safe_llm_predict(
    prompt: str,
    model_name: str = "gpt-4.1-mini",
    timeout: int = 60,
    max_retries: int = 6
):

    input_tokens = 0
    output_tokens = 0
    inference_time = 0.0
    cost = 0.0
    extracted_intent = "LLM Error"

    for attempt in range(max_retries):
        start_time = time.time()

        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                timeout=timeout
            )

            inference_time = time.time() - start_time

            # FIXED: message is an object, not a dict
            raw_text = response.choices[0].message.content.strip()

            # usage handling
            if response.usage:
                input_tokens = response.usage.prompt_tokens or 0
                output_tokens = response.usage.completion_tokens or 0


            # Strong regex match
            match = re.search(r"Final\s*Choice\s*:\s*([A-Za-z0-9_\-]+)",
                              raw_text, flags=re.IGNORECASE)
            if match:
                extracted_intent = match.group(1).strip()
                return extracted_intent, inference_time, input_tokens, output_tokens, cost

            # Fallback: take final word after "Final Choice:"
            match = re.search(r"Final\s*Choice\s*:\s*(.*)$",
                              raw_text, flags=re.IGNORECASE | re.DOTALL)
            if match:
                tail = match.group(1).strip()
                last_word = re.findall(r"[A-Za-z0-9_\-]+", tail)
                if last_word:
                    extracted_intent = last_word[-1]
                    return extracted_intent, inference_time, input_tokens, output_tokens, cost

            # Full fallback
            fallback_tokens = re.findall(r"[A-Za-z0-9_\-]+", raw_text)
            if fallback_tokens:
                extracted_intent = fallback_tokens[-1]

            return extracted_intent, inference_time, input_tokens, output_tokens, cost

        except RateLimitError:
            wait_time = 1.5 ** attempt
            print(f"[429] Rate limit ‚Äî retrying in {wait_time:.2f}s...")
            time.sleep(wait_time)
            continue

        except APIError as e:
            return f"LLM Error: {e}", inference_time, input_tokens, output_tokens, cost

        except Exception as e:
            return f"LLM Error: {e}", inference_time, input_tokens, output_tokens, cost

    return extracted_intent, inference_time, input_tokens, output_tokens, cost


Here's a breakdown of the output:

"Starting LLM inference for 270 samples... LLM inference completed.": This confirms that the loop processed all 270 sampled test cases, generating a prediction for each one using the gemini-2.0-flash-lite model.

"Head of LLM Predictions:"

This displays the first few rows of the predictions_df DataFrame. You can see the original index of the sampled X_test examples and the corresponding llm_predicted_intent. For instance, you can see several change_order predictions at the top.
"Value counts of LLM Predictions:"

This is a crucial summary, showing the frequency of each intent predicted by the LLM across all 270 samples. Ideally, if the LLM was perfect and predicted correctly for all 10 samples of each of the 27 intents, we would see each unique intent listed with a count of 10. This output helps us quickly identify which intents the LLM is predicting more often, or if there are any unexpected predictions or errors.
This output now gives us the LLM's zero-shot predictions, which we can compare against the true y_test_sampled labels to evaluate its performance!



In [45]:
def test_safe_llm_predict(prompt: str):
    """
    Direct test harness to inspect what the model returns
    before running the full dataset loop.
    """

    print("\n======================")
    print("üîç TESTING LLM OUTPUT")
    print("======================\n")

    # Make a single request
    intent, inf_time, in_tokens, out_tokens, cost = safe_llm_predict(prompt)

    print("üìå Extracted Intent:", intent)
    print("‚è±Ô∏è Inference Time:", inf_time)
    print("üî¢ Input Tokens:", in_tokens)
    print("üî¢ Output Tokens:", out_tokens)
    print("üí≤ Cost:", cost)

    print("\n--- RAW MODEL OUTPUT ---\n")

    # Call the API *again* but without extraction
    # so you can see EXACTLY what the model said
    raw = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}]
    )

    raw_text = raw.choices[0].message.content
    print(raw_text)
    print("\n------------------------\n")

    return raw_text


In [46]:
# Pick a sample row to test
sample_row = X_test_sampled.iloc[0]

test_prompt = create_zero_shot_prompt(
    sample_row["category"],
    sample_row["instruction_synthetic"],
    all_unique_intents_bulleted
)

# Run the single test
test_safe_llm_predict(test_prompt)



üîç TESTING LLM OUTPUT

üìå Extracted Intent: change_order
‚è±Ô∏è Inference Time: 3.264967679977417
üî¢ Input Tokens: 609
üî¢ Output Tokens: 146
üí≤ Cost: 0.0

--- RAW MODEL OUTPUT ---

The user is asking to "swap an item of order ORD-2025-12345," which implies they want to make a change to an existing order. This is clearly related to modifying the order contents rather than canceling it, placing a new order, or tracking shipment or refund. While it might be tempting to consider "track_order" due to the presence of the order number, the intention is not to track the status but to modify the order. The options "change_shipping_address" and "set_up_shipping_address" are unrelated since the request is about swapping an item, not setting or changing the shipping address.

Hence, the most appropriate intent is "change_order" since the customer is requesting a modification to an already placed order.

Final Choice: change_order

------------------------



'The user is asking to "swap an item of order ORD-2025-12345," which implies they want to make a change to an existing order. This is clearly related to modifying the order contents rather than canceling it, placing a new order, or tracking shipment or refund. While it might be tempting to consider "track_order" due to the presence of the order number, the intention is not to track the status but to modify the order. The options "change_shipping_address" and "set_up_shipping_address" are unrelated since the request is about swapping an item, not setting or changing the shipping address.\n\nHence, the most appropriate intent is "change_order" since the customer is requesting a modification to an already placed order.\n\nFinal Choice: change_order'

In [47]:
from tqdm import tqdm
import time
import pandas as pd

# Storage for metrics
llm_predictions = []
inference_times = []
input_token_counts = []
output_token_counts = []
api_costs = []

print(f"Starting LLM inference for {len(X_test_sampled)} samples...\n")

# tqdm progress bar
for idx, row in tqdm(X_test_sampled.iterrows(), total=len(X_test_sampled), desc="LLM Inference"):
    category = row["category"]
    instruction_synthetic = row["instruction_synthetic"]

    # Build zero-shot prompt
    prompt = create_zero_shot_prompt(
        category,
        instruction_synthetic,
        all_unique_intents_bulleted
    )

    # Call LLM safely
    prediction, inf_time, in_tokens, out_tokens, call_cost = safe_llm_predict(
        prompt,
        timeout=60
    )

    # Store metrics
    llm_predictions.append(prediction)
    inference_times.append(inf_time)
    input_token_counts.append(in_tokens)
    output_token_counts.append(out_tokens)
    api_costs.append(call_cost)

    # Slight delay to avoid rate limits
    time.sleep(0.35)

print("\nLLM inference completed.\n")

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(
    {"llm_predicted_intent": llm_predictions},
    index=X_test_sampled.index
)

print("Head of LLM Predictions:")
display(predictions_df.head())

print("\nValue counts of LLM Predictions:")
display(predictions_df["llm_predicted_intent"].value_counts())


Starting LLM inference for 270 samples...



LLM Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 270/270 [10:38<00:00,  2.36s/it]


LLM inference completed.

Head of LLM Predictions:





Unnamed: 0,llm_predicted_intent
1267,change_order
1179,change_order
1310,change_order
1048,change_order
1950,change_order



Value counts of LLM Predictions:


llm_predicted_intent
contact_human_agent         11
switch_account              11
newsletter_subscription     10
review                      10
track_refund                10
create_account              10
delivery_options            10
delivery_period             10
payment_issue               10
complaint                   10
change_order                10
check_cancellation_fee      10
set_up_shipping_address     10
cancel_order                10
track_order                 10
get_invoice                 10
recover_password            10
registration_problems       10
place_order                 10
check_payment_methods       10
change_shipping_address     10
check_invoice               10
get_refund                  10
delete_account              10
check_refund_policy         10
edit_account                 9
contact_customer_service     9
Name: count, dtype: int64

In [48]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Ensure y_test_sampled and predictions_df are aligned by index
y_true = y_test_sampled.loc[predictions_df.index]
y_pred = predictions_df['llm_predicted_intent']

# Calculate Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Calculate Macro-F1 Score
f1_macro = f1_score(y_true, y_pred, average='macro')

print(f"Zero-Shot LLM Performance:\n")
print(f"Accuracy: {accuracy:.4f}")
print(f"Macro-F1 Score: {f1_macro:.4f}")

# Calculate and print additional metrics
total_inference_time = sum(inference_times)
average_time_per_sample = total_inference_time / len(X_test_sampled)
estimated_time_per_1000_samples = average_time_per_sample * 1000

total_input_tokens = sum(input_token_counts)
total_output_tokens = sum(output_token_counts)
total_api_cost = sum(api_costs)

print(f"\n--- LLM Inference Metrics ---")
print(f"Total samples processed: {len(X_test_sampled)}")
print(f"Total inference time: {total_inference_time:.2f} seconds")
print(f"Average inference time per sample: {average_time_per_sample:.4f} seconds")
print(f"Estimated inference time per 1000 samples: {estimated_time_per_1000_samples:.2f} seconds")
print(f"Total input tokens consumed: {total_input_tokens}")
print(f"Total output tokens consumed: {total_output_tokens}")
print(f"Approximate API Cost for {len(X_test_sampled)} samples: ${total_api_cost:.6f}")
print(f"Estimated API Cost per 1000 samples: ${total_api_cost / len(X_test_sampled) * 1000:.6f}")

Zero-Shot LLM Performance:

Accuracy: 0.9926
Macro-F1 Score: 0.9926

--- LLM Inference Metrics ---
Total samples processed: 270
Total inference time: 543.30 seconds
Average inference time per sample: 2.0122 seconds
Estimated inference time per 1000 samples: 2012.20 seconds
Total input tokens consumed: 162541
Total output tokens consumed: 38306
Approximate API Cost for 270 samples: $0.000000
Estimated API Cost per 1000 samples: $0.000000


In [49]:
# Build a dataframe with all relevant info
results_df = pd.DataFrame({
    "y_true": y_true,
    "y_pred": y_pred,
    "instruction_synthetic": X_test_sampled["instruction_synthetic"].values,
    "category": X_test_sampled["category"].values
})

# Filter wrong predictions
wrong_df = results_df[results_df["y_true"] != results_df["y_pred"]]

print("Number of wrong predictions:", len(wrong_df))
wrong_df.head(27)

Number of wrong predictions: 2


Unnamed: 0,y_true,y_pred,instruction_synthetic,category
13950,edit_account,switch_account,mldifying Savings account,ACCOUNT
7952,contact_customer_service,contact_human_agent,"I have got to talk with customer assistance, h...",CONTACT


In [50]:
from tqdm import tqdm
import time
import pandas as pd

# Load test set
finaltest = pd.read_csv("/kaggle/input/mmai894-test-set/test.csv")

# Storage for predictions
submission_predictions = []

print(f"Running inference on {len(finaltest)} test samples...\n")

for idx, row in tqdm(finaltest.iterrows(), total=len(finaltest), desc="LLM Test Inference"):
    
    category = row["category"]
    instruction = row["instruction"]

    # Build prompt using your function
    prompt = create_zero_shot_prompt(
        category,
        instruction,
        all_unique_intents_bulleted  # same as training/validation
    )
    
    # Run LLM
    pred, inf_time, in_tokens, out_tokens, cost = safe_llm_predict(
        prompt,
        timeout=60
    )
    
    # Append prediction
    submission_predictions.append(pred)

    # Delay to reduce rate limits
    time.sleep(0.35)

print("\nInference completed.\n")

# Build submission file
submission_df = pd.DataFrame({
    "id": finaltest["id"],
    "intent": submission_predictions
})

# Save
submission_df.to_csv("submission.csv", index=False)

print("Saved submission.csv!")

submission_df.head()


Running inference on 270 test samples...



LLM Test Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 270/270 [12:06<00:00,  2.69s/it]


Inference completed.

Saved submission.csv!





Unnamed: 0,id,intent
0,1,contact_customer_service
1,2,switch_account
2,3,contact_human_agent
3,4,create_account
4,5,contact_human_agent
