## Prepare for human evaluation

In [6]:
import ast
import json

import pandas as pd

# Read the gzip-compressed CSV of LLM-judge evaluation results into a DataFrame
file_path = r"C:\Users\tys\Documents\Coding\FYP-enhancing-churn-prediction-with-slm-and-llm\data\output\llm_judge_evaluation_results_TEST.csv.gz"
df = pd.read_csv(file_path, compression='gzip')

# Report the frame's dimensions and column names as a quick sanity check
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Leave the first rows as the cell's last expression so they are rendered
df.head()

Dataset shape: (30, 6)
Columns: ['index', 'explanation', 'predicted_label', 'top_5_shap_magnitudes', 'top_5_shap_values', 'evaluation']


Unnamed: 0,index,explanation,predicted_label,top_5_shap_magnitudes,top_5_shap_values,evaluation
0,0,The prediction indicates this customer is like...,0,"{'membership_category': 'very strong', 'avg_fr...","{'membership_category': -8.527676582336426, 'a...","{'completeness': {'score': 8.0}, 'faithfulness..."
1,1,This customer is likely to churn primarily due...,1,"{'membership_category': 'strong', 'avg_frequen...","{'membership_category': 2.813594102859497, 'av...","{'completeness': {'score': 10.0}, 'faithfulnes..."
2,2,The model indicates this customer is likely to...,1,"{'membership_category': 'strong', 'avg_frequen...","{'membership_category': 3.10160756111145, 'avg...","{'completeness': {'score': 8.0}, 'faithfulness..."
3,3,The model predicts this customer is unlikely t...,0,"{'membership_category': 'very strong', 'points...","{'membership_category': -9.17963695526123, 'po...","{'completeness': {'score': 10.0}, 'faithfulnes..."
4,4,The prediction indicates this customer is like...,1,"{'membership_category': 'strong', 'avg_frequen...","{'membership_category': 2.37990665435791, 'avg...","{'completeness': {'score': 10.0}, 'faithfulnes..."


In [7]:
# Helpers that render one evaluation row as a plain-text block for human review
def _parse_structured_field(value):
    """Convert a serialized dict/list string into a Python object without executing it.

    Non-string values are returned unchanged. Strings are first parsed as a
    Python literal (``ast.literal_eval``), then as JSON; if neither parser
    accepts the text, the original string is returned as-is.
    """
    if not isinstance(value, str):
        return value

    # literal_eval only accepts literals, so text coming from the CSV can never
    # run code the way eval() would.
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # JSON spelling (true/false/null, double quotes) is not valid Python literal
    # syntax, so give the JSON parser a chance before giving up.
    try:
        return json.loads(value)
    except (ValueError, TypeError, RecursionError):
        return value


def format_evaluation_text(row):
    """Build the human-evaluation text block for a single sample.

    Args:
        row: Mapping-like object (e.g. a pandas Series or a dict) that provides
            the keys 'explanation', 'predicted_label', 'top_5_shap_magnitudes',
            'top_5_shap_values' and 'evaluation'.

    Returns:
        A multi-line string containing the narrative, the predicted label and
        the three scoring fields, each under its section heading.

    Notes:
        Each scoring field is parsed independently, so one field that cannot be
        parsed no longer forces the other two to be left as raw strings.
    """
    top_5_shap_magnitudes = _parse_structured_field(row['top_5_shap_magnitudes'])
    top_5_shap_values = _parse_structured_field(row['top_5_shap_values'])
    evaluation = _parse_structured_field(row['evaluation'])
    
    formatted_text = f"""LLM-generated narratives
==========================
{row['explanation']}

Predicted Target variable
===========================
predicted_label: {row['predicted_label']}

LLM-generated scoring to evaluate narratives
================================================
{top_5_shap_magnitudes}

{top_5_shap_values}

{evaluation}"""
    
    return formatted_text

# Work on an independent copy of the first 30 rows of the loaded results
sample_df = df.head(30).copy()

# One record per sample: the row's index label paired with its rendered text block
formatted_texts = [
    {
        'sample_id': row_label,
        'formatted_text': format_evaluation_text(record),
    }
    for row_label, record in sample_df.iterrows()
]

print(f"Generated {len(formatted_texts)} formatted evaluation texts")

Generated 30 formatted evaluation texts


In [8]:
# Print a header and rule, then the first rendered sample, to eyeball the layout
print("Sample 1:", "=" * 50, sep="\n")
print(formatted_texts[0]['formatted_text'])

Sample 1:
LLM-generated narratives
The prediction indicates this customer is likely to stay. Their membership category significantly reduces the chance of churn. Additionally, they log in frequently, which also helps retain them. However, factors like age and using special discounts slightly increase the risk of leaving, but overall, they remain a valued customer.

Predicted Target variable
predicted_label: 0

LLM-generated scoring to evaluate narratives
{'membership_category': 'very strong', 'avg_frequency_login_days': 'strong', 'age': 'moderate', 'used_special_discount_Yes': 'weak', 'internet_option_Wi-Fi': 'weak'}

{'membership_category': -8.527676582336426, 'avg_frequency_login_days': -1.5842729806900024, 'age': 0.4120582640171051, 'used_special_discount_Yes': 0.18287909030914307, 'internet_option_Wi-Fi': 0.1736346036195755}

{'completeness': {'score': 8.0}, 'faithfulness': {'score': 8.0, 'details': 'Calculated by validation function'}, 'raw_features': [{'feature': 'membership_cate

In [9]:
# Assemble the export: each sample gets a numbered header, a short rule, its
# rendered text, and a long rule that separates it from the next sample
output_texts = []
for sample_number, sample in enumerate(formatted_texts, start=1):
    output_texts.extend([
        f"SAMPLE {sample_number}",
        "=" * 50,
        sample['formatted_text'],
        "\n" + "=" * 100 + "\n",
    ])

# Persist everything as one UTF-8 text file for the human evaluators
output_file = r"C:\Users\tys\Documents\Coding\FYP-enhancing-churn-prediction-with-slm-and-llm\data\output\human_evaluation_samples.txt"
with open(output_file, 'w', encoding='utf-8') as handle:
    handle.write("\n".join(output_texts))

print(f"Formatted evaluation texts saved to: {output_file}")

Formatted evaluation texts saved to: C:\Users\tys\Documents\Coding\FYP-enhancing-churn-prediction-with-slm-and-llm\data\output\human_evaluation_samples.txt


In [10]:
# Compact reference table for evaluators: sample id, predicted label, and a
# preview of the narrative (cut to 100 characters plus "..." when longer)
summary_df = pd.DataFrame({
    'sample_id': [entry['sample_id'] for entry in formatted_texts],
    'predicted_label': sample_df['predicted_label'].values,
    'narrative_preview': [
        narrative if len(narrative) <= 100 else narrative[:100] + "..."
        for narrative in sample_df['explanation']
    ],
})

print("Summary of 30 evaluation samples:")
summary_df

Summary of 30 evaluation samples:


Unnamed: 0,sample_id,predicted_label,narrative_preview
0,0,0,The prediction indicates this customer is like...
1,1,1,This customer is likely to churn primarily due...
2,2,1,The model indicates this customer is likely to...
3,3,0,The model predicts this customer is unlikely t...
4,4,1,The prediction indicates this customer is like...
5,5,1,The prediction indicates this customer is like...
6,6,0,The model predicts this customer is likely to ...
7,7,0,The model predicts this customer is unlikely t...
8,8,1,The model indicates this customer is likely to...
9,9,1,The prediction indicates this customer is like...
