# Thematic Analysis with LLMs

## This notebook implements a pipeline for thematic analysis of text data using Amazon Bedrock and Anthropic's Claude model, followed by evaluation of the generated themes.

### Step 1. Initial Setup

In [1]:
!pip install boto3 pandas numpy simpledorff scipy

Collecting simpledorff
  Downloading simpledorff-0.0.2-py3-none-any.whl.metadata (2.5 kB)
Downloading simpledorff-0.0.2-py3-none-any.whl (5.6 kB)
Installing collected packages: simpledorff
Successfully installed simpledorff-0.0.2


In [2]:
import boto3
import json
from datetime import datetime
from botocore.config import Config
import re


# Initialize S3 client
s3_client = boto3.client('s3')

#Optional: Check which models you have access to 
bedrock = boto3.client('bedrock')
response = bedrock.list_foundation_models()

# Initialize AWS clients
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',  # Important: Replace with your region if different
    config=Config(
        signature_version='v4',
        retries={
            'max_attempts': 3,
            'mode': 'standard'
        }
    )
)

# Optional: Print available models
for model in response['modelSummaries']:
    print(f"Model ID: {model['modelId']}")
    if model["inferenceTypesSupported"] == "ON_DEMAND":
        print(model)
#     break

Model ID: amazon.titan-tg1-large
Model ID: amazon.titan-image-generator-v1:0
Model ID: amazon.titan-image-generator-v1
Model ID: amazon.titan-image-generator-v2:0
Model ID: amazon.nova-premier-v1:0:8k
Model ID: amazon.nova-premier-v1:0:20k
Model ID: amazon.nova-premier-v1:0:1000k
Model ID: amazon.nova-premier-v1:0:mm
Model ID: amazon.nova-premier-v1:0
Model ID: amazon.titan-text-premier-v1:0
Model ID: amazon.nova-pro-v1:0:24k
Model ID: amazon.nova-pro-v1:0:300k
Model ID: amazon.nova-pro-v1:0
Model ID: amazon.nova-lite-v1:0:24k
Model ID: amazon.nova-lite-v1:0:300k
Model ID: amazon.nova-lite-v1:0
Model ID: amazon.nova-canvas-v1:0
Model ID: amazon.nova-reel-v1:0
Model ID: amazon.nova-reel-v1:1
Model ID: amazon.nova-micro-v1:0:24k
Model ID: amazon.nova-micro-v1:0:128k
Model ID: amazon.nova-micro-v1:0
Model ID: amazon.nova-sonic-v1:0
Model ID: amazon.titan-embed-g1-text-02
Model ID: amazon.titan-text-lite-v1:0:4k
Model ID: amazon.titan-text-lite-v1
Model ID: amazon.titan-text-express-v1:0:8

In [6]:
# Configure the S3 for input text file as well as to upload the output
bucket = 'my-example-name'
raw_input = 'feedback_dummy_data.txt'
output_themes = 'feedback_analyzed_claude.txt'

### Step 2: Generate Thematic Summaries

In [7]:
def analyze_comment(comment, model_id="amazon.nova-pro-v1:0"):
    """
    Analyze customer comments using Nova Pro in a structured format.
    Returns thematic analysis of customer feedback.
    """
    
    # Define system prompt
    system_list = [
        {
            "text": "You are a Customer Feedback Analyst responsible for identifying key themes and concerns in customer reviews. Your role is to extract meaningful insights that can drive product improvements within the team."
        }
    ]

    # Define the user message
    message_list = [
        {
            "role": "user",
            "content": [
                {
                    "text": f"""Analyze this customer review and provide insights in JSON format:

Review: "{comment}"

Return only this JSON structure with your analysis:
{{
    "main_theme": "primary theme identified",
    "sub_theme": "secondary theme or specific aspect",
    "rationale": "explanation of thematic analysis"
}}

Provide only the JSON response without additional text."""
                }
            ]
        }
    ]

    # Configure inference parameters
    inf_params = {
        "maxTokens": 1000,
        "temperature": 0.1,
        "topP": 0.9,
        "topK": 20
    }

    # Construct the request body
    request_body = {
        "schemaVersion": "messages-v1",
        "messages": message_list,
        "system": system_list,
        "inferenceConfig": inf_params
    }

    # Make the request to Nova Pro
    response = bedrock_runtime.invoke_model_with_response_stream(
        modelId=model_id,
        body=json.dumps(request_body)
    )

    # Process the response stream
    full_response = ""
    stream = response.get("body")
    if stream:
        for event in stream:
            chunk = event.get("chunk")
            if chunk:
                chunk_json = json.loads(chunk.get("bytes").decode())
                content_block_delta = chunk_json.get("contentBlockDelta")
                if content_block_delta:
                    full_response += content_block_delta.get("delta").get("text", "")

    # Extract and validate JSON response
    try:
    	json_match = re.search(r'(\{.*\})', full_response, re.DOTALL)
    	if json_match:
        	json_str = json_match.group(1)
        	return json.loads(json_str)
    	else:
        	raise json.JSONDecodeError("No JSON found", full_response, 0)
    except Exception as e:
        return {
            "main_theme": "Error in analysis",
            "sub_theme": "Processing error",
            "rationale": str(e)
        }

In [11]:
response = s3_client.get_object(Bucket=bucket, Key=raw_input)
feedbacks = []

for line in response['Body'].iter_lines():
    decoded_line = line.decode('utf-8')
    feedbacks.append(decoded_line)

# Analyze each comment and store results
analyzed_results = []
output_content = ""

for i, feedback in enumerate(feedbacks):
    if feedback.strip():  # Skip empty lines
        print(f"Analyzing comment {i+1} of {len(feedbacks)}")
        analysis = analyze_comment(feedback)

#Optional: For debugging
#         print("Raw response for feedback:", feedback)
#         print(analysis)

        result = {
            "original_comment": feedback,
            "analysis": analysis
        }
        analyzed_results.append(result)

# Format output for each result
        entry = {
            "Original Comment": feedback,
            "main_theme": analysis['main_theme'],
            "sub_theme": analysis['sub_theme'],
            "rationale": analysis['rationale']
        }
        output_content += "\n".join(f"{k}: {v}" for k, v in entry.items()) + "\n\n"

# Save results back to S3
s3_client.put_object(
    Bucket=bucket,
    Key=output_themes,
    Body=output_content.encode('utf-8')
)

print(f"Analysis complete. Results saved to s3://{bucket}/{output_themes}")


Analyzing comment 1 of 1
Analysis complete. Results saved to s3://genai-demos-bucket/feedback_analyzed_claude.txt


In [12]:
#Optional: Read the analyzed results file
response = s3_client.get_object(Bucket=bucket, Key=output_themes)
content = response['Body'].read().decode('utf-8')

# Split by double newlines since that's how entries are separated
entries = content.split('\n\n')


# Display first k entries 
k = 3
for i, entry in enumerate(entries[:k]):
    if entry.strip():  # Skip empty entries
        print(f"\nEntry {i+1}:")
        print(entry.strip())


Entry 1:
Original Comment: Affordable and reliable.
main_theme: positive feedback
sub_theme: affordability and reliability
rationale: The review highlights two key attributes of the product: it is 'affordable' and 'reliable', indicating customer satisfaction with both the price and the performance of the product.


### Step 3. Deploy multiple pre-trained LLMs as judges

In [31]:
def evaluate_alignment_nova_product(comment, theme, subtheme, rationale, model_id):
    """Evaluate theme alignment using Nova model"""
    
    # Define system prompt
    system_list = [
        {
            "text": "You are a Product Researcher analyzing customer feedback. Your role is to evaluate how accurately our thematic analysis captures customer sentiments and experiences. Use your expertise in customer insights and product research to assess theme alignment."
        }
    ]

    # Define the user message
    message_list = [
        {
            "role": "user", 
            "content": [
                {
                    "text": f"""As a Product Researcher, evaluate the theme alignment using this scale:
1: Poor - Our thematic analysis misses key customer product requirements
2: Partial - Our analysis captures some but not all important product requirements feedback
3: Strong - Our thematic analysis effectively captures the customer's requirements

CUSTOMER FEEDBACK ANALYSIS:
Voice of Customer: "{comment}"
Primary Theme: {theme}
Secondary Theme: {subtheme} 
Analysis Rationale: {rationale}

Provide your research assessment in this JSON format:
{{
    "alignment_score": <1, 2, or 3>,
    "justification": "brief explanation of your evaluation from a research perspective"
}}

Return only valid JSON with no additional commentary."""
                }
            ]
        }
    ]

    # Configure inference parameters
    inf_params = {
        "maxTokens": 500,
        "topP": 0.9,
        "topK": 20,
        "temperature": 0.1
    }

    # Construct the request body
    request_body = {
        "schemaVersion": "messages-v1",
        "messages": message_list,
        "system": system_list,
        "inferenceConfig": inf_params,
    }

    # Make the request
    response = bedrock_runtime.invoke_model_with_response_stream(
        modelId=model_id,
        body=json.dumps(request_body)
    )

    # Process the response stream
    full_response = ""
    stream = response.get("body")
    if stream:
        for event in stream:
            chunk = event.get("chunk")
            if chunk:
                chunk_json = json.loads(chunk.get("bytes").decode())
                content_block_delta = chunk_json.get("contentBlockDelta")
                if content_block_delta:
                    full_response += content_block_delta.get("delta").get("text", "")


    json_match = re.search(r'(\{.*\})', full_response, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
        return json.loads(json_str)
    else:
        raise json.JSONDecodeError("No JSON found", full_response, 0)

## Deploying Nova Pro in a Sales persona

def evaluate_alignment_nova_sales(comment, theme, subtheme, rationale, model_id):
    """Evaluate theme alignment using Nova model"""
    
    # Define system prompt
    system_list = [
        {
            "text": "You are a Customer Experience Advocate whose primary focus is understanding the emotional context and underlying customer needs. Your role is to analyze feedback from the customer's perspective, identifying both explicit and implicit emotional signals. Use your expertise in customer sales to assess theme alignment."
        }
    ]

    # Define the user message
    message_list = [
        {
            "role": "user", 
            "content": [
                {
                    "text": f"""As a Sales Representative, evaluate the theme alignment using this scale:
1: Poor - The thematic analysis misses key customer sentiments
2: Partial - The analysis captures some but not all important sentiments
3: Strong - The thematic analysis effectively captures all the customer's core sentiments

CUSTOMER FEEDBACK ANALYSIS:
Voice of Customer: "{comment}"
Primary Theme: {theme}
Secondary Theme: {subtheme} 
Analysis Rationale: {rationale}

Provide your research assessment in this JSON format:
{{
    "alignment_score": <1, 2, or 3>,
    "justification": "brief explanation of your evaluation from a research perspective"
}}

Return only valid JSON with no additional commentary."""
                }
            ]
        }
    ]

    # Configure inference parameters
    inf_params = {
        "maxTokens": 500,
        "topP": 0.9,
        "topK": 20,
        "temperature": 0.1
    }

    # Construct the request body
    request_body = {
        "schemaVersion": "messages-v1",
        "messages": message_list,
        "system": system_list,
        "inferenceConfig": inf_params,
    }

    # Make the request
    response = bedrock_runtime.invoke_model_with_response_stream(
        modelId=model_id,
        body=json.dumps(request_body)
    )

    # Process the response stream
    full_response = ""
    stream = response.get("body")
    if stream:
        for event in stream:
            chunk = event.get("chunk")
            if chunk:
                chunk_json = json.loads(chunk.get("bytes").decode())
                content_block_delta = chunk_json.get("contentBlockDelta")
                if content_block_delta:
                    full_response += content_block_delta.get("delta").get("text", "")

    # Extract JSON from response
    json_match = re.search(r'(\{.*\})', full_response, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
        return json.loads(json_str)
    else:
        raise json.JSONDecodeError("No JSON found", full_response, 0)

## Deploying Claude 3.5 Sonnet 

def evaluate_alignment_claude(comment, theme, subtheme, rationale, model_id):
    """Evaluate theme alignment using Claude model"""
    
    # In Claude's API format, system message is a separate top-level parameter
    system_message = "You are a customer research executive evaluating how accurately themes match a customer comment. Provide your evaluation as JSON."
    
    # Define the messages (only user and assistant roles allowed)
    messages = [
        {
            "role": "user", 
            "content": f"""Rate ONLY the theme alignment on this scale:
1: Poor match - themes don't capture the main points
2: Partial match - themes capture some but not all key points
3: Strong match - themes accurately capture the main points

REVIEW DETAILS:
Comment: "{comment}"
Main Theme: {theme}
Sub-theme: {subtheme} 
Rationale: {rationale}

Respond ONLY with this JSON structure:
{{
    "alignment_score": <1, 2, or 3>,
    "justification": "brief explanation of score"
}}

Your response must be valid JSON with no other text."""
        }
    ]

    # Construct the request body according to Claude's API format
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "system": system_message,
        "messages": messages,
        "max_tokens": 500,
        "temperature": 0.7,
        "top_p": 0.9
    }

    # Make the request
    response = bedrock_runtime.invoke_model_with_response_stream(
        modelId=model_id,
        body=json.dumps(request_body)
    )

    # Process the response stream
    full_response = ""
    stream = response.get("body")
    if stream:
        for event in stream:
            chunk = event.get("chunk")
            if chunk:
                chunk_data = json.loads(chunk.get("bytes").decode())
                if chunk_data.get("type") == "content_block_delta":
                    delta = chunk_data.get("delta", {})
                    if "text" in delta:
                        full_response += delta["text"]

    json_match = re.search(r'(\{.*\})', full_response, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
        return json.loads(json_str)
    else:
        raise json.JSONDecodeError("No JSON found", full_response, 0)


model_name_to_id_map = {"nova_product": "amazon.nova-lite-v1:0",
               "nova_sales" : "amazon.nova-pro-v1:0",
               "claude_v3_5": "anthropic.claude-3-5-sonnet-20240620-v1:0"}

model_name_to_id_map = {"nova_product": "amazon.nova-lite-v1:0",
               "nova_sales" : "amazon.nova-pro-v1:0",
               "claude_v3_5": "us.anthropic.claude-3-5-haiku-20241022-v1:0"}

In [32]:
bucket = 'my-example-name'
raw_input = 'feedback_dummy_data.txt'
output_themes = 'feedback_analyzed_claude.txt'

def get_alignment_func(model_type):
    if model_type == "nova_product":
        return evaluate_alignment_nova_product
    elif model_type == "nova_sales":
        return evaluate_alignment_nova_sales
    elif model_type == "claude_v3_5":
        return evaluate_alignment_claude
    else:
        print(f"Alignment function for model:{model_id} is not defined")
        raise NotImplementedError

def evaluate_alignment_model(s3_client, model_name, bucket=bucket, key='feedback_analyzed.txt'):
    # Rest of the code remains the same
    # Read analyzed results
    response = s3_client.get_object(Bucket=bucket, Key=key)
    content = response['Body'].read().decode('utf-8')

    # Split into entries
    entries = content.split('\n\n')

    # Process each entry
    output_content = ""
    alignments = []
    
    alignment_func = get_alignment_func(model_name)

    for i, entry in enumerate(entries):
        if not entry.strip():
            print(f"{i+1} entry is empty. Skipping")
            continue
        # Parse entry
        lines = entry.strip().split('\n')
        entry_dict = {}
        for line in lines:
            if ':' in line:
                key, value = line.split(':', 1)
                entry_dict[key.strip()] = value.strip()

        # # Get alignment score
        
        alignments.append(alignment_func(
            entry_dict['Original Comment'],
            entry_dict['main_theme'],
            entry_dict['sub_theme'],
            entry_dict['rationale'],
            model_name_to_id_map[model_name]
        ))
        print(f"Processed {i+1} / {len(entries)} entries with {model_name}")
    return alignments

In [35]:
alignment_dict = {}

alignment_dict["nova_product"] = evaluate_alignment_model(s3_client=s3_client, 
                                                  model_name='nova_product',
                                                    key = output_themes)

alignment_dict["nova_sales"] = evaluate_alignment_model(s3_client=s3_client, 
                                                   model_name='nova_sales',
                                                    key = output_themes)
alignment_dict["claude"] = evaluate_alignment_model(s3_client=s3_client, 
                                                    model_name='claude_v3_5',
                                                    key = output_themes)

Processed 1 / 2 entries with nova_product
Processed 2 / 2 entries with nova_product


### Step 4. Implement Comparative Evaluation Metrics

In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import simpledorff
from scipy import stats
import pprint


def create_df(alignment_dict):
    if not alignment_dict:
        raise ValueError("Empty alignment dictionary provided")

    model_names = list(alignment_dict.keys())
    res = pd.DataFrame(columns=model_names)
    
    # Get expected length from first model
    N = len(alignment_dict[model_names[0]])
    
    # Validate all models have same length
    for model_name, values in alignment_dict.items():
        if len(values) != N:
            raise ValueError(
                f"Inconsistent lengths detected: {model_name} has length {len(values)}, "
                f"expected {N}"
            )
    
    # Create DataFrame
    for i in range(N):
        this_dict = {}
        for model_name, v in alignment_dict.items():
            this_dict[model_name] = v[i]['alignment_score']
        res = pd.concat((res, pd.DataFrame.from_dict([this_dict])), ignore_index=True)
                        
    return res

def calculate_percentage_agreement(df):
    """Calculate the percentage of perfect agreement between all raters"""
    total_rows = len(df)
    perfect_agreement_rows = sum(df.nunique(axis=1) == 1)
    return (perfect_agreement_rows / total_rows) * 100

def calculate_pairwise_cohens_kappa(df):
    """Calculate Cohen's Kappa for each pair of raters"""
    models = df.columns
    kappa_results = {}
    
    for i in range(len(models)):
        for j in range(i+1, len(models)):
            model1, model2 = models[i], models[j]
            # Remove rows where either rater has missing values
            valid_ratings = df[[model1, model2]].dropna()
            
            if len(valid_ratings) > 0:
                kappa = cohen_kappa_score(
                    valid_ratings[model1].astype(int),
                    valid_ratings[model2].astype(int)
                )
                kappa_results[f"{model1} vs {model2}"] = kappa
    
    return kappa_results

def calculate_krippendorffs_alpha(df): 
    """Calculate Krippendorff's alpha for all raters""" 
    # Reshape the data into the format required by simpledorff 
    data = df.reset_index().melt(id_vars=['index'], var_name='annotator_id', value_name='annotation') 
    data = data.rename(columns={'index': 'document_id'}) 
    
    return simpledorff.calculate_krippendorffs_alpha_for_df( 
        data, 
        experiment_col='document_id', 
        annotator_col='annotator_id', 
        class_col='annotation' 
    ) 

def calculate_spearmans_rho(df):
    """Calculate Spearman's rho for each pair of raters"""
    models = df.columns
    rho_results = {}
    
    for i in range(len(models)):
        for j in range(i+1, len(models)):
            model1, model2 = models[i], models[j]
            # Remove rows where either rater has missing values
            valid_ratings = df[[model1, model2]].dropna()
            
            if len(valid_ratings) > 0:
                rho, p_value = stats.spearmanr(
                    valid_ratings[model1],
                    valid_ratings[model2]
                )
                rho_results[f"{model1} vs {model2}"] = {
                    'rho': rho,
                    'p_value': p_value
                }
    
    return rho_results

In [55]:
# Extract ratings into a DataFrame
ratings_df = create_df(alignment_dict)

# Calculate statistics
stats_results = {
    'Percentage Agreement': calculate_percentage_agreement(ratings_df),
    'Cohens Kappa': calculate_pairwise_cohens_kappa(ratings_df),
    'Krippendorffs Alpha': calculate_krippendorffs_alpha(ratings_df),
    'Spearmans Rho': calculate_spearmans_rho(ratings_df)
}

  rho, p_value = stats.spearmanr(
  rho, p_value = stats.spearmanr(
  rho, p_value = stats.spearmanr(


In [56]:
pprint.pprint(stats_results)

{'Cohens Kappa': {'nova_product vs claude': 0.0,
                  'nova_product vs nova_sales': 0.0,
                  'nova_sales vs claude': 0.0},
 'Krippendorffs Alpha': -0.13636363636363624,
 'Percentage Agreement': 0.0,
 'Spearmans Rho': {'nova_product vs claude': {'p_value': nan, 'rho': nan},
                   'nova_product vs nova_sales': {'p_value': nan, 'rho': nan},
                   'nova_sales vs claude': {'p_value': nan, 'rho': nan}}}
