In [46]:
from langchain.schema import HumanMessage
import re
import boto3

In [47]:
from langchain_aws import ChatBedrock

llm = ChatBedrock(
    credentials_profile_name="sf-test", 
    model_id="arn:aws:bedrock:us-east-1:225285538696:custom-model/cohere.command-light-text-v14:7:4k/3u1xe1y3cmpn"
)

In [48]:
import json

data = []
with open('test_data.jsonl', 'r') as f:
    for line in f:
        data_line = json.loads(line)
        data.append(data_line)

print(f"Loaded {len(data)} data_lines")

Loaded 215 data_lines


In [49]:
def parse_llm_response(response: str):
    try:
        # Match basic Python-like list of tuples
        pattern = r"\([^)]+\)"
        tuples = re.findall(pattern, response)
        result = []
        for t in tuples:
            parts = [p.strip().strip("'\"") for p in t.strip("()").split(",")]
            if len(parts) == 3:
                result.append(tuple(map(str.lower, parts)))
        return result
    except Exception as e:
        print(f"Parse error: {e}\nResponse: {response}")
        return []

def evaluate_predictions(gold, pred):
    gold_set = set(gold)
    pred_set = set(pred)
    tp = len(gold_set & pred_set)
    fp = len(pred_set - gold_set)
    fn = len(gold_set - pred_set)
    precision = tp / (tp + fp + 1e-5)
    recall = tp / (tp + fn + 1e-5)
    f1 = 2 * precision * recall / (precision + recall + 1e-5)
    return precision, recall, f1

# Evaluate the predictions
results = []
for i, data_line in enumerate(data): 
    expected = eval(data_line['completion'])
    
    # Get response and parse it
    messages = [HumanMessage(content=data_line['prompt'])]
    accept = 'application/json'
    content_type = 'application/json'

    bedrock = boto3.client(service_name='bedrock-runtime')

    body = json.dumps({
        "prompt": data_line['prompt'],
        "max_tokens": 200,
        "temperature": 0.2,
        "p": 1,
        "k": 0,
        "num_generations": 1,
        "return_likelihoods": "GENERATION"
    })

    response = bedrock.invoke_model(
        body=body,
        modelId="arn:aws:bedrock:us-east-1:225285538696:provisioned-model/v3xjyaqeu1w4",
        accept=accept,
        contentType=content_type
    )
    response_body = json.loads(response.get('body').read())
    generations = response_body.get('generations')
    predicted = parse_llm_response(generations[0]['text'])
    
    # Calculate metrics
    precision, recall, f1 = evaluate_predictions(expected, predicted)
    results.append({
        'data_line': i+1,
        'Prompt': str(data_line['prompt']).strip(),
        'Expected': expected,
        'Predicted': predicted,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
    })

avg_p = sum(r['Precision'] for r in results) / len(results)
avg_r = sum(r['Recall'] for r in results) / len(results)
avg_f = sum(r['F1'] for r in results) / len(results)
print(f"Precision: {avg_p:.2f}, Recall: {avg_r:.2f}, F1: {avg_f:.2f}")

Precision: 0.24, Recall: 0.25, F1: 0.25


In [50]:
import pandas as pd

# Convert results list to DataFrame
df_results = pd.DataFrame(results)

# Save to CSV
df_results.to_csv('evaluation_results_custom_2.csv', index=False)