# Workshop solutions for evaluating LLM-based applications

## Evaluation framework for an LLM-based app

### Setup and dataset prep

In [1]:
import pandas as pd
import requests
from evidently import ColumnMapping
from evidently.descriptors import *
from evidently.metric_preset import TextEvals
from evidently.metrics import *
from evidently.report import Report
from evidently.test_suite import TestSuite
from evidently.tests import *
from io import BytesIO

In [2]:
# Download the workshop's QA log export. The timeout keeps the cell from
# hanging indefinitely if the connection stalls.
response = requests.get(
    "https://raw.githubusercontent.com/pyladiesams/eval-llm-based-apps-jan2025/main/assets/QA.csv",
    timeout=30,
)
# Fail loudly on a 4xx/5xx reply instead of handing an error page to the CSV parser.
response.raise_for_status()

# Parse the downloaded bytes; the first CSV column becomes the initial index and
# both timestamp columns are parsed as datetimes.
qa_csv_content = BytesIO(response.content)
qa_logs = pd.read_csv(qa_csv_content, index_col=0, parse_dates=['start_time', 'end_time'])

# Re-index the rows by start time (the 'start_time' column itself is kept) and
# name the index 'index' -- done in one assignment, without in-place mutation.
qa_logs.index = qa_logs['start_time'].rename('index')

In [3]:
# Describe the role of each QA-log column for Evidently: free-text columns,
# categorical columns, and the two timestamp columns.
column_mapping = ColumnMapping(
    text_features=['question', 'response'],
    categorical_features=['organization', 'model_ID', 'region', 'environment', 'feedback'],
    datetime_features=['end_time'],
    datetime='start_time',
)

### Text statistics

**Exercise 1: Solution**

Get a side-by-side comparison of the sentence counts for the first 100 rows and the next 100 rows of the same dataframe. Use the `SentenceCount()` descriptor.

In [None]:
# Exercise 1: sentence counts of the responses.
sentence_count_evals = TextEvals(
    column_name="response",
    descriptors=[SentenceCount()],
)
text_evals_report = Report(metrics=[sentence_count_evals])

# Rows 0-99 are passed as the reference and rows 100-199 as the current data,
# which is what gives the side-by-side comparison the exercise asks for.
text_evals_report.run(
    reference_data=qa_logs.iloc[:100],
    current_data=qa_logs.iloc[100:200],
    column_mapping=column_mapping,
)
text_evals_report

### Text patterns

**Exercise 2: Solution**

Check whether each of the first 200 responses contains a link. Set the display name of this eval to "Contains links".

In [None]:
# Exercise 2: link check on the responses, shown under a custom display name.
contains_link_evals = TextEvals(
    column_name="response",
    descriptors=[ContainsLink(display_name="Contains links")],
)
text_evals_report = Report(metrics=[contains_link_evals])

# No reference dataset: only the first 200 rows are evaluated.
text_evals_report.run(
    reference_data=None,
    current_data=qa_logs.iloc[:200],
    column_mapping=column_mapping,
)
text_evals_report

### ML-based evaluation

**Sentiment analysis**

**Exercise 3: Solution** 

Run a sentiment check on the first 200 responses.

In [None]:
# Exercise 3: sentiment of the responses.
sentiment_evals = TextEvals(
    column_name="response",
    descriptors=[Sentiment()],
)
text_evals_report = Report(metrics=[sentiment_evals])

# No reference dataset: only the first 200 rows are evaluated.
text_evals_report.run(
    reference_data=None,
    current_data=qa_logs.iloc[:200],
    column_mapping=column_mapping,
)
text_evals_report

## Test suite with evals for an LLM-based app

**Exercise 4: Solution**

Extend the current test suite with a new condition: the average response sentiment is positive.

In [None]:
# Exercise 4: three text-length conditions on the responses plus one
# sentiment condition.
test_suite = TestSuite(tests=[
    # Minimum response text length must be above 0.
    TestColumnValueMin(column_name=TextLength().on("response"), gt=0),
    # Maximum response text length must not exceed 1800.
    TestColumnValueMax(column_name=TextLength().on("response"), lte=1800),
    # Mean response text length must be above 500.
    TestColumnValueMean(column_name=TextLength().on("response"), gt=500),
    # New condition: the mean sentiment must be positive. A strict `gt` is used
    # because `gte=0` would also pass on a mean of exactly 0, which is not positive.
    TestColumnValueMean(column_name=Sentiment().on("response"), gt=0),
])

# No reference dataset: the conditions are checked on the first 200 rows only.
test_suite.run(
    reference_data=None,
    current_data=qa_logs[:200],
    column_mapping=column_mapping,
)
test_suite