Создание csv файла, чтобы посчитать метрики

In [None]:
# Restore the "# id:" header lines for raw files.
def add_ids_to_intents(intent_file_path, id_file_path, output_file_path):
    """Copy the intent file, inserting an ``# id:`` line before each intent line.

    Ids are collected, in order, from the lines of ``id_file_path`` that start
    with ``# id:`` (optional whitespace after ``#``); the id is the first
    whitespace-delimited token after the colon. Every line of
    ``intent_file_path`` that starts with ``# intent =`` then gets the next id
    written directly above it. All other lines are copied unchanged.

    Args:
        intent_file_path: Path of the UTF-8 file containing ``# intent =`` lines.
        id_file_path: Path of the UTF-8 file containing ``# id:`` lines.
        output_file_path: Path of the file to write (overwritten if it exists).

    Raises:
        ValueError: If an ``# id:`` line carries no value, or if the number of
            ids differs from the number of intent lines. Both checks run before
            the output file is opened, so no partial output is written.
    """
    # Local import: in this notebook `re` is only imported by a later cell,
    # so relying on the global name fails when cells run top to bottom.
    import re

    id_marker = re.compile(r'#\s*id:')
    id_pattern = re.compile(r'#\s*id:\s*(\S+)')
    intent_prefix = '# intent ='

    with open(intent_file_path, 'r', encoding='utf-8') as intent_file:
        intent_lines = intent_file.readlines()
    with open(id_file_path, 'r', encoding='utf-8') as id_file:
        id_lines = id_file.readlines()

    ids = []
    for line_number, line in enumerate(id_lines, start=1):
        if not id_marker.match(line):
            continue
        match = id_pattern.match(line)
        if match is None:
            # An id marker without a value would otherwise surface as an
            # opaque AttributeError on `None.group`.
            raise ValueError(
                f"Строка {line_number} файла с ID не содержит значения id: {line!r}"
            )
        ids.append(match.group(1))

    intent_count = sum(1 for line in intent_lines if line.startswith(intent_prefix))
    if len(ids) != intent_count:
        raise ValueError(f"ID ({len(ids)}) не совпадают с интентами ({intent_count})")

    remaining_ids = iter(ids)
    output_lines = []
    for line in intent_lines:
        if line.startswith(intent_prefix):
            output_lines.append(f'# id: {next(remaining_ids)}\n')
        output_lines.append(line)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(output_lines)
# Insert the ids listed in zara.txt into the raw CoNLL file and write output.conll.
add_ids_to_intents(
    intent_file_path='cities_test_adapt.raw_0.conll',
    id_file_path='zara.txt',
    output_file_path='output.conll',
)

In [1]:
import re
import pandas as pd
def parse_annotated_file(filepath):
    """Parse an annotated CoNLL-style file into a per-example dictionary.

    The file is read line by line (surrounding whitespace stripped):

    * ``# id: <value>`` starts a new record.
    * ``# intent: <value>`` or ``# intent = <value>`` (case-insensitive) sets
      the intent of the current record.
    * Any other non-empty line that does not start with ``#`` is split on tabs;
      when it has at least four columns, the fourth is appended as a slot label.
    * A blank line closes the current record.

    A record is also closed when the next ``# id:`` line appears without a
    blank line in between, and at end of file, so records are not lost in
    files that omit the blank separators.

    Args:
        filepath: Path of the UTF-8 encoded annotated file.

    Returns:
        dict mapping each id string to ``{'intent': str or None, 'slots': str}``,
        where ``slots`` is the space-joined sequence of slot labels.
    """
    intent_pattern = re.compile(r'#\s*intent\s*[:=]\s*(.+)', re.IGNORECASE)
    data = {}
    current_id = None
    current_intent = None
    current_slots = []

    def store_pending():
        # Save the record collected so far; a no-op when no id is open.
        if current_id:
            data[current_id] = {
                'intent': current_intent,
                'slots': ' '.join(current_slots)
            }

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith('# id:'):
                if current_id:
                    # The previous record had no blank-line terminator: keep
                    # it, and do not let its intent leak into the new record.
                    store_pending()
                    current_intent = None
                current_id = line.split(':', 1)[1].strip()
                current_slots = []
            elif line.lower().startswith('# intent'):
                match = intent_pattern.match(line)
                if match:
                    current_intent = match.group(1).strip()
            elif line and not line.startswith('#'):
                parts = line.split('\t')
                if len(parts) >= 4:
                    current_slots.append(parts[3])
            elif not line and current_id:
                store_pending()
                current_id = None
                current_intent = None
                current_slots = []

    # The last record may not be followed by a blank line.
    store_pending()
    return data
def create_comparison_csv(gold_file, pred_file, output_csv):
    """Write a CSV that pairs gold and predicted annotations by example id.

    Both files are parsed with ``parse_annotated_file``. One row is written per
    example id found in the gold file, in gold-file order. Ids that are missing
    from the prediction file get empty strings in the prediction columns.

    Args:
        gold_file: Path of the gold annotated file.
        pred_file: Path of the predicted annotated file.
        output_csv: Path of the UTF-8 CSV file to write.
    """
    columns = ['id', 'intents_gold', 'slots_gold', 'intents_pred', 'slots_pred']
    gold_data = parse_annotated_file(gold_file)
    pred_data = parse_annotated_file(pred_file)

    no_prediction = {}
    rows = []
    for example_id, gold in gold_data.items():
        pred = pred_data.get(example_id, no_prediction)
        rows.append({
            'id': example_id,
            'intents_gold': gold['intent'],
            'slots_gold': gold['slots'],
            'intents_pred': pred.get('intent', ''),
            'slots_pred': pred.get('slots', ''),
        })

    # Explicit columns keep the header row even when there are no examples,
    # so the resulting file can still be loaded with pd.read_csv.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_csv, index=False, encoding='utf-8')
# Pair the gold annotations with the model output and store them as a CSV.
create_comparison_csv(
    gold_file='ru.test_adapt.conll',
    pred_file='nlu.xsid_test_adapt.out',
    output_csv='comparison_test_adapt.csv',
)

In [None]:
import pandas as pd
import pandas.api.types

from sklearn.metrics import f1_score

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, intent_column_name: str, slots_column_name: str) -> float:
    '''
    Evaluate the joint performance of intent classification and slot filling using F1 scores.

    The intent score is the weighted F1 over all rows. The slot score is
    computed per row: the space-separated slot strings are split into labels,
    a weighted F1 is computed for that row, and the per-row scores are summed
    and divided by the total number of rows. The function prints both
    component scores and returns their average.

    Rows whose slot F1 cannot be computed (for example a non-string slot value
    or gold/predicted sequences of different length) are skipped and therefore
    contribute 0 to the slot score.

    The input frames are not modified.

    Args:
        solution: Gold annotations, one row per example.
        submission: Predicted annotations, row-aligned with ``solution``.
        row_id_column_name: Name of the id column; it is dropped before scoring.
        intent_column_name: Name of the column holding intent labels.
        slots_column_name: Name of the column holding space-separated slot labels.

    Returns:
        The mean of the intent F1 and the slot F1, as a built-in ``float``.

    Raises:
        KeyError: If ``row_id_column_name`` is missing from either frame.

    # This example doctest works for intent_detection_and_slot_filling_f1_avg:
    >>> import pandas as pd
    >>> y_pred = pd.DataFrame.from_dict({'id': [1, 2], 'intent': ['BookRestaurant', 'ScheduleAlarm'], 'slots': ['O B-location I-location B-date', 'B-date O O']})
    >>> y_true = pd.DataFrame.from_dict({'id': [1, 2], 'intent': ['BookRestaurant', 'ScheduleAlarm'], 'slots': ['O B-location I-location B-date', 'B-date I-date O']})
    >>> score(y_true.copy(), y_pred.copy(), 'id', 'intent', 'slots')  # doctest: +ELLIPSIS
    Intents Metric = 1.0
    Slots Metric = 0.77...
    0.88...
    '''

    # drop() returns new frames, so the caller's DataFrames keep their id
    # column (the previous `del` mutated the arguments in place).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    # Calculate intent F1 score
    intent_f1 = f1_score(solution[intent_column_name],
                         submission[intent_column_name],
                         average='weighted')
    print(f"Intents Metric = {intent_f1}")

    # Calculate slot F1 score: per-row weighted F1, averaged over all rows
    slot_f1_total = 0.0
    for gold_slot_str, system_slot_str in zip(solution[slots_column_name], submission[slots_column_name]):
        try:
            slot_f1_total += f1_score(gold_slot_str.split(), system_slot_str.split(), average='weighted')
        except (AttributeError, TypeError, ValueError):
            # Best effort: an unscorable row (non-string slots, mismatched
            # lengths) counts as 0 instead of aborting the evaluation.
            continue

    slot_f1 = slot_f1_total / len(solution)

    # Calculate average metric
    avg_f1 = (intent_f1 + slot_f1) / 2

    print(f"Slots Metric = {slot_f1}")

    # Convert the NumPy scalar to the annotated built-in float.
    return float(avg_f1)


# Load the gold/prediction comparison table.
comparison = pd.read_csv("comparison_vosk.csv")

# Split it into a gold frame and a prediction frame that share column names.
solution = comparison[['id', 'intents_gold', 'slots_gold']].rename(
    columns={'intents_gold': 'intents', 'slots_gold': 'slots'}
)
submission = comparison[['id', 'intents_pred', 'slots_pred']].rename(
    columns={'intents_pred': 'intents', 'slots_pred': 'slots'}
)

score(solution, submission, 'id', 'intents', 'slots')

Intents Metric = 0.9488236336600104
Slots Metric = 0.2806627216408447


0.6147431776504275