# Analyze label distribution and quality

## Load data

In [None]:
import sqlite3

from environment.env import getDataSourcePath, getNotebookDataSourcePath
from matplotlib import pyplot as plt
import pandas as pd

# Display names of the nine categories; used as x-axis tick labels when the
# Llama label frequencies are plotted.
categories = [
    "Culture and Art",
    "Health",
    "History",
    "Science",
    "People",
    "Religion",
    "Society",
    "Technology",
    "Geography and Places",
]

db_path = getNotebookDataSourcePath()
connection = sqlite3.connect(db_path)

# One query per candidate-set table plus the validation ground truth.
query_heuristic = 'SELECT sectionID, label FROM candidate_set_heuristics_validation'
query_llama_one = 'SELECT sectionID, label FROM candidate_set_llama2_1_validation'
query_llama_two = 'SELECT sectionID, label FROM candidate_set_llama2_2_validation'
query_llama_three = 'SELECT sectionID, label FROM candidate_set_llama2_3_validation'
query_validation = 'SELECT sectionID, class FROM validation_data'

# Everything is loaded into DataFrames up front, so the connection is not
# needed afterwards. Close it explicitly (even if a query fails) instead of
# leaving the database file handle open for the lifetime of the kernel.
try:
    df_heuristic = pd.read_sql_query(query_heuristic, connection)
    df_llama_1 = pd.read_sql_query(query_llama_one, connection)
    df_llama_2 = pd.read_sql_query(query_llama_two, connection)
    df_llama_3 = pd.read_sql_query(query_llama_three, connection)
    df_validation = pd.read_sql_query(query_validation, connection)
finally:
    connection.close()

## Analyze validation heuristic labels

Show the label frequency distribution and the number of labels per section.

In [None]:
# --- Chart 1: number of rows carrying each heuristic label ---------------
label_counts = df_heuristic['label'].value_counts()
barWidth = 0.25

plt.figure(figsize=(10, 6))
frequency_axes = label_counts.plot.bar()
frequency_axes.set_title('Frequency of labels')
frequency_axes.set_xlabel('Label')
frequency_axes.set_ylabel('Distribution')
plt.show()

# --- Chart 2: how many sections have 1, 2, 3, ... heuristic labels -------
labels_count_per_section = df_heuristic.groupby('sectionID')['label'].count()
sections_per_label_count = labels_count_per_section.value_counts().sort_index()
# Mean number of heuristic labels per section; read again further down to
# normalize the heuristic accuracy.
heuristic_average_label_count = labels_count_per_section.mean()

plt.figure(figsize=(10, 6))
per_section_axes = sections_per_label_count.plot.bar()
# NOTE(review): the x axis here is the number of labels a section has and the
# bar height is the number of sections, which the axis texts below do not
# reflect -- consider rewording them.
per_section_axes.set_title('Labels per section')
per_section_axes.set_xlabel('Label')
per_section_axes.set_ylabel('Count per section')
plt.show()

Check whether any validation sections have no heuristic label

In [None]:
# Validation sections that never appear in the heuristic candidate set.
all_section_ids = set(df_validation['sectionID'])
current_section_ids = set(df_heuristic['sectionID'])
missing_section_ids = all_section_ids.difference(current_section_ids)

# Number of validation sections without any heuristic label.
print(len(missing_section_ids))

## Analyze Llama labels

In [None]:
import numpy as np

# Label frequencies for the three Llama candidate sets.
label_counts_1 = df_llama_1['label'].value_counts()
label_counts_2 = df_llama_2['label'].value_counts()
label_counts_3 = df_llama_3['label'].value_counts()

# One column per candidate set; rows are aligned on the label value, so a
# label missing from one set shows up as NaN (no bar) in that column.
df = pd.DataFrame({
    'One label': label_counts_1,
    'Two labels': label_counts_2,
    'Three labels': label_counts_3})

# DataFrame.plot opens its own figure unless an `ax` is passed, so a
# preceding plt.figure(figsize=...) would only leave an empty canvas behind
# and the size would not apply to the chart. Pass the size directly instead.
df.plot(kind='bar', figsize=(10, 6))

plt.title('Frequency of labels')
plt.ylabel('Distribution')
plt.xlabel('Label')

# NOTE(review): assumes the label index of `df` has exactly one row per entry
# in `categories`, in the same order -- TODO confirm against the label encoding.
plt.xticks(ticks=np.arange(len(categories)), labels=categories, rotation=45)
plt.legend()
plt.show()

## Analyze candidate set

In [None]:
# Candidate set = labels from df_llama_2 together with the heuristic labels.
#
# Both inputs have the columns (sectionID, label). Merging them on sectionID
# would rename the clashing columns to label_x / label_y, leaving no 'label'
# column to analyze, so the two tables are stacked instead. A label proposed
# by both sources for the same section is counted once.
df_mix = (
    pd.concat([df_llama_2, df_heuristic], ignore_index=True)
    .drop_duplicates(subset=['sectionID', 'label'])
    .reset_index(drop=True)
)

# Every section that has at least one candidate label from either source.
df_section_ids = df_mix[['sectionID']].drop_duplicates(subset='sectionID')

# Chart 1: number of rows carrying each label in the combined set.
label_counts = df_mix['label'].value_counts()
plt.figure(figsize=(10,6))
label_counts.plot(kind='bar')
plt.title('Frequency of labels')
plt.ylabel('Distribution')
plt.show()

# Chart 2: how many sections have 1, 2, 3, ... candidate labels.
labels_count_per_section = df_mix.groupby(['sectionID'])['label'].count()
sections_per_label_count = labels_count_per_section.value_counts().sort_index()

plt.figure(figsize=(10,6))
sections_per_label_count.plot(kind='bar')
plt.title('Labels per section')
plt.xlabel('Label')
plt.ylabel('Count per section')
plt.show()

# Check label quality

In [None]:
from utils.category import convert_big_to_medium_category_index
from evaluation.evaluator import Evaluator

# Pair every validation section with its ground-truth class once, then build
# predictions and targets in the same row order.
validation_rows = list(zip(df_validation['sectionID'], df_validation['class']))

# Per validation section: array of all heuristic labels for that section
# (empty when the section has none).
predictions_heuristic = [
    df_heuristic.loc[df_heuristic['sectionID'] == section_id, 'label'].values
    for section_id, _ in validation_rows
]

# Per validation section: one-element list holding the converted class; the
# class is decremented by one and stringified before conversion.
targets = [
    [convert_big_to_medium_category_index(str(validation_class - 1))]
    for _, validation_class in validation_rows
]

evaluator_heuristic = Evaluator(predictions_heuristic, targets)
print("Evaluation score of heuristic labels:")
accuracy_heuristic = evaluator_heuristic.evaluate()

In [None]:
# Predictions per validation section, one list per labelling strategy.
predictions_one = []
predictions_two = []
predictions_three = []
# Llama-1 labels, falling back to the heuristic labels when Llama-1 has none.
predictions_hone = []

targets = []

for (element, v_class) in zip(df_validation['sectionID'], df_validation['class']):
    llama_1_labels = df_llama_1[df_llama_1['sectionID'] == element]['label'].values
    llama_2_labels = df_llama_2[df_llama_2['sectionID'] == element]['label'].values
    llama_3_labels = df_llama_3[df_llama_3['sectionID'] == element]['label'].values
    heuristic_labels = df_heuristic[df_heuristic['sectionID'] == element]['label'].values

    predictions_one.append(llama_1_labels)
    predictions_two.append(llama_2_labels)
    predictions_three.append(llama_3_labels)

    # Test for emptiness explicitly. `a or b` on NumPy arrays relies on array
    # truthiness, which raises for arrays with more than one element, is
    # deprecated/an error for empty arrays, and treats a single falsy label
    # (e.g. 0) as "no label".
    if llama_1_labels.size > 0:
        predictions_hone.append(llama_1_labels)
    else:
        predictions_hone.append(heuristic_labels)

    # Ground truth: the class is decremented by one and stringified before
    # being converted; wrapped in a one-element list.
    targets.append([convert_big_to_medium_category_index(str(v_class - 1))])

evaluator_hllama_1 = Evaluator(predictions_hone, targets)
evaluator_llama_1 = Evaluator(predictions_one, targets)
evaluator_llama_2 = Evaluator(predictions_two, targets)
evaluator_llama_3 = Evaluator(predictions_three, targets)
print("Evaluation score of llama labels:")
accuracy_hllama_1 = evaluator_hllama_1.evaluate()
accuracy_llama_1 = evaluator_llama_1.evaluate()
accuracy_llama_2 = evaluator_llama_2.evaluate()
accuracy_llama_3 = evaluator_llama_3.evaluate()

In [None]:
import numpy

# Evaluate the combined candidate set: df_llama_2 labels followed by the
# heuristic labels for each validation section.
validation_rows = list(zip(df_validation['sectionID'], df_validation['class']))

predictions = [
    numpy.concatenate([
        df_llama_2.loc[df_llama_2['sectionID'] == section_id, 'label'].values,
        df_heuristic.loc[df_heuristic['sectionID'] == section_id, 'label'].values,
    ])
    for section_id, _ in validation_rows
]

# One-element target list per section; the class is decremented by one and
# stringified before conversion.
targets = [
    [convert_big_to_medium_category_index(str(validation_class - 1))]
    for _, validation_class in validation_rows
]

evaluator = Evaluator(predictions, targets)
print("Evaluation score of labels:")
evaluator.evaluate()

In [None]:
# Side-by-side comparison of the accuracies computed in the evaluation cells.
model_names = ["Heuristic", "Llama 1", "Llama 2", "Llama 3"]
model_accuracies = [
    accuracy_heuristic,
    accuracy_llama_1,
    accuracy_llama_2,
    accuracy_llama_3,
]
bar_colors = ['blue', 'green', 'orange', 'red']

plt.figure(figsize=(10, 6))
plt.bar(model_names, model_accuracies, color=bar_colors)

comparison_axes = plt.gca()
comparison_axes.set_xlabel('Models')
comparison_axes.set_ylabel('Accuracy')
comparison_axes.set_title('Accuracy Comparison of Models')
# Fix the y range to [0, 1].
comparison_axes.set_ylim(0, 1.0)
plt.show()

In [None]:
# Accuracy divided by the number of labels each strategy emits per section:
# the measured mean for the heuristics, and the constants 1, 2 and 3 for the
# three Llama candidate sets.
model_names = ["Heuristics", "Llama 1", "Llama 2", "Llama 3"]
accuracy_per_label = [
    accuracy_heuristic / heuristic_average_label_count,
    accuracy_llama_1 / 1,
    accuracy_llama_2 / 2,
    accuracy_llama_3 / 3,
]
bar_colors = ['blue', 'green', 'orange', 'red']

plt.figure(figsize=(10, 6))
plt.bar(model_names, accuracy_per_label, color=bar_colors)

normalized_axes = plt.gca()
normalized_axes.set_xlabel('Models')
normalized_axes.set_ylabel('Accuracy')
normalized_axes.set_title('Accuracy Comparison of Models')
plt.show()

In [None]:
# Labels-per-section distribution for the df_llama_1 candidate set.
label_counts = df_llama_1['label'].value_counts()
barWidth = 0.25

labels_count_per_section = df_llama_1.groupby(['sectionID'])['label'].count()
sections_per_label_count = labels_count_per_section.value_counts().sort_index()
# Deliberately NOT stored in `heuristic_average_label_count`: that name holds
# the heuristic average that the normalized accuracy chart divides by, and
# overwriting it with the Llama-1 mean would silently corrupt that chart
# whenever its cell is re-run after this one.
llama_1_average_label_count = labels_count_per_section.mean()

plt.figure(figsize=(10,6))
sections_per_label_count.plot(kind='bar')
plt.title('Labels per section')
plt.xlabel('Label')
plt.ylabel('Count per section')
plt.show()