In [None]:
from utils.class_type import ClassType
from utils.category_type import CategoryType
import numpy as np
import pandas as pd
import sqlite3
from environment.env import getNotebookDataSourcePath
import asyncio
from heuristics.llama_generator import LlamaGenerator

# Shared connection to the notebook's SQLite data source. It is left open
# here because nothing in this cell closes it.
connection = sqlite3.connect(getNotebookDataSourcePath())
cursor = connection.cursor()

# Load existing candidate sets
query_llama = 'SELECT sectionID, label FROM candidate_set_llama2_1_training'
df_llama = pd.read_sql_query(query_llama, connection)

# Load wikipedia sections
query_wikipedia_sections = 'SELECT sectionID, cleanedArticleText FROM wikipedia_sections'
df_wikipedia_sections = pd.read_sql_query(query_wikipedia_sections, connection)

# Find instances with missing candidate set
all_section_ids = set(df_wikipedia_sections['sectionID'])
current_section_ids = set(df_llama['sectionID'])
# Sort on the whole ID rather than on its first element: this gives a
# deterministic order and also works when the IDs are not subscriptable
# (e.g. integers). The list is only used for the membership filter below.
missing_section_ids = sorted(all_section_ids - current_section_ids)
missing_sections = df_wikipedia_sections[df_wikipedia_sections['sectionID'].isin(missing_section_ids)]

# One (n, 1) object array per column, so that iterating either array yields
# length-1 rows (process_section reads element [0] of each row). Selecting the
# columns by name avoids routing a DataFrame through np.hsplit.
section_ids = missing_sections[['sectionID']].to_numpy(dtype=object)
section_x = missing_sections[['cleanedArticleText']].to_numpy(dtype=object)

async def process_section(p_sectionID, p_text):
    """Generate Llama labels for one section and store its candidate sets.

    The first one, two and three generated labels are inserted into the
    ``candidate_set_llama2_1_training``, ``candidate_set_llama2_2_training``
    and ``candidate_set_llama2_3_training`` tables respectively, and the
    inserts are committed together.

    Args:
        p_sectionID: Indexable whose first element is the section ID.
        p_text: Indexable whose first element is the text handed to the
            label generator.

    Note:
        The body contains no ``await``, so a call runs to completion without
        yielding control back to the event loop.
    """
    # (number of leading labels to keep, destination table)
    candidate_tables = (
        (1, 'candidate_set_llama2_1_training'),
        (2, 'candidate_set_llama2_2_training'),
        (3, 'candidate_set_llama2_3_training'),
    )

    connection_p = sqlite3.connect(getNotebookDataSourcePath())
    try:
        cursor_p = connection_p.cursor()

        # Load label generators
        label_generator = LlamaGenerator(CategoryType.MEDIUM, ClassType.ORDER)
        labels = label_generator.get_llama_labels(p_text[0])

        # Store labels
        section_id = p_sectionID[0]
        for label_count, table_name in candidate_tables:
            rows = [(section_id, label) for label in labels[:label_count]]
            cursor_p.executemany(
                f'INSERT INTO {table_name} (sectionID, label) VALUES (?, ?)',
                rows,
            )

        connection_p.commit()
    finally:
        # Always release the connection, also when label generation or an
        # insert raises before the commit is reached.
        connection_p.close()

# Load missing candidate sets async
tasks = []
for t_sectionID, t_text in zip(section_ids, section_x):
    task = asyncio.create_task(process_section(t_sectionID, t_text))
    tasks.append(task)

await asyncio.gather(*tasks)