In [60]:
import os

from openai import OpenAI
from tqdm import tqdm

In [61]:
CLIENT = OpenAI(api_key="")
MODEL = "gpt-3.5-turbo-0125"
folder_path = "./data/Corpora_Falke/Wiki/test/102"

In [62]:
def read_cmap_file(folder_path):
    """
    Read the .cmap file in the specified folder and return its content.

    Args:
    - folder_path (str): The path to the folder containing the .cmap file.

    Returns:
    - cmap_content (str): The content of the .cmap file.
    """
    # Get the list of files in the folder
    files = os.listdir(folder_path)

    # Filter files to find the .cmap file
    cmap_files = [file for file in files if file.endswith('.cmap')]

    # Check if there is exactly one .cmap file
    if len(cmap_files) != 1:
        raise ValueError("There should be exactly one .cmap file in the folder.")

    # Get the path of the .cmap file
    cmap_file_path = os.path.join(folder_path, cmap_files[0])

    # Open and read the content of the .cmap file
    with open(cmap_file_path, 'r') as file:
        cmap_content = file.read()

    return cmap_content

In [63]:
def read_txt_files(folder_path):
    """
    Read all .txt files in the specified folder and return their content in a dictionary.

    Args:
    - folder_path (str): The path to the folder containing the .txt files.

    Returns:
    - txt_contents (dict): A dictionary where keys are file names (without the .txt extension) and values are the content of each file.
    """
    files = os.listdir(folder_path)

    txt_files = [file for file in files if file.endswith('.txt')]

    txt_contents = {}

    for txt_file in txt_files:
        txt_file_path = os.path.join(folder_path, txt_file)
        with open(txt_file_path, 'r') as file:
            txt_contents[txt_file.replace('.txt', '')] = file.read()

    return txt_contents

In [64]:
folder_path = "./data/Corpora_Falke/Wiki/test/102"
txt_contents = read_txt_files(folder_path)
cmap_content = read_cmap_file(folder_path)

PROMPT_CONCEPT_MAP = """
                    Task Description: Concept Map Generation

                    Your task is to process a collection of {} and extract triples from them.

                    Subsequently, you'll aggregate this information to construct a unique and comprehensive Concept Map representing the information
                    in all the texts in the given folder.

                    The resulting Concept Map should adhere to the following structure:
                    <Subject> - <Predicate> - <Object>,
                    <Subject> - <Predicate> - <Object>,
                    <Subject> - <Predicate> - <Object>,

                    The Concept Map should contain only the most important triple that best summarizes the content of all texts and avoid redundancy across triples.
                    In your answer, you must give the output in a .csv file with the columns `subject`, `predicate`, and `object`.

                    The output is a single:
                    ```csv
                    """


def get_concept_map(texts, cmap_content):
    res = []
    for text in tqdm(texts):
        prompt = PROMPT_CONCEPT_MAP.format(cmap_content, text)
        completion = CLIENT.chat.completions.create(
            model=MODEL, messages=[{"role": "user", "content": prompt}], temperature=0)
        res.append(completion.choices[0].message.content)
    return res


In [65]:
FOLDER = "./data/Corpora_Falke/Wiki/test/103"
TEXTS = [x for x in os.listdir(FOLDER) if x.endswith(".txt")]
TEXTS = [open(os.path.join(FOLDER, x)).read() for x in TEXTS]
texts = list(txt_contents.values())
CONCEPT_MAPS = get_concept_map(texts, cmap_content)

100%|██████████| 16/16 [00:46<00:00,  2.94s/it]


In [66]:
print(CONCEPT_MAPS)

['Concept Map:\nacra - built by - antiochus epiphanes\nacra - destroyed by - bce\nacra - was used to oversee the temple and maintain control over - jerusalem\nacra - was built to consolidate - seleucid empire\nacra - was built specifically to control access to - temple mount\nantiochus epiphanes - is ruler of - seleucid empire\ncity of david - is the lowest area of - jerusalem\nisrael antiquities authority - concentrating in the area of the herodian street west of - temple mount\ntemple mount - sits - jerusalem\nyoram tsafrir - has interpreted a masonry joint in the southeastern corner of - temple mount', 'Concept Map:\nacra - built by - antiochus epiphanes\nacra - destroyed by - bce\nacra - was used to oversee the temple and maintain control over - jerusalem\nacra - was built to consolidate - seleucid empire\nacra - was built specifically to control access to - temple mount\nantiochus epiphanes - is ruler of - seleucid empire\ncity of david - is the lowest area of - jerusalem\nisrael 