## Concept Map Extraction - Zero-shot Learning

In [None]:
import os
from openai import OpenAI
from io import StringIO
import pandas as pd
from tqdm import tqdm

## Concept Map Extraction

In [None]:
class ZeroBaseline:
    """Zero-shot baseline that asks an OpenAI chat model for concept-map triples.

    All ``.txt`` files in a folder are concatenated, cut into fixed-size
    character chunks, and each chunk is sent to the model in its own request.
    The raw text of every reply is returned; no parsing or merging is done.
    """

    # Default number of characters of source text sent per request.
    DEFAULT_CHUNK_SIZE = 15000

    # Prompt sent for every chunk. The single `{}` placeholder receives the
    # chunk text. Defined once here because it does not vary between chunks.
    _PROMPT_TEMPLATE = """
                    Task Description: Concept Map Generation

                    Your task is to process a collection of {} and extract triples from them.

                    Subsequently, you'll aggregate this information to construct a unique and comprehensive Concept Map representing the information 
                    in all the texts in the given folder.

                    The resulting Concept Map should adhere to the following structure:
                    <Subject> - <Predicate> - <Object>,
                    <Subject> - <Predicate> - <Object>,
                    <Subject> - <Predicate> - <Object>,

                    The Concept Map should contain only the most important triple that best summarizes the content of all texts and avoid redundancy across triples.
                    In your answer, you must give the output in a .csv file with the columns `subject`, `predicate`, and `object`.

                    The output is a single:
                    ```csv 
                    """

    def __init__(self, api_key, model):
        """Create the API client.

        Args:
            api_key: Key passed to ``OpenAI(api_key=...)``.
            model: Model name used for every chat-completion request.
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model

    def extract_concept_maps(self, folder_path, chunk_size=DEFAULT_CHUNK_SIZE):
        """Load every ``.txt`` file in ``folder_path`` and query the model.

        Args:
            folder_path: Directory containing the input ``.txt`` files.
            chunk_size: Maximum number of characters per request.

        Returns:
            A list with one raw model reply (string) per text chunk.
        """
        texts = self._load_texts(folder_path)
        return self._generate_concept_maps(texts, chunk_size=chunk_size)

    def _load_texts(self, folder_path):
        """Return the contents of all ``.txt`` files directly inside ``folder_path``.

        Files are read in sorted file-name order so the result is reproducible.
        """
        texts = []
        # os.listdir() returns entries in arbitrary order; sort so that the
        # aggregated text (and therefore the chunk boundaries) is stable.
        for file_name in sorted(os.listdir(folder_path)):
            if file_name.endswith(".txt"):
                # Explicit encoding: the default depends on the platform locale.
                with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as file:
                    texts.append(file.read())
        return texts

    def _generate_concept_maps(self, texts, chunk_size=DEFAULT_CHUNK_SIZE):
        """Send the joined texts to the model chunk by chunk.

        Args:
            texts: List of document strings; they are joined with a single space.
            chunk_size: Maximum number of characters per request. Chunks are
                cut at fixed character offsets, so a boundary may fall in the
                middle of a word or sentence.

        Returns:
            A list with the model's reply text for each chunk, in order.
            Empty when ``texts`` is empty or contains only empty strings.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        aggregated_text = " ".join(texts)
        text_chunks = [
            aggregated_text[start:start + chunk_size]
            for start in range(0, len(aggregated_text), chunk_size)
        ]

        concept_maps = []
        for chunk in text_chunks:
            prompt = self._PROMPT_TEMPLATE.format(chunk)

            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )

            concept_maps.append(completion.choices[0].message.content)

        return concept_maps

In [None]:
# Input corpus (one sub-folder per topic) and the intended output location.
# NOTE(review): these are machine-specific absolute paths; adjust before running elsewhere.
test_folder = "/Users/martina/Desktop/concept_map/src/data/Corpora_Falke/Wiki/test"
output_folder = "/Users/martina/Desktop/concept_map/src/baselines/output_baseline"
model = "gpt-3.5-turbo-0125"

# Read the key from the environment rather than hard-coding it in the notebook.
# An empty key can only produce a 401 from the API, so fail fast with a clear message.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

concept_map_extractor = ZeroBaseline(api_key=api_key, model=model)

# Keep every folder's result. Previously `concept_maps` was overwritten on each
# iteration, so only the last folder's output survived the loop.
concept_maps_by_folder = {}

# Sorted so folders are processed in a deterministic order.
for folder_name in sorted(os.listdir(test_folder)):
    folder_path = os.path.join(test_folder, folder_name)
    if os.path.isdir(folder_path):
        concept_maps = concept_map_extractor.extract_concept_maps(folder_path)
        folder_number = os.path.basename(folder_name)
        concept_maps_by_folder[folder_number] = concept_maps

In [107]:
# Display the input documents held in `TEXTS`.
# NOTE(review): `TEXTS` is not assigned in any cell shown above; presumably it
# comes from a removed cell or earlier kernel state. Confirm its source.
TEXTS

['A fascinating, recent discovery appears to have solved one of Jerusalem’s biggest historical mysteries: the location of the Acra, the fortified compound in Jerusalem built by Antiochus Epiphanes, ruler of the Hellenistic Seleucid Empire, following his sack of the city in 168 BCE.\nThe renowned fortress was used to control the Jewish city and to monitor the activities in the temple.\nThe Akra was eventually conquered by the Hasmoneans.',
 '(CNN)Archaeologists believe they have found the remains of the ancient Greek fort of Acra, solving "one of Jerusalem\'s greatest archaeological mysteries."\nThe stronghold ruins were unearthed from beneath a parking lot in Jerusalem, Israel\'s Antiquities Authority said.\nAcra dates back more than 2,000 years, to the time of Greek ruler Antiochus IV Epiphanes.\nExcavation directors Doron Ben-Ami, Yana Tchekhanovets and Salome Cohen called it a "sensational discovery."\n"The new archaeological finds indicate the establishment of a well-fortified stro

In [108]:
# Generate concept maps for the documents in `TEXTS`.
# NOTE(review): neither `get_concept_map` nor `TEXTS` is defined in the cells
# shown above, so this cell presumably depends on leftover kernel state.
# Confirm before relying on Restart & Run All.
CONCEPT_MAPS = get_concept_map(texts=TEXTS)

  0%|          | 0/16 [00:00<?, ?it/s]


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: {}. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [102]:
# Print the generated concept maps as one plain-text list.
print(CONCEPT_MAPS)

['subject,predicate,object\nAcra,located in,Jerusalem\nAcra,built by,Antiochus Epiphanes\nAcra,conquered by,Hasmoneans\nAcra,used to,control the Jewish city\nAcra,used to,monitor the activities in the temple', "```csv\nAcra - dates back to - Greek ruler Antiochus IV Epiphanes,\nAcra - was built to consolidate - Seleucid Empire's control over the city,\nAcra - played a pivotal role in - Maccabean revolt against Greek rulers,\nAcra - was recaptured by - the Jews in 141 BCE,\nAcra - was believed to have been razed - some years later,\nAcra - had stumped - experts for well over a century,\nAcra - was located within - the City of David,\nAcra - was unearthed from beneath - a parking lot in Jerusalem,\nAcra - controls - all means of approach to the temple atop the Temple Mount,\nAcra - was occupied by - mercenaries and Hellenized Jews,\nAcra - was constructed on - the high bedrock cliff overlooking the steep slopes of the City of David hill,\nAcra - was a well-fortified stronghold,\nAcra - w