In [1]:
import os
import json
import requests

In [2]:
# Model identifier sent as the "model" field of every completion request.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-70B-Instruct"
# Completions endpoint that the requests are POSTed to.
MODEL_URL = "http://172.18.21.137:8000/v1/completions"
# NOTE(review): not referenced by any cell visible here (the requests hard-code
# max_tokens=3500) — confirm whether this constant is still needed.
MAX_TOKENS = 24288

In [3]:
def extract_paper_title(file_path):
    """Return the file name of ``file_path`` without its directory part and last extension."""
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem

In [10]:
def _collect_qa_dicts(node, found):
    """Append every ``{"question": str, "answer": str}`` mapping nested in ``node`` to ``found``."""
    if isinstance(node, dict):
        question, answer = node.get("question"), node.get("answer")
        if isinstance(question, str) and isinstance(answer, str):
            question, answer = question.strip(), answer.strip()
            if question and answer:
                found.append({"question": question, "answer": answer})
        else:
            # Wrapper objects such as {"pairs": [...]}: look inside the values.
            for value in node.values():
                _collect_qa_dicts(value, found)
    elif isinstance(node, list):
        for item in node:
            _collect_qa_dicts(item, found)


def _parse_qa_json(raw_output):
    """Decode every JSON object embedded in ``raw_output`` and return the Q&A pairs found."""
    decoder = json.JSONDecoder()
    found = []
    start = raw_output.find("{")
    while start != -1:
        try:
            node, end = decoder.raw_decode(raw_output, start)
        except ValueError:
            # Not valid JSON from this brace: resume scanning right after it.
            end = start + 1
        else:
            _collect_qa_dicts(node, found)
        start = raw_output.find("{", end)
    return found


def _clean_line_value(value):
    """Strip the closing quote/comma/brace from a line fragment and decode JSON escapes."""
    value = value.strip().rstrip(",} \t")
    if value.endswith('"'):
        value = value[:-1]
    value = value.strip()
    try:
        return json.loads(f'"{value}"')
    except ValueError:
        # Unescaped quotes or control characters: keep the raw text.
        return value


def _parse_qa_lines(raw_output):
    """Line-oriented fallback for output that is not valid JSON (one key per line)."""
    pairs = []
    question = answer = None
    for line in raw_output.split("\n"):
        if 'question": "' in line:
            # A new question closes the previous pair, if it was complete.
            if question and answer:
                pairs.append({"question": question, "answer": answer})
            question = _clean_line_value(line.split('question": "', 1)[1])
            answer = None
        elif 'answer": "' in line:
            answer = _clean_line_value(line.split('answer": "', 1)[1])
    if question and answer:
        pairs.append({"question": question, "answer": answer})
    return pairs


def _parse_qa_pairs(raw_output):
    """Return the Q&A pairs in ``raw_output``, preferring real JSON decoding."""
    json_pairs = _parse_qa_json(raw_output)
    line_pairs = _parse_qa_lines(raw_output)
    # Keep whichever strategy recovered more pairs; JSON wins ties because it
    # decodes escapes and handles objects written on a single line.
    return json_pairs if len(json_pairs) >= len(line_pairs) else line_pairs


def generate_qa_for_paper(text, paper_title, num_questions=5):
    """Ask the model for question/answer pairs about a paper and parse its reply.

    Args:
        text: Paper content inserted verbatim into the prompt.
        paper_title: Title quoted in the prompt and stored with every example.
        num_questions: Number of pairs requested in the prompt.

    Returns:
        ``{"model_info": {...}, "examples": [...]}`` where each example has the
        keys ``paper_title``, ``question`` and ``answer``; ``None`` when the
        request fails, the server answers with a non-200 status, or the payload
        does not contain ``choices[0].text``.
    """
    prompt = f"""
    You are tasked with generating {num_questions} specific, detailed questions and answers suitable for a retrieval-augmented generation (RAG) system based solely on the following academic paper titled "{paper_title}". 

    Instructions:
    1. Each question must explicitly reference the paper titled "{paper_title}" and be closely tied to specific sections, such as the abstract, specific tables, figures, or individual paragraphs of the document.
    2. The questions should focus on specific data points, figures, tables, or methodologies. Avoid general or summary-like questions.
    3. The answers must consist of detailed sentences or phrases directly extracted from the paper, but should be longer, providing at least a couple of sentences to offer full context. Explain the significance of the information provided, or give some additional detail on why the data is important.
    4. Each answer should be specific, but include at least 2-3 sentences to ensure sufficient context, while still directly reflecting the exact language and details from the paper.
    5. Avoid summaries that combine information from multiple sections; focus on isolated, factual details from specific parts of the paper.

    Paper content:
    {text}

    Return the question and answer pairs in a structured JSON format like this:
    {{
        "question": "Generated question",
        "answer": "Generated answer"
    }}
    """

    data = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "max_tokens": 3500,  # Room for longer answers
        "temperature": 0.0   # Deterministic, precise answers
    }

    headers = {
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(MODEL_URL, headers=headers, json=data)
    except requests.RequestException as exc:
        # Keep the "print and return None" failure contract so that one
        # unreachable request does not abort a whole directory run.
        print(f"Errore di connessione: {exc}")
        return None

    if response.status_code != 200:
        # The body usually explains the failure (e.g. prompt longer than the
        # model context), which the bare status code does not.
        print(f"Errore: {response.status_code} - {response.text[:500]}")
        return None

    try:
        raw_output = response.json()['choices'][0]['text'].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        print(f"Errore: risposta del modello non valida ({exc!r})")
        return None

    examples = [
        {
            "paper_title": paper_title,
            "question": pair["question"],
            "answer": pair["answer"]
        }
        for pair in _parse_qa_pairs(raw_output)
    ]

    return {
        "model_info": {
            "model_name": MODEL_NAME,
            "type": "ai"
        },
        "examples": examples
    }

In [11]:
def save_qa_to_json(qa_data, paper_title):
    """Serialize ``qa_data`` to ``<paper_title>_qa.json`` in the current working directory.

    The file is written as UTF-8 with 4-space indentation and non-ASCII
    characters kept as-is; a confirmation line is printed afterwards.
    """
    target_name = "{}_qa.json".format(paper_title)
    with open(target_name, mode="w", encoding="utf-8") as out_stream:
        json.dump(qa_data, out_stream, ensure_ascii=False, indent=4)
    print("File salvato: " + target_name)

In [7]:
def process_papers_in_directory(directory_path):
    """Generate and save Q&A pairs for every ``.txt`` file listed in ``directory_path``.

    Each matching file is read as UTF-8, passed to ``generate_qa_for_paper``
    together with the title derived from its file name, and the result is
    saved with ``save_qa_to_json`` unless it is falsy.
    """
    for entry in os.listdir(directory_path):
        # Anything that is not a plain-text paper is ignored.
        if not entry.endswith(".txt"):
            continue

        paper_path = os.path.join(directory_path, entry)

        # The title comes from the file name.
        title = extract_paper_title(paper_path)

        # Load the whole document.
        with open(paper_path, "r", encoding="utf-8") as handle:
            content = handle.read()

        # One separate JSON file per paper, only when generation succeeded.
        result = generate_qa_for_paper(content, title)
        if result:
            save_qa_to_json(result, title)

In [12]:
# Directory that is scanned for the input papers (.txt files).
directory_path = "../input"

In [13]:
# The function defined in this notebook is `process_papers_in_directory`;
# the former name `process_all_papers_in_directory` only existed in stale
# kernel state and raises NameError after Restart & Run All.
process_papers_in_directory(directory_path)

Errore: 400
File salvato: LagLLama_qa.json
File salvato: TimeGPT_qa.json
Errore: 400
File salvato: AnomalyBERT_qa.json
File salvato: TranAD_qa.json
File salvato: RESTAD_qa.json
File salvato: Foundation_Models_for_Time_Series_Analysis_qa.json
File salvato: Timesfm_qa.json


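Elaboro i paper troppo grandi (quelli che hanno restituito l'errore 400) troncandone il testo a un numero massimo di caratteri, in modo che il prompt rientri nel limite di contesto del modello.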

In [14]:
def trim_text_to_context_limit(text, max_characters):
    """Return the leading slice ``text[:max_characters]`` of ``text``."""
    leading = slice(None, max_characters)
    return text[leading]

In [15]:
def _collect_qa_dicts(node, found):
    """Append every ``{"question": str, "answer": str}`` mapping nested in ``node`` to ``found``."""
    if isinstance(node, dict):
        question, answer = node.get("question"), node.get("answer")
        if isinstance(question, str) and isinstance(answer, str):
            question, answer = question.strip(), answer.strip()
            if question and answer:
                found.append({"question": question, "answer": answer})
        else:
            # Wrapper objects such as {"pairs": [...]}: look inside the values.
            for value in node.values():
                _collect_qa_dicts(value, found)
    elif isinstance(node, list):
        for item in node:
            _collect_qa_dicts(item, found)


def _parse_qa_json(raw_output):
    """Decode every JSON object embedded in ``raw_output`` and return the Q&A pairs found."""
    decoder = json.JSONDecoder()
    found = []
    start = raw_output.find("{")
    while start != -1:
        try:
            node, end = decoder.raw_decode(raw_output, start)
        except ValueError:
            # Not valid JSON from this brace: resume scanning right after it.
            end = start + 1
        else:
            _collect_qa_dicts(node, found)
        start = raw_output.find("{", end)
    return found


def _clean_line_value(value):
    """Strip the closing quote/comma/brace from a line fragment and decode JSON escapes."""
    value = value.strip().rstrip(",} \t")
    if value.endswith('"'):
        value = value[:-1]
    value = value.strip()
    try:
        return json.loads(f'"{value}"')
    except ValueError:
        # Unescaped quotes or control characters: keep the raw text.
        return value


def _parse_qa_lines(raw_output):
    """Line-oriented fallback for output that is not valid JSON (one key per line)."""
    pairs = []
    question = answer = None
    for line in raw_output.split("\n"):
        if 'question": "' in line:
            # A new question closes the previous pair, if it was complete.
            if question and answer:
                pairs.append({"question": question, "answer": answer})
            question = _clean_line_value(line.split('question": "', 1)[1])
            answer = None
        elif 'answer": "' in line:
            answer = _clean_line_value(line.split('answer": "', 1)[1])
    if question and answer:
        pairs.append({"question": question, "answer": answer})
    return pairs


def _parse_qa_pairs(raw_output):
    """Return the Q&A pairs in ``raw_output``, preferring real JSON decoding."""
    json_pairs = _parse_qa_json(raw_output)
    line_pairs = _parse_qa_lines(raw_output)
    # Keep whichever strategy recovered more pairs; JSON wins ties because it
    # decodes escapes and handles objects written on a single line.
    return json_pairs if len(json_pairs) >= len(line_pairs) else line_pairs


def generate_qa_for_paper(text, paper_title, num_questions=5, max_characters=20000):
    """Ask the model for question/answer pairs about a (possibly truncated) paper.

    This redefinition replaces the earlier ``generate_qa_for_paper`` cell: the
    paper text is cut to ``max_characters`` before it is placed in the prompt.

    Args:
        text: Paper content; only ``text[:max_characters]`` is sent.
        paper_title: Title quoted in the prompt and stored with every example.
        num_questions: Number of pairs requested in the prompt.
        max_characters: Maximum number of characters of ``text`` kept in the
            prompt (default 20000); ``None`` disables the truncation.

    Returns:
        ``{"model_info": {...}, "examples": [...]}`` where each example has the
        keys ``paper_title``, ``question`` and ``answer``; ``None`` when the
        request fails, the server answers with a non-200 status, or the payload
        does not contain ``choices[0].text``.
    """
    # The limit is counted in characters, not tokens.
    trimmed_text = trim_text_to_context_limit(text, max_characters)

    prompt = f"""
    You are tasked with generating {num_questions} specific, detailed questions and answers suitable for a retrieval-augmented generation (RAG) system based solely on the following academic paper titled "{paper_title}". 

    Instructions:
    1. Each question must explicitly reference the paper titled "{paper_title}" and be closely tied to specific sections, such as the abstract, specific tables, figures, or individual paragraphs of the document.
    2. The questions should focus on specific data points, figures, tables, or methodologies. Avoid general or summary-like questions.
    3. The answers must consist of detailed sentences or phrases directly extracted from the paper, but should be longer, providing at least a couple of sentences to offer full context. Explain the significance of the information provided, or give some additional detail on why the data is important.
    4. Each answer should be specific, but include at least 2-3 sentences to ensure sufficient context, while still directly reflecting the exact language and details from the paper.
    5. Avoid summaries that combine information from multiple sections; focus on isolated, factual details from specific parts of the paper.

    Paper content:
    {trimmed_text}

    Return the question and answer pairs in a structured JSON format like this:
    {{
        "question": "Generated question",
        "answer": "Generated answer"
    }}
    """

    data = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "max_tokens": 3500,  # Room for longer answers
        "temperature": 0.0   # Deterministic, precise answers
    }

    headers = {
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(MODEL_URL, headers=headers, json=data)
    except requests.RequestException as exc:
        # Keep the "print and return None" failure contract instead of raising.
        print(f"Errore di connessione: {exc}")
        return None

    if response.status_code != 200:
        # The body usually explains the failure (e.g. prompt still too long),
        # which the bare status code does not.
        print(f"Errore: {response.status_code} - {response.text[:500]}")
        return None

    try:
        raw_output = response.json()['choices'][0]['text'].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        print(f"Errore: risposta del modello non valida ({exc!r})")
        return None

    examples = [
        {
            "paper_title": paper_title,
            "question": pair["question"],
            "answer": pair["answer"]
        }
        for pair in _parse_qa_pairs(raw_output)
    ]

    return {
        "model_info": {
            "model_name": MODEL_NAME,
            "type": "ai"
        },
        "examples": examples
    }

In [16]:
def process_large_paper(file_path):
    """Generate and save Q&A pairs for the single paper stored at ``file_path``.

    The file is read as UTF-8 and handed to ``generate_qa_for_paper`` with the
    title derived from its file name; a falsy result is not saved.
    """
    # The title comes from the file name.
    title = extract_paper_title(file_path)

    # Load the whole document.
    with open(file_path, mode="r", encoding="utf-8") as source:
        full_text = source.read()

    # Nothing is written when generation failed.
    result = generate_qa_for_paper(full_text, title)
    if not result:
        return

    save_qa_to_json(result, title)

In [17]:
# TimeLLM_qa.json is not among the files saved by the directory run above
# (presumably one of the two "Errore: 400" papers — confirm); retry it with
# the truncating generate_qa_for_paper.
file_path = "../input/TimeLLM.txt"

process_large_paper(file_path)

File salvato: TimeLLM_qa.json


In [18]:
# Chronos_qa.json is not among the files saved by the directory run above
# (presumably one of the two "Errore: 400" papers — confirm); retry it with
# the truncating generate_qa_for_paper.
file_path = "../input/Chronos.txt"

process_large_paper(file_path)

File salvato: Chronos_qa.json


In [19]:
# NOTE(review): LagLLama_qa.json was already written by the directory run
# above; this call overwrites it with the output for the truncated text —
# confirm that this is intended.
file_path = "../input/LagLLama.txt"

process_large_paper(file_path)

File salvato: LagLLama_qa.json
