# NaiveRAG per query globali

In [1]:
from langchain.embeddings.base import Embeddings
from langchain.llms.base import LLM
from typing import Optional, List
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import requests
import os
import numpy as np
from IPython.display import Markdown, display
import time

## Embedder

In [2]:
class CustomEmbeddings(Embeddings):
    def __init__(self, endpoint_url):
        self.endpoint_url = endpoint_url

    def embed_documents(self, texts):
        embeddings = []
        for text in texts:
            payload = {
                "input": text,
                "model": "intfloat/multilingual-e5-large-instruct"
            }
            response = requests.post(f"{self.endpoint_url}/embeddings", json=payload)
            if response.status_code == 200:
                embedding = response.json()['data'][0]['embedding']
                embeddings.append(embedding)
            else:
                raise Exception(f"Errore nell'embedder: {response.text}")
        return embeddings

    def embed_query(self, text):
        return self.embed_documents([text])[0]

In [3]:
embedder = CustomEmbeddings(endpoint_url="http://172.18.21.138:80/v1")

## Caricamento dei file di testo

In [4]:
import glob

document_paths = glob.glob('../input/*.txt')

In [5]:
from langchain.docstore.document import Document

documents = []
for file_path in document_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        documents.append(Document(page_content=content))

In [6]:
print(f"Numero di documenti caricati: {len(documents)}")

Numero di documenti caricati: 9


## Suddivisione documenti

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       
    chunk_overlap=200,     
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    length_function=len
)

docs = text_splitter.split_documents(documents)

In [8]:
print(f"Numero totale di chunk: {len(docs)}\n")

Numero totale di chunk: 718



## Indicizzazione dei documenti nel db vettoriale

In [9]:
from langchain.vectorstores import FAISS

In [10]:
vectorstore = FAISS.from_documents(docs, embedder)

In [11]:
num_vectors = vectorstore.index.ntotal
print(f"Numero di vettori indicizzati: {num_vectors}")

Numero di vettori indicizzati: 718


## Configurazione del prompt per le query

In [12]:
from langchain.prompts import PromptTemplate

prompt_template = """
You are a knowledgeable assistant specialized in answering questions based solely on the provided context. Provide a detailed and well-structured answer, including all relevant information from the context. Ensure your response is comprehensive, faithful to the context, and presented in clear, well-formed sentences. Do not add any information that is not present in the context. If the answer is not explicitly stated in the context, respond with "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

## LLM

In [13]:
class CustomLLM(LLM):
    endpoint_url: str
    model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    temperature: float = 0.0
    max_tokens: int = 1500
    repetition_penalty: float = 1.2

    @property
    def _llm_type(self) -> str:
        return "custom_llm"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        payload = {
            "prompt": prompt,
            "model": self.model_name,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "repetition_penalty": self.repetition_penalty,
            "stop": stop or ["I don't know."],
        }
        print("Payload inviato all'API:", payload)  
        response = requests.post(f"{self.endpoint_url}/completions", json=payload)
        if response.status_code == 200:
            return response.json()['choices'][0]['text']
        else:
            raise Exception(f"Errore nel LLM: {response.text}")

In [14]:
llm = CustomLLM(
    endpoint_url="http://172.18.21.132:8000/v1",
    temperature=0.0,
    max_tokens= 500
)

## Configurazione del retrieval e sistema di question answering

In [15]:
from langchain.chains import RetrievalQA

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [16]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt}
)

In [17]:
query = "Describe how TranAD works?"

result = qa_chain({"query": query})
print(f"Question: {query}")
print(f"Answer: {result['result']}\n")

  result = qa_chain({"query": query})


Payload inviato all'API: {'prompt': '\nYou are a knowledgeable assistant specialized in answering questions based solely on the provided context. Provide a detailed and well-structured answer, including all relevant information from the context. Ensure your response is comprehensive, faithful to the context, and presented in clear, well-formed sentences. Do not add any information that is not present in the context. If the answer is not explicitly stated in the context, respond with "I don\'t know."\n\nContext:\n# Figure 1: The TranAD Model.\n\n# 3.3 Transformer Model\n\nTransformers are popular deep learning models that have been used in various natural language and vision processing tasks [51]. However, we use insightful refactoring of the transformer architecture for the task of anomaly detection in time-series data. Just like other encoder-decoder models, in a transformer, an input sequence undergoes several attention-based transformations. Figure 1 shows the architecture of the ne

## Definizione della funzione per eseguire le query

In [18]:
async def ask_question(query):  
    result = qa_chain({"query": query})  
    answer = result['result']
    display(Markdown(f"**Answer to the question:** {query}\n\n{answer}"))

In [19]:
await ask_question(query)

Payload inviato all'API: {'prompt': '\nYou are a knowledgeable assistant specialized in answering questions based solely on the provided context. Provide a detailed and well-structured answer, including all relevant information from the context. Ensure your response is comprehensive, faithful to the context, and presented in clear, well-formed sentences. Do not add any information that is not present in the context. If the answer is not explicitly stated in the context, respond with "I don\'t know."\n\nContext:\n# Figure 1: The TranAD Model.\n\n# 3.3 Transformer Model\n\nTransformers are popular deep learning models that have been used in various natural language and vision processing tasks [51]. However, we use insightful refactoring of the transformer architecture for the task of anomaly detection in time-series data. Just like other encoder-decoder models, in a transformer, an input sequence undergoes several attention-based transformations. Figure 1 shows the architecture of the ne

**Answer to the question:** Describe how TranAD works?

TranAD works by using a Transformer model as an encoder-decoder network to predict the reconstruction of each input time-series window. This prediction occurs in two phases. 

Firstly, during **Phase 1 - Input Reconstruction**, the model generates an approximate reconstruction of the input window. Then, in **Phase 2 - Focused Input Reconstruction**, the model utilizes the reconstruction loss for the first decoder as a focus score to refine its predictions. Specifically, having obtained the focus matrix for the second phase, the model re-runs inference to produce the output of the second decoder as O^2.

This two-phase inference approach allows TranAD to effectively capture both long-term patterns and short-term trends within the input sequences, thereby improving its ability to detect anomalies. Furthermore, the use of multi-head self-attention mechanism facilitates stable training while reducing the variance of the weights involved in the attention operation. Overall, TranAD's design enables it to achieve significant improvements in performance metrics such as F1 score, F1*, AUC, and AUC* compared to existing state-of-the-art baselines. I don't know whether these results were achieved under specific conditions or datasets; however, they demonstrate the effectiveness of TranAD in addressing challenges related to anomaly detection in multivariate time series data.

## Estrazione domande globali

In [20]:
import json

with open('../../DatasetCreation/Global_questions.json', 'r') as file:
    data = json.load(file)

In [21]:
questions_list = [item['question'] for item in data['questions']]

# Print the questions to verify
for idx, question in enumerate(questions_list, 1):
    print(f"Question {idx}: {question}")

Question 1: What are the main topics covered by the data in the set of time-series papers?
Question 2: How does RestAD leverage both statistical methods and machine learning to achieve robust anomaly detection in noisy time-series data?
Question 3: What are the key features and benefits of RestAD in anomaly detection for time-series data?
Question 4: What are the key features and benefits of RestAD in anomaly detection for time-series data?
Question 5: How does TimeLLM differ from other models in time-series forecasting?
Question 6: How does AnomalyBERT work?
Question 7: How does TimeGPT approach time-series forecasting?
Question 8: What types of real-world applications can benefit from models like TimeLLM, RestAD, TimeGPT, AnomalyBERT, LagLLama and the other models described?
Question 9: What distinguishes LagLLama in its approach to time-series analysis?
Question 10: How do models like AnomalyBERT handle non-stationary data, and why is this important?
Question 11: What are the main t

In [22]:
questions_sample_global = questions_list[:5]

## Risposte del modello alle domande globali

In [22]:
def process_question(question):
    # Pass the question to your QA chain or model
    result = qa_chain({"query": question})
    return result['result']

In [23]:
start_time = time.time()

model_answers = []
for question in questions_list:
    answer = process_question(question)
    model_answers.append({
        "question": question,
        "answer": answer
    })

end_time = time.time()
total_time_global = end_time - start_time

Payload inviato all'API: {'prompt': '\nYou are a knowledgeable assistant specialized in answering questions based solely on the provided context. Provide a detailed and well-structured answer, including all relevant information from the context. Ensure your response is comprehensive, faithful to the context, and presented in clear, well-formed sentences. Do not add any information that is not present in the context. If the answer is not explicitly stated in the context, respond with "I don\'t know."\n\nContext:\n# Categories of Time Series.\n\nA time series is commonly described as an ordered sequence of data points. Figure 2 illustrates various types of time series discussed in this survey, including standard time series, spatial time series, trajectories, and events. Note that trajectories and events can be regarded as time series since each data point is associated with a specific timestamp (and location), allowing for analysis using time series techniques such as anomaly detection.

## Salvataggio risposte in JSON

In [24]:
output_data = {
    "questions": model_answers
}

with open('Naive_responses.json', 'w') as outfile:
    json.dump(output_data, outfile, indent=4)

## Estrazione delle domande locali

In [25]:
import json
def extract_questions_from_json(file_path):
    # extraction from single json file
    questions = []
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        if 'examples' in data:
            for example in data['examples']:
                questions.append(example['question'])
    return questions

In [26]:
def extract_all_questions_from_directory(directory_path):

    all_questions = []
    
    for filename in os.listdir(directory_path):
        if filename.endswith(".json"):  
            file_path = os.path.join(directory_path, filename)
            questions = extract_questions_from_json(file_path)
            all_questions.extend(questions) 
    
    return all_questions

In [27]:
data_path = '../DatasetCreation/Local'
all_questions = extract_all_questions_from_directory(data_path)

In [28]:
print(f"Numero totale di domande estratte: {len(all_questions)}")

Numero totale di domande estratte: 45


## Risposte alle domande locali

In [29]:
def process_question(question):
    # Pass the question to your QA chain or model
    result = qa_chain({"query": question})
    return result['result']

In [30]:
start_time = time.time()

model_answers = []
for question in all_questions:
    answer = process_question(question)
    model_answers.append({
        "question": question,
        "answer": answer
    })

end_time = time.time()
total_time_local = end_time - start_time

Payload inviato all'API: {'prompt': '\nYou are a knowledgeable assistant specialized in answering questions based solely on the provided context. Provide a detailed and well-structured answer, including all relevant information from the context. Ensure your response is comprehensive, faithful to the context, and presented in clear, well-formed sentences. Do not add any information that is not present in the context. If the answer is not explicitly stated in the context, respond with "I don\'t know."\n\nContext:\nChronos represents one of the first endeavours in practical pretrained time series forecasting models, with remarkable zero-shot performance on a comprehensive collection of test datasets. This work opens up various research avenues, some of which we discuss below.\n\n# 6.1 Beyond Zero-shot Univariate Forecasting\n\n# Fine tuning\n\nMotivated by the remarkable zero-shot performance of Chronos models, we conducted a preliminary investigation into fine-tuning Chronos models indiv

## Salvataggio risposte in JSON

In [31]:
output_data = {
    "questions": model_answers
}

with open('Naive_local_responses.json', 'w') as outfile:
    json.dump(output_data, outfile, indent=4)

## Misurazione tempi 

### Tempi per risposte globali

In [None]:
query1 = "What are the main topics covered by the data in the set of time-series papers?"
query2 = "How does RestAD leverage both statistical methods and machine learning to achieve robust anomaly detection in noisy time-series data?"
query3 = "How does TimeGPT approach time-series forecasting?"
query4 = "What are the key features and benefits of RestAD in anomaly detection for time-series data?"
query5 = "How does TimeLLM differ from other models in time-series forecasting?"

## Esporto i tempi

In [32]:
import csv
time_data = [
    ["Type of question", "Time (seconds)", "Number of questions"],
    ["Local", total_time_local, 45],
    ["Global", total_time_global, 37]
]

In [33]:
output_file = 'naive_times.csv'
with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(time_data)

print(f"Tempi salvati in {output_file}")

Tempi salvati in naive_times.csv


In [40]:
def extract_questions_and_answers(file_path):
    """
    Extracts questions and their corresponding answers from a single JSON file.
    
    :param file_path: Path to the JSON file.
    :return: List of dictionaries with 'question' and 'answer' keys.
    """
    qa_pairs = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            examples = data.get('examples', [])
            for example in examples:
                question = example.get('question')
                answer = example.get('answer')
                if question and answer:
                    qa_pairs.append({
                        "question": question.strip(),
                        "answer": answer.strip()
                    })
                else:
                    print(f"Missing question or answer in file {file_path}, example: {example}")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file {file_path}: {e}")
    except Exception as e:
        print(f"Unexpected error reading file {file_path}: {e}")
    return qa_pairs

def extract_all_questions_and_answers(directory_path):
    """
    Scans all JSON files in a directory and extracts all question-answer pairs.
    
    :param directory_path: Path to the directory containing JSON files.
    :return: List of all question-answer pairs.
    """
    all_qa_pairs = []
    if not os.path.isdir(directory_path):
        print(f"The directory {directory_path} does not exist.")
        return all_qa_pairs
    
    for filename in os.listdir(directory_path):
        if filename.lower().endswith(".json"):
            file_path = os.path.join(directory_path, filename)
            print(f"Processing file: {file_path}")
            qa_pairs = extract_questions_and_answers(file_path)
            all_qa_pairs.extend(qa_pairs)
    
    return all_qa_pairs

def save_questions_and_answers(qa_pairs, output_file):
    """
    Saves the list of question-answer pairs into a JSON file.
    
    :param qa_pairs: List of dictionaries with 'question' and 'answer' keys.
    :param output_file: Path to the output JSON file.
    """
    data = {"questions": qa_pairs}
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Successfully saved {len(qa_pairs)} question-answer pairs to {output_file}")
    except Exception as e:
        print(f"Error writing to file {output_file}: {e}")

def main():
    # Define the directory containing the JSON files
    directory_path = '../DatasetCreation/Local'  # Update this path as needed
    
    # Extract all questions and answers
    all_qa_pairs = extract_all_questions_and_answers(directory_path)
    
    if not all_qa_pairs:
        print("No question-answer pairs were extracted.")
        return
    
    # Define the output JSON file path
    output_file = 'all_questions_answers.json'  # You can change the output file name and path as needed
    
    # Save the extracted data to the output file
    save_questions_and_answers(all_qa_pairs, output_file)

if __name__ == "__main__":
    main()

Processing file: ../DatasetCreation/Local/Chronos_qa.json
Processing file: ../DatasetCreation/Local/TimeGPT_qa.json
Processing file: ../DatasetCreation/Local/Timesfm_qa.json
Processing file: ../DatasetCreation/Local/LagLLama_qa.json
Processing file: ../DatasetCreation/Local/Foundation_Models_for_Time_Series_Analysis_qa.json
Processing file: ../DatasetCreation/Local/TimeLLM_qa.json
Processing file: ../DatasetCreation/Local/RESTAD_qa.json
Processing file: ../DatasetCreation/Local/AnomalyBERT_qa.json
Processing file: ../DatasetCreation/Local/TranAD_qa.json
Successfully saved 45 question-answer pairs to all_questions_answers.json
