In [43]:
# !pip install markitdown
# !pip install PyMuPDF
# !pip install chromadb

In [84]:
from dotenv import load_dotenv

In [85]:
# Load the .env file
load_dotenv()

# Access variables
api_key = os.getenv("LLM_API_KEY")

### Read PDF files and convert them in markdown

In [86]:
import os
import fitz 

In [87]:
# Define input and output folder paths
input_folder = "/Users/xitij/Documents/RAG_Projects/Papers/PDF"
output_folder = "/Users/xitij/Documents/RAG_Projects/Papers/MD"

In [88]:
# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

In [89]:
# Process all PDF files in the input folder
for filename in os.listdir(input_folder):
    if filename.lower().endswith(".pdf"):
        input_path = os.path.join(input_folder, filename)
        output_filename = os.path.splitext(filename)[0] + ".md"
        output_path = os.path.join(output_folder, output_filename)

        # Read PDF content
        doc = fitz.open(input_path)
        full_text = ""
        for page in doc:
            full_text += page.get_text() + "\n\n"
        doc.close()

        # Save as markdown file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)

        print(f"Processed: {filename} → {output_filename}")

Processed: paper4.pdf → paper4.md
Processed: PD_paper_175.pdf → PD_paper_175.md
Processed: 2111.09741v1.pdf → 2111.09741v1.md


### Loading the md files

In [90]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [91]:
# Path to the folder where .md files are saved
markdown_folder = "/Users/xitij/Documents/RAG_Projects/Papers/MD"

loader = DirectoryLoader(
    path=markdown_folder, 
    glob="**/*.md",                  # Load all .md files (supports recursive search)
    loader_cls=TextLoader,           # Use TextLoader to load each file
    use_multithreading=True          # Speeds up loading for large folders
)

documents = loader.load()

In [92]:
documents

[Document(metadata={'source': '/Users/xitij/Documents/RAG_Projects/Papers/MD/PD_paper_175.md'}, page_content='On the Role of Preprocessing on Matching Tables to\nKnowledge Graphs\nVishvapalsinhji Parmar1, Achraf Haddar1 and Alsayed Algergawy1\n1Chair of Data and Knowledge Engineering, University of Passau Passau, Germany\nAbstract\nMatching tabular data to knowledge graphs plays a crucial role in various applications, including Named Entity\nRecognition (NER). Data preprocessing has consistently shown to enhance the performance of data-driven systems.\nTo this end, in this paper, we present a systematic preprocessing pipeline designed to improve the accuracy\nof table-to-graph matching by identifying and addressing anomalies in datasets from diverse domains, such as\nbiodiversity, food, and Wikidata. Our pipeline, implemented over three iterations, focuses on correcting domain-\nspecific irregularities to enhance data quality. Experimental results demonstrate substantial improvements, 

In [93]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # max characters per chunk
    chunk_overlap=200,      # how much to overlap between chunks
    add_start_index=True
)

chunks = text_splitter.split_documents(documents)

In [94]:
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

Split 3 documents into 101 chunks.


In [95]:
document = chunks[10]
print(document.page_content)
print(document.metadata)

• Numerical and String Data Types: Over 54% of the columns contain numerical data, while 33%
are strings. Most tables (49 out of 50) feature at least four columns, except for one single-column
table.
• Missing Values: Similar to Wikidata, this dataset shows a high number of missing values
(39,198), which also lack corresponding annotations in the CEA target file, indicating no need for
imputation.
• Domain-Specific Patterns: The dataset contains domain-specific characteristics, such as species
name abbreviations (e.g., ’C. sclerophylla’ for ’Castanopsis sclerophylla’) and composed values
combining multiple elements. Tools like the NCBI Taxonomy database and ChatGPT were used
to interpret these domain-specific nuances, ensuring accurate data preprocessing.
3.3. tFood Tables
The tFood dataset is designed specifically for the Food domain, including two types of tables: Horizontal
Relational Tables and Entity Tables. Each table contains two columns, with one column representing
{'source': 

In [103]:
CHROMA_PATH = "/Users/xitij/Documents/RAG_Projects/chroma"

In [104]:
import shutil

In [105]:
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

In [106]:
from langchain_openai import OpenAIEmbeddings

In [107]:
embeddings = OpenAIEmbeddings(
    openai_api_base="https://llms-inference.innkube.fim.uni-passau.de",
    api_key=api_key,
    model="Snowflake/snowflake-arctic-embed-s"
)

In [108]:
from langchain.vectorstores import Chroma

In [109]:
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=CHROMA_PATH
)

In [110]:
# Use the same embedding model you used while saving
vectorstore = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=embeddings  
)

In [111]:
retriever = vectorstore.as_retriever(
    search_type="similarity",  # or "mmr" or "similarity_score_threshold"
    search_kwargs={"k": 3}      # get top 3 most relevant chunks
)

In [112]:
from langchain.chat_models import ChatOpenAI

In [113]:
chat = ChatOpenAI(
    openai_api_base="https://llms-inference.innkube.fim.uni-passau.de",
    api_key="sk-Fr2TEolkvrxjjs235KJqkg",
    model="qwen2.5",
    temperature=0.1
)

In [114]:
from langchain.chains import RetrievalQA

In [115]:
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    retriever=retriever,
    return_source_documents=True  # optional: returns docs along with the answer
)

In [116]:
query = "What is patent analysis dataset is about?"
response = qa_chain.invoke(query)

print("Answer:")
print(response)

Answer:
{'query': 'What is patent analysis dataset is about?', 'result': 'The patent analysis dataset is a curated collection of data extracted from the United States Patent and Trademark Office (USPTO) raw XML files. This dataset is designed to support patent sentiment analysis and information retrieval using machine learning techniques. It includes text segments from patent grants, which are parsed from the nested XML structures provided by the USPTO and compiled into a CSV file format. The dataset aims to aid in creating a sustainable and efficient patent analysis process, helping examiners and researchers find relevant prior art more effectively.', 'source_documents': [Document(metadata={'source': '/Users/xitij/Documents/RAG_Projects/Papers/MD/2111.09741v1.md', 'start_index': 10285}, page_content='as an annotation process. We propose a novel dataset to highlight patent paragraphs automatically, again this is a\nmulti-class labeled dataset where a pertinent evidence for the correctn