In [1]:
import os
import re
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from IPython.display import Markdown, display
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

In [2]:
# Directory (relative to the notebook's working directory) that is scanned
# for source PDFs by extract_texts_from_pdf.
pdf_path = "../docs"
def _classify_doc_type(page_text):
    """Return "act" if the first three lines of ``page_text`` mention an Act, else "constitution"."""
    heading_lines = page_text.strip().split('\n')[:3]  # Get first 3 lines
    combined_heading = " ".join(heading_lines).lower()
    # Whole-word match: a plain substring test also fires on words such as
    # "enact", "contract" or "practice".
    if re.search(r"\bacts?\b", combined_heading):
        return "act"
    return "constitution"


def extract_texts_from_pdf(pdf_path):
    """Read every PDF in a directory and return its text plus basic metadata.

    Args:
        pdf_path: Directory containing the PDF files to ingest. Entries that
            are not regular files ending in ".pdf" (case-insensitive) are
            skipped.

    Returns:
        A list with one dict per PDF, in sorted file-name order. Each dict has
        a "text" key (the text of every page that yields any, each followed by
        a newline) and a "metadata" key holding "file_name" (lower-cased),
        "total_pages" and "doc_type". "doc_type" is "act" or "constitution",
        decided from the heading of the first page that contains text; it is
        None only when no page yields any text.
    """
    docs_combined = []

    # Sorted so the result order is reproducible; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(pdf_path)):
        path = os.path.join(pdf_path, filename)

        # Skip sub-directories and stray files (.DS_Store, notes, ...) that
        # PdfReader cannot open.
        if not os.path.isfile(path) or not filename.lower().endswith(".pdf"):
            continue

        reader = PdfReader(path)
        total_pages = len(reader.pages)

        page_texts = []
        doc_type = None

        for page in reader.pages:
            text = page.extract_text()
            if not text:
                continue
            page_texts.append(text)

            # Classify from the first page that actually has text, so a blank
            # or image-only cover page does not leave doc_type unset.
            if doc_type is None and text.strip():
                doc_type = _classify_doc_type(text)

        docs_combined.append({
            # Join once instead of growing a string page by page.
            "text": "".join(text + "\n" for text in page_texts),
            "metadata": {
                "file_name": filename.lower(),
                "total_pages": total_pages,
                "doc_type": doc_type,
            },
        })

    return docs_combined

In [3]:
# Load every PDF under pdf_path: one {"text", "metadata"} dict per file.
docs_combined = extract_texts_from_pdf(pdf_path)

In [4]:
# Dump the extracted documents to disk for manual inspection. The file holds
# the Python repr of the list (not JSON), followed by a newline.
os.makedirs("../data", exist_ok=True)  # a fresh checkout may lack the directory
with open("../data/combined_texts.txt", "w", encoding="utf-8") as file:
    file.write(str(docs_combined) + "\n")

In [5]:
# Sanity check: number of documents loaded (one list entry per file read).
len(docs_combined)

7

In [6]:
def split_text_by_chapters(full_text, include_preamble=False):
    """Split a document into chapters at "CHAPTER <x>" / "Chapter <x>" headings.

    Args:
        full_text: Complete text of one document.
        include_preamble: When True, non-blank text that precedes the first
            heading (or the whole text, if there is no heading at all) is
            returned as an extra leading element instead of being discarded.
            Defaults to False, which returns heading-led chapters only.

    Returns:
        A list of strings, one per heading found, each formatted as
        "<heading>" + newline + "<stripped chapter body>". Empty when the text
        has no heading and ``include_preamble`` is False.
    """
    # One capture group, so re.split() returns
    # [preamble, heading1, body1, heading2, body2, ...].
    # - \b stops "CHAPTER" from matching inside a longer word.
    # - The optional dash-joined tail keeps hyphenated numerals such as
    #   "TWENTY-ONE" whole; a bare \w+ would title the chapter
    #   "CHAPTER TWENTY" and push "-ONE" into the body.
    # NOTE: in-sentence references ("see Chapter 5") still count as headings.
    chapter_pattern = r'(\b(?:CHAPTER|Chapter)\s+\w+(?:[-\u2010-\u2013]\w+)*)'
    parts = re.split(chapter_pattern, full_text)

    chapters = []

    preamble = parts[0].strip()
    if include_preamble and preamble:
        chapters.append(preamble)

    # Headings sit at odd indices and each is always followed by its body.
    for title, content in zip(parts[1::2], parts[2::2]):
        chapters.append(f"{title.strip()}\n{content.strip()}")

    return chapters


def chunk_docs_by_chapter(docs_combined, chunk_size=1000, chunk_overlap=150):
    """Split each document into chapter-scoped chunks that carry its metadata.

    Args:
        docs_combined: Iterable of {"text": str, "metadata": dict} entries.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter.

    Returns:
        A list of {"text": chunk, "metadata": {...}} dicts. Each metadata dict
        is a copy of the document's metadata plus "chapter_title", the first
        line of the chapter the chunk came from. Documents in which no chapter
        heading is found are chunked as a whole with an empty "chapter_title"
        rather than being dropped. Documents with blank text produce no chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    chunked_docs = []

    for doc in docs_combined:
        full_text = doc["text"]
        metadata = doc["metadata"]

        chapters = split_text_by_chapters(full_text)

        if chapters:
            # The first line of every chapter string is its heading.
            sections = [
                (chapter.split('\n')[0].strip(), chapter) for chapter in chapters
            ]
        elif full_text.strip():
            # No chapter heading found: keep the document instead of silently
            # losing it from the index.
            sections = [("", full_text)]
        else:
            sections = []

        for chapter_title, section_text in sections:
            for chunk in splitter.split_text(section_text):
                chunked_docs.append({
                    "text": chunk,
                    "metadata": {
                        **metadata,
                        "chapter_title": chapter_title
                    }
                })

    return chunked_docs

In [7]:
# Chunk all loaded documents using the function defaults
# (chunk_size=1000, chunk_overlap=150).
chunked_documents = chunk_docs_by_chapter(docs_combined)

In [8]:
# Preview only the first few chunks; dumping the whole list floods the cell
# output and bloats the saved notebook.
chunked_documents[:3]

[{'text': 'CHAPTER ONE\nPreliminary Matters\n1. Interpretation.\n2. Company and its officers.\n3. Definition of public officer.\n4. General rules of construction.\n5. Application of Part One to other offences.\n6. Jurisdiction of territorial waters.\n7. Acts done partly beyond the jurisdiction.\n8. Exclusion of the common law.\n9. Offences under more than one enactment.\n10. Saving for contempt of court.',
  'metadata': {'file_name': 'criminal offences act.pdf',
   'total_pages': 115,
   'doc_type': 'act',
   'chapter_title': 'CHAPTER ONE'}},
 {'text': 'CHAPTER TWO\nGeneral Explanations\n11. Intent.\n12. Negligence.\n13. Causing an event.\n14. Consent.\n15. Claim of right.\n16. Fraud.\n17. Meaning and use of threats.',
  'metadata': {'file_name': 'criminal offences act.pdf',
   'total_pages': 115,
   'doc_type': 'act',
   'chapter_title': 'CHAPTER TWO'}},
 {'text': 'CHAPTER THREE\nAttempts to commit Criminal Offences\n18. Attempt to commit a criminal offence.\np\n19. Preparation for co

In [9]:
# Wrap each chunk dict in a LangChain Document so the vector store can ingest it.
documents = [
    Document(metadata=entry["metadata"], page_content=entry["text"])
    for entry in chunked_documents
]

In [10]:
# Preview only the first few Document objects instead of the full list.
documents[:3]

[Document(metadata={'file_name': 'criminal offences act.pdf', 'total_pages': 115, 'doc_type': 'act', 'chapter_title': 'CHAPTER ONE'}, page_content='CHAPTER ONE\nPreliminary Matters\n1. Interpretation.\n2. Company and its officers.\n3. Definition of public officer.\n4. General rules of construction.\n5. Application of Part One to other offences.\n6. Jurisdiction of territorial waters.\n7. Acts done partly beyond the jurisdiction.\n8. Exclusion of the common law.\n9. Offences under more than one enactment.\n10. Saving for contempt of court.'),
 Document(metadata={'file_name': 'criminal offences act.pdf', 'total_pages': 115, 'doc_type': 'act', 'chapter_title': 'CHAPTER TWO'}, page_content='CHAPTER TWO\nGeneral Explanations\n11. Intent.\n12. Negligence.\n13. Causing an event.\n14. Consent.\n15. Claim of right.\n16. Fraud.\n17. Meaning and use of threats.'),
 Document(metadata={'file_name': 'criminal offences act.pdf', 'total_pages': 115, 'doc_type': 'act', 'chapter_title': 'CHAPTER THREE'}

EMBEDDINGS


In [11]:
# Embedding model shared by indexing and querying. Per the repr displayed
# further down in this notebook it produces 384-dimensional, normalised vectors
# with max_seq_length 256.
# NOTE(review): the recorded output shows this line emitting a warning (text
# truncated in the export) - check whether this class is deprecated in the
# installed LangChain version.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Embed every chunk and store it in a persistent Chroma collection.
# Explicit, deterministic ids make this cell safe to re-run: entries with the
# same id are overwritten, whereas auto-generated ids append a second copy of
# every chunk to the collection already persisted in ../chroma.
# NOTE(review): ids are positional, so entries left over from an earlier run
# (longer, or made with auto-generated ids) are not removed - delete ../chroma
# to rebuild from scratch.
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embedding_function,
    ids=[f"chunk-{index}" for index in range(len(documents))],
    persist_directory="../chroma",
    collection_name="osagyefo_v1"
)

In [13]:
# Explicitly flush the collection to persist_directory.
# NOTE(review): the recorded output shows this call emitting a warning (text
# truncated in the export); presumably newer Chroma versions persist
# automatically - confirm and drop this cell if it is redundant.
vector_store.persist()

  vector_store.persist()


In [14]:
# Display the embedding wrapper's repr (underlying model, pooling, dimensions).
embedding_function

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [15]:
# Fetch the stored contents of the collection as a dict.
all_docs = vector_store.get()

# Show which fields the result exposes (ids, documents, metadatas, ...).
print(all_docs.keys())
# Uncomment to inspect the first stored chunk's metadata:
# print(all_docs["metadatas"][0])

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])


In [16]:
# Fail loudly if ingestion produced nothing, instead of relying on a reader
# to eyeball the number.
# NOTE: _collection is a private attribute of the LangChain wrapper.
chunk_count = vector_store._collection.count()
assert chunk_count > 0, "vector store is empty - ingestion produced no chunks"
print(chunk_count)


1323


In [17]:
# Smoke-test query against the freshly built store.
query = "What are the penalties for theft?"

# Perform a similarity search, returning at most k=5 chunks.
results = vector_store.similarity_search(query, k=5)

In [18]:
# Inspect the retrieved chunks.
# NOTE(review): the first hit in the recorded output is about prison-related
# offences rather than theft - retrieval quality looks worth checking.
results

[Document(metadata={'chapter_title': 'CHAPTER FOUR', 'doc_type': 'act', 'total_pages': 115, 'file_name': 'criminal offences act.pdf'}, page_content='sentence.\n227.   Breaches of prison discipline\nRepealed.61(61)\n228.   Smuggling things into prisons\nRepealed.62(62)\n229.   Interference with prisoners outside prisons\nRepealed.63(63)\n230.   Prison officer leaving prisoner when outside prison\nRepealed.64(64)\n230A.   Aiding escape\nA prison officer who directly or indirectly aids, encourages, induces or facilitates the escape of a\nperson in lawful custody commits a second degree felony.65(65)\n231.   Oppression by prison officer\nRepealed.66(66)\n232.   Preventing the death penalty\nA person who endeavours by force to prevent the ex ecution of a person sentenced to death commits a\nsecond degree felony.\n233.   Advertising reward for the return of stolen property\nA person commits a criminal offence and is liable to a fine not exceeding twenty-five penalty units\nwho\n            (

In [20]:
# Open the same collection from persist_directory through a new Chroma object
# (no documents passed in), to check that it can be queried after being
# persisted.
vector_store_1 = Chroma(
    embedding_function=embedding_function,
    persist_directory="../chroma",
    collection_name="osagyefo_v1"
)

In [21]:
# Same smoke-test query, this time against the store opened from disk; the
# results should match those from the in-memory store.
query = "What are the penalties for theft?"

# Perform a similarity search, returning at most k=5 chunks.
results = vector_store_1.similarity_search(query, k=5)
results

[Document(metadata={'file_name': 'criminal offences act.pdf', 'doc_type': 'act', 'total_pages': 115, 'chapter_title': 'CHAPTER FOUR'}, page_content='sentence.\n227.   Breaches of prison discipline\nRepealed.61(61)\n228.   Smuggling things into prisons\nRepealed.62(62)\n229.   Interference with prisoners outside prisons\nRepealed.63(63)\n230.   Prison officer leaving prisoner when outside prison\nRepealed.64(64)\n230A.   Aiding escape\nA prison officer who directly or indirectly aids, encourages, induces or facilitates the escape of a\nperson in lawful custody commits a second degree felony.65(65)\n231.   Oppression by prison officer\nRepealed.66(66)\n232.   Preventing the death penalty\nA person who endeavours by force to prevent the ex ecution of a person sentenced to death commits a\nsecond degree felony.\n233.   Advertising reward for the return of stolen property\nA person commits a criminal offence and is liable to a fine not exceeding twenty-five penalty units\nwho\n            (