<a href="https://colab.research.google.com/github/vera-lovelace/GenAI-final/blob/graphRAG/Extended_RAG_Model_GraphRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Mini Project
## Milestone #2 : Vectorise and store Chunks

Use the embedding code from Assignment A1 to create embeddings from the text chunks that were generated and saved in a pickle file in Milestone #1.

Create a Python dictionary as a vector database, using the embedding vector as the key (note: convert the list of embeddings to a tuple so that it is hashable) and the text as the value.
Experiment with some queries and use cosine similarity to get the most similar text from your vector database.
If the results are not satisfactory, you may want to refactor your code by:
- changing the embedding technique
- modifying the chunking technique from Milestone #1. Your code should be modular enough to make this fairly straightforward if needed. It is what software development is all about.
When satisfied, store your Python dict (vector db) in a pickle file.


### Deliverables: Zip file with

- Jupyter Notebook
- Summary of your efforts (issues, success in matching chunks to queries based on embeddings, …)
- Pickle file with the Python vector database for use in the final Mini Project Deliverable

In [None]:
# Imports
!pip install python-docx
!pip install docx
!pip install rdflib
!pip install nltk

from docx import Document
from io import BytesIO
import re
import os
from pathlib import Path

from google.colab import files
import pickle
import nltk
nltk.download('punkt_tab')

import numpy as np
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

import spacy
from rdflib import Graph, Literal

from torch.nn.functional import cosine_similarity as torch_cosine_similarity
import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Initialize the sentence-embedding model. This single module-level instance is
# shared by the chunk embedding, query embedding and triple embedding code below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the TF-IDF vectorizer. It is shared at module level and is refit
# by calculate_sentence_similarities on every call.
vectorizer = TfidfVectorizer()

/content/sample_data/mydata

In [None]:
def extract_text_from_docx(file_paths):
    """Extract the text of one or more Word documents as a list of sentences.

    Args:
        file_paths: Iterable of documents. Each item is either a path accepted
            by ``Document`` or the raw ``bytes`` of a .docx file.

    Returns:
        list[str]: Sentences of all documents, in document order. Empty
        fragments are dropped, so an input without any text yields ``[]``.

    Raises:
        Exception: If a document cannot be opened or read. The original error
            is chained as the cause.
    """
    try:
        paragraphs = []
        for file_path in file_paths:
            # Handle both file path and binary content
            if isinstance(file_path, bytes):
                doc = Document(BytesIO(file_path))
            else:
                doc = Document(file_path)

            for para in doc.paragraphs:
                # Collapse every whitespace run (newlines included) to one space
                text = re.sub(r'\s+', ' ', para.text.strip())
                if text:  # Skip empty paragraphs
                    paragraphs.append(text)

        # Joining (instead of appending "text + ' '") leaves no trailing space,
        # which previously produced a trailing empty "sentence".
        full_text = ' '.join(paragraphs)

        # Split after '.', '!', '?' or '-' when followed by spaces
        sentences = re.split(r'(?<=[.!?-]) +', full_text)

        # re.split returns [''] for empty text; never hand empty sentences on
        return [sentence for sentence in sentences if sentence]
    except Exception as e:
        raise Exception(f"Error processing document: {str(e)}") from e

def calculate_sentence_similarities(sentences):
    """
    Build the sentence-to-sentence cosine similarity matrix from TF-IDF vectors.

    Args:
        sentences: List of sentences to compare

    Returns:
        Pairwise similarity matrix, or an empty array when there are no sentences
    """
    if sentences:
        # Refit the module-level vectorizer on exactly this set of sentences
        sentence_vectors = vectorizer.fit_transform(sentences)
        return sklearn_cosine_similarity(sentence_vectors)

    # Nothing to compare
    return np.array([])

def find_similar_sentences(sentences, similarity_matrix: np.ndarray, threshold: float = 0.5):
    """
    Greedily group sentences whose pairwise similarity meets a threshold.

    Sentences are visited in order. Each unused sentence starts a new group,
    and a later unused sentence joins it only if it is similar enough to
    every sentence already in the group.

    Args:
        sentences: List of sentences
        similarity_matrix: Pairwise similarity matrix, indexed [i][j] by
            sentence position
        threshold: Minimum similarity a sentence needs against all current
            group members to join the group (default 0.5)

    Returns:
        List of groups of similar sentence indices; every index appears in
        exactly one group

    Raises:
        Exception: If the matrix cannot be indexed for the given sentences.
            The original error is chained as the cause.
    """
    try:
        sentence_groups = []
        used_indices = set()
        total = len(sentences)

        for i in range(total):
            if i in used_indices:
                continue

            # Start new group with current sentence
            current_group = [i]
            used_indices.add(i)

            # Only later sentences can join; earlier ones are already placed
            for j in range(i + 1, total):
                if j in used_indices:
                    continue

                # all() stops at the first member that is not similar enough
                if all(similarity_matrix[j][k] >= threshold for k in current_group):
                    current_group.append(j)
                    used_indices.add(j)

            sentence_groups.append(current_group)

        return sentence_groups
    except Exception as e:
        raise Exception(f"Error grouping sentences: {str(e)}") from e


def _mean_pairwise_similarity(similarity_matrix, indices):
    """Return the mean similarity over all distinct pairs of the given sentence indices."""
    if len(indices) < 2:
        return 1.0  # a lone sentence has no pairs to average
    return np.mean([
        similarity_matrix[a][b]
        for pos, a in enumerate(indices)
        for b in indices[pos + 1:]
    ])


def _emit_chunk(report, number, indices, sentences, similarity_matrix):
    """Embed one chunk, write its entry to the report file, and return (embedding, text)."""
    chunk_text = ' '.join(sentences[i] for i in indices)
    # Tuples are hashable, so the embedding can be used as a dictionary key
    embedding = tuple(model.encode(chunk_text))
    avg_group_similarity = _mean_pairwise_similarity(similarity_matrix, indices)

    report.write(f"{'='*80}\n")
    report.write(f"CHUNK {number}\n")
    report.write(f"Length: {len(chunk_text)} characters\n")
    report.write(f"Sentences: {len(indices)}\n")
    report.write(f"Average group similarity: {avg_group_similarity:.3f}\n")
    report.write(f"{'-'*40}\n")
    report.write(f"{chunk_text}\n\n")
    return embedding, chunk_text


def create_semantic_chunks(sentences, max_chunk_size, min_chunk_size,
                           output_path="/content/chunks.txt"):
    """
    Build semantically coherent chunks from sentences and embed each chunk.

    Sentences are grouped by similarity, and the groups are accumulated into a
    chunk until adding the next group would exceed max_chunk_size. Every
    chunk is embedded with the module-level model and described in a text
    report.

    Args:
        sentences: List of sentences to chunk
        max_chunk_size: Character budget; a chunk is closed before a group
            that would push it past this size
        min_chunk_size: Minimum accumulated length for the last, left-over
            chunk to be kept
        output_path: File the human-readable chunk report is written to

    Returns:
        Tuple (vectors_dict, vectors): vectors_dict maps each embedding tuple
        to its chunk text, vectors lists the embedding tuples in chunk order.
        Both are empty when there are no sentences.

    Raises:
        Exception: If embedding or writing the report fails. The original
            error is chained as the cause.
    """
    if not sentences:
        # Same shape as the normal return value so callers can always unpack
        return {}, []

    # Calculate sentence similarities
    similarity_matrix = calculate_sentence_similarities(sentences)

    # Group similar sentences
    sentence_groups = find_similar_sentences(sentences, similarity_matrix)

    vectors = []
    vectors_dict = {}
    # Track sentence indices (not text) so similarities are looked up for the
    # sentences that are actually in the chunk
    current_indices = []
    current_length = 0

    try:
        # Write chunks to a text file with metadata.
        with open(output_path, 'w', encoding='utf-8') as f:

            for group in sentence_groups:
                group_length = len(' '.join(sentences[i] for i in group))

                # Check if adding this group would exceed max chunk size
                if current_indices and current_length + group_length > max_chunk_size:
                    embedding, chunk_text = _emit_chunk(
                        f, len(vectors) + 1, current_indices, sentences, similarity_matrix)
                    vectors_dict[embedding] = chunk_text
                    vectors.append(embedding)

                    # Start new chunk
                    current_indices = list(group)
                    current_length = group_length
                else:
                    current_indices.extend(group)
                    current_length += group_length

            # Add final chunk if it meets minimum size
            if current_length >= min_chunk_size:
                embedding, chunk_text = _emit_chunk(
                    f, len(vectors) + 1, current_indices, sentences, similarity_matrix)
                vectors_dict[embedding] = chunk_text
                vectors.append(embedding)

    except Exception as e:
        raise Exception(f"Error writing to document: {str(e)}") from e
    return vectors_dict, vectors


# Find cosine similarity of given query and sentences
def find_similar_query_sentences(query, embeddings, threshold=0.5):
    """
    Find the stored embeddings whose cosine similarity to the query meets a threshold.

    Args:
        query: Embedding vector of the query
        embeddings: List of text embeddings
        threshold: Minimum cosine similarity for an embedding to be returned
            (default 0.5)

    Returns:
        List of the matching embeddings, in the order they appear in
        ``embeddings`` (not ranked by similarity)
    """
    # The query norm does not change between candidates, so compute it once
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        # Cosine similarity is undefined for a zero vector: nothing can match
        return []

    similar_sentences = []
    for embedding in embeddings:
        embedding_norm = np.linalg.norm(embedding)
        if embedding_norm == 0:
            # Skip instead of dividing by zero (which yields NaN and a warning)
            continue

        similarity = np.dot(query, embedding) / (query_norm * embedding_norm)
        if similarity >= threshold:
            similar_sentences.append(embedding)
    return similar_sentences

In [None]:
# Load RDF triples from a Turtle file
def load_triples_from_ttl(file_path):
    """Parse the Turtle file at file_path and return its triples as a list."""
    rdf_graph = Graph()
    rdf_graph.parse(file_path, format="ttl")
    triples = [triple for triple in rdf_graph]
    return triples

# Convert a triple to a readable sentence
def triple_to_text(s, p, o):
    """Render a triple as "subject predicate object" using each term's local name.

    The local name is the part after the last '#', or after the last '/' when
    the term has no '#'. Underscores in the predicate become spaces and all
    double quotes are removed from the result.
    """
    def local_name(term):
        separator = "#" if "#" in term else "/"
        # rpartition returns the whole term when the separator is absent
        return term.rpartition(separator)[2]

    subject_name, predicate_name, object_name = (
        local_name(term) for term in (s, p, o)
    )
    predicate_words = predicate_name.replace('_', ' ')
    sentence = f"{subject_name} {predicate_words} {object_name}"
    return sentence.replace('"', '')

# Embed a list of sentences
def embed_texts(texts):
    """Encode texts with the module-level model and return the embeddings as a tensor."""
    encoded = model.encode(texts, convert_to_tensor=True)
    return encoded

# Retrieve top-k most similar triples
def retrieve_top_k(query, triple_texts, triple_embeddings, k=4):
    """Return the texts of the k triples most similar to the query.

    Args:
        query: Query string; it is embedded with embed_texts
        triple_texts: Triple sentences, parallel to triple_embeddings
        triple_embeddings: Tensor of embeddings, one row per entry of
            triple_texts
        k: Maximum number of triples to return

    Returns:
        List of at most k triple texts, most similar first. Empty when there
        are no triples or k is not positive.
    """
    # torch.topk raises when k exceeds the number of candidates, so clamp it
    k = min(k, len(triple_texts))
    if k <= 0:
        return []

    query_embedding = embed_texts([query])
    scores = torch_cosine_similarity(query_embedding, triple_embeddings)

    top_k_indices = torch.topk(scores, k).indices
    # tolist() turns the index tensor into plain ints for list indexing
    return [triple_texts[i] for i in top_k_indices.tolist()]

In [None]:
# Main - Note that chunk size to use is set here in main and overrides default
def main():
    """Run the RAG and GraphRAG retrieval experiments end to end.

    Reads every .docx file in ``content/docs``, chunks and embeds the text,
    and prints the chunks similar to each test query. It then loads the
    triples from ``content/privacy_and_security.ttl`` and prints the triples
    most similar to each query. Any error is caught and printed rather than
    raised.
    """
    try:
        # Directory containing Word documents
        directory = "content/docs"

        # Get all .docx files in the directory
        docx_files = list(Path(directory).glob("*.docx"))
        print(f"Found files: {docx_files}")

        if not docx_files:
            print(f"No Word documents found in {directory}")
            return

        print(f"Found {len(docx_files)} Word documents")

        sentences = extract_text_from_docx(docx_files)

        # create_semantic_chunks computes the similarity matrix and the
        # sentence groups itself, so they are not precomputed here.
        vectors_dict, vectors = create_semantic_chunks(sentences, max_chunk_size=300, min_chunk_size=100)

        # run queries to find similarity in chunks and graphs
        queries = ["When was the Tor network released?",
          "List where the GDPR approach was applied.",
          "How major data breaches impacted Apple and Microsoft?",
          "How privacy regulations affect various industries in the USA?",
          "When was the TRW Credit Data breach and how many credit records were exposed?",
          "How have approaches to data breach notification evolved since 2000, and what are the key differences between jurisdictions?",
          "What kind of data is protected by privacy acts?",
          "What privacy protection is applicable in California?",
          "Who is covered by privacy protection?",
          "What are the key differences between the articles tagged with PrivacyLaw?"]

        # === Querying RAG ===
        print("\n=== Querying RAG ===\n")
        for query in queries:
            query_embedding = model.encode(query)
            similar_sentences = find_similar_query_sentences(query_embedding, vectors)

            print(f"Query: {query}")
            print("Similar Sentences:")
            for sentence in similar_sentences:
                # The embedding tuple is the key of the vector database
                chunk = vectors_dict[tuple(sentence)]
                print(chunk)
                print('\n')

        # === Querying GraphRAG ===
        print("\n=== Querying GraphRAG ===\n")
        # Load and process triples
        triples = load_triples_from_ttl("content/privacy_and_security.ttl")
        triple_texts = [triple_to_text(str(s), str(p), str(o)) for s, p, o in triples]

        # Embed all triple texts
        triple_embeddings = embed_texts(triple_texts)

        for query in queries:
            # Retrieve top 30 relevant triples
            top_triples = retrieve_top_k(query, triple_texts, triple_embeddings, k=30)

            # Output results
            print(f"Query: {query}")
            print("Top Relevant Triples:")
            for t in top_triples:
                print("-", t)
            print("\n")

    except Exception as e:
        # Top-level boundary: any stage can fail here, not only the directory scan
        print(f"Error running RAG pipeline: {str(e)}")

# Call main to create the embeddings and run the queries
main()



Found files: [PosixPath('content/docs/3.Major Data Breaches and Their Impact on Privacy Regulation.docx'), PosixPath('content/docs/4.The Evolution of European Data Protection.docx'), PosixPath('content/docs/CCPA.docx'), PosixPath('content/docs/2.DevelopmentPrivacyProtectionUSA.docx'), PosixPath('content/docs/CPRA.docx'), PosixPath('content/docs/EU GDPR.docx'), PosixPath('content/docs/1.The Evolution of Privacy.docx'), PosixPath('content/docs/5.Global Approaches to Data Protection.docx'), PosixPath('content/docs/HIPAA.docx')]
Found 9 Word documents

=== Querying RAG ===

Query: When was the Tor network released?
Similar Sentences:
Became foundation for HTTPS Early 2000s Innovations: Privacy-Preserving Data Mining: - Developed in response to growing data collection - Techniques for anonymizing datasets - Statistical disclosure control methods - K-anonymity concept introduced Tor Network: - Released in 2002 -


Query: List where the GDPR approach was applied.
Similar Sentences:
The Genera

In [None]:
# Extract Chunks using document paragraphs
# Chunk size is controlled by parameter
def _pack_sentences_into_chunks(sentences, chunk_size):
    """Greedily pack sentences, in order, into chunks of at most chunk_size characters."""
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if not sentence:
            continue

        # Count the joining space too, so a chunk never exceeds chunk_size
        joined_length = len(current_chunk) + len(sentence) + (1 if current_chunk else 0)
        if joined_length <= chunk_size:
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
            continue

        # The sentence does not fit: close the chunk built so far
        if current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = ""

        if len(sentence) <= chunk_size:
            current_chunk = sentence
            continue

        # Sentence longer than chunk_size: split it on word boundaries. The
        # remainder stays in current_chunk so later sentences can join it.
        for word in sentence.split():
            candidate_length = len(current_chunk) + len(word) + (1 if current_chunk else 0)
            if candidate_length <= chunk_size:
                current_chunk = f"{current_chunk} {word}" if current_chunk else word
            else:
                # Never emit an empty chunk when the very first word is too long
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = word

    # Add the last chunk if not empty
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


def extract_fixed_chunks(file_path, chunk_size=1000):
    """
    Extract fixed-size chunks from a Word document.

    The document text is split into sentences, which are packed in order
    into chunks. Sentences longer than chunk_size are split on word
    boundaries. A single word longer than chunk_size is kept whole as its
    own chunk.

    Args:
        file_path (str or bytes): Path to Word document or binary content
        chunk_size (int): Maximum size of each chunk in characters

    Returns:
        list: List of non-empty text chunks of at most chunk_size characters
        (apart from single over-long words)

    Raises:
        Exception: If the document cannot be opened or read. The original
            error is chained as the cause.
    """
    try:
        # Handle both file path and binary content
        if isinstance(file_path, bytes):
            doc = Document(BytesIO(file_path))
        else:
            doc = Document(file_path)

        # Extract and clean all text
        paragraphs = []
        for para in doc.paragraphs:
            # Collapse every whitespace run (newlines included) to one space
            text = re.sub(r'\s+', ' ', para.text.strip())
            if text:  # Skip empty paragraphs
                paragraphs.append(text)

        # Split after '.', '!', '?' or '-' when followed by spaces
        sentences = re.split(r'(?<=[.!?-]) +', ' '.join(paragraphs))

        return _pack_sentences_into_chunks(sentences, chunk_size)

    except Exception as e:
        raise Exception(f"Error processing document: {str(e)}") from e