# Extracting Text from PDFs

In [1]:
import os
from PyPDF2 import PdfReader
import numpy as np

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    extracted_text = ""
    for page in reader.pages:
        extracted_text += page.extract_text()
    return extracted_text

def extract_text_from_pdfs_in_directory(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory, filename)
            extracted_text = extract_text_from_pdf(pdf_path)
            txt_filename = os.path.splitext(filename)[0] + ".txt"
            txt_filepath = os.path.join(directory, txt_filename)
            with open(txt_filepath, "w") as txt_file:
                txt_file.write(extracted_text)

# Specify the directory containing PDF files
directory_path = "Docs/"

# Extract text from PDFs in the directory and save as text files
extract_text_from_pdfs_in_directory(directory_path)

In [2]:
import os
from nltk.tokenize import sent_tokenize

directory_path = "Docs"

# List all .txt files in the directory
txt_files = [file for file in os.listdir(directory_path) if file.endswith('.txt')]

# List to store sentences from all files
all_sentences = []

# Read each text file, split into sentences, and store
for txt_file in txt_files:
    file_path = os.path.join(directory_path, txt_file)
    with open(file_path, "r") as file:
        text = file.read()
        sentences = sent_tokenize(text)
        all_sentences.extend(sentences)

# Print the first few sentences as an example
print(all_sentences[:10])  # Print first 10 sentences


['The Claude 3 Model Family: Opus, Sonnet, Haiku\nAnthropic\nAbstract\nWe introduce Claude 3, a new family of large multimodal models – Claude 3 Opus , our\nmost capable offering, Claude 3 Sonnet , which provides a combination of skills and speed,\nandClaude 3 Haiku , our fastest and least expensive model.', 'All new models have vision\ncapabilities that enable them to process and analyze image data.', 'The Claude 3 family\ndemonstrates strong performance across benchmark evaluations and sets a new standard on\nmeasures of reasoning, math, and coding.', 'Claude 3 Opus achieves state-of-the-art results\non evaluations like GPQA [1], MMLU [2], MMMU [3] and many more.', 'Claude 3 Haiku\nperforms as well or better than Claude 2 [4] on most pure-text tasks, while Sonnet and\nOpus significantly outperform it.', 'Additionally, these models exhibit improved fluency in\nnon-English languages, making them more versatile for a global audience.', 'In this report,\nwe provide an in-depth analysis o

# Generating Embedding for the text using FastEmbed

In [4]:
from fastembed import TextEmbedding
import numpy as np
import time

# Initialize the TextEmbedding model
embedding_model = TextEmbedding(model_name="BAAI/bge-base-en", cache_dir="./embeddings")

def embed_documents(documents):
    embeddings = []
    for document in documents:
        # Embed document using FastEmbed
        embedding = np.array(list((embedding_model.embed([document]))))
        
        # Append the embedding to the list of embeddings
        embeddings.append(embedding)
    
    return embeddings

# Define the documents
documents = all_sentences

# Perform embedding generation
embeddings = embed_documents(documents)

In [8]:
embeddings = [sublist[0] for sublist in embeddings]

# Starting Qdrant-Client

In [5]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct

client = QdrantClient(
    url="https://c065099d-b51c-4e03-b680-646b177fc993.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key="LojX3SvBqFF93pknBnlK8J5QbVR98wWzypfg7UTr4lyhdeakTYLsSA",
    https=True,
)
collection_name = 'RAG-Usage-Example'
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

True

In [6]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='fastembed_collection'), CollectionDescription(name='RAG-Usage-Example')])

# Uploading Embedding to Qdrant Vector DB

In [11]:
client.upload_points(
    collection_name=collection_name,
    points=[
        PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload={"text": text}
        )
        for idx, (vector, text) in enumerate(zip(embeddings, documents))
    ]
)

# Building a RAG System with OpenaI for any Query

In [35]:
from typing import List
from qdrant_client import QdrantClient
from openai import OpenAI
OpenAI_client = OpenAI(api_key='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')

# Function to generate completion from prompt
def generate_completion(prompt):
    completion = OpenAI_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are assisting in answering a question."},
        {"role": "user", "content": prompt}
    ]
    )
    return completion.choices[0].message.content

# Function to embed Queries
def embed_query(Question):
    return np.array(list(embedding_model.embed([Question])))


# Initialize Qdrant Client
client = QdrantClient(
    url="https://c065099d-b51c-4e03-b680-646b177fc993.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key="LojX3SvBqFF93pknBnlK8J5QbVR98wWzypfg7UTr4lyhdeakTYLsSA",
    https=True,
)

Question = "Can AI Models be hacked?"
query_embeddings = embed_query(Question)

collection_name = 'RAG-Usage-Example'
all_text = ""

# Retrieve all hits and concatenate texts into a single prompt
for query_embedding in query_embeddings:
    query_vector: List[np.ndarray] = list(query_embedding)
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=8 
    )

    for hit in hits:
        text = hit.payload["text"]
        all_text += text + "\n\n"

# Generate completion using all texts as a single prompt
prompt = f"Given the following text, answer the following question:\n\n{all_text}\n\nQuestion: What is the main idea of the text?\n\nAnswer:"
completion = generate_completion(prompt)

print("Generated Response:")
print(completion)


Generated Summary:
The main idea of the text is to discuss different aspects related to the ethical and social risks associated with language models, including exploring adversarial system messages, safety in dialogue systems, measuring model behaviors in mimicking human falsehoods, and assessing the potential existential risks posed by power-seeking AI.
