In [36]:
# ! pip install python-dotenv

In [37]:
from dotenv import load_dotenv
import os 

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Credentials and endpoint settings read from the environment.
# Each value is None when the corresponding variable is not set.
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
api_version = os.environ.get("API_VERSION")
api_endpoint = os.environ.get("AZURE_BASE_URL")
together_api = os.environ.get("TOGETHER_API_KEY")

In [38]:
# ! pip install gdown

# !gdown 1yVbhJWh4L1unDbDT4APOusTXlwic7aE9

### Extracting Text and Images from the PDF

In [39]:
# ! pip install PyMuPDF

* The guide linked below shows how to pass base64-encoded images to an OpenAI model.
* Note that the request field is still named `image_url`, but its value is the image itself encoded as a base64 data URL rather than a web link.

https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images

In [40]:
import base64

# Open the image file and encode it as a base64 string
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in text payloads.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

In [41]:
# ! pip install openai

In [42]:
from openai import AzureOpenAI

# Shared Azure OpenAI client built from the credentials read from the environment.
client = AzureOpenAI(
    azure_endpoint=api_endpoint,
    api_version=api_version,
    api_key=api_key,
)


def describe_image(base64_image, mime_type="image/png", max_tokens=300):
    """
    Ask the "gpt-4o-mini" chat deployment to transcribe and summarise an image.

    The image is sent inline as a base64 data URL in the ``image_url`` field of
    the user message, together with fixed system/user prompts asking the model
    to extract all text in its original structure and then summarise it.

    Args:
        base64_image: Base64-encoded image data as a string.
        mime_type: MIME type placed in the data URL. Defaults to "image/png",
            the value that was previously hard-coded; pass e.g. "image/jpeg"
            when the encoded bytes are not PNG.
        max_tokens: Upper bound on the length of the generated reply. Defaults
            to 300, the previously hard-coded value. Raise it for text-heavy
            images, otherwise the transcription may be cut short.

    Returns:
        The message content of the first choice in the response.

    Warns:
        UserWarning: If the reply stopped because ``max_tokens`` was reached,
            so a truncated transcription does not go unnoticed.
    """
    import warnings  # local import: only needed for the truncation notice

    response = client.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
        { "role": "system", "content": "Your job is to extract all the information from the images, includng the text. Extract all the text from the image without changing the order or structure of the information. recheck if all the text has been extracted correctly and return in the same presentation and structure as present in the original image. "},
         { "role": "user",
          "content": [
            {"type": "text", "text": "extract ALL the text from the image in the same structure as present in the image. and then after it summarise everything in brief, do not miss anything "},
            {
              "type": "image_url",
              "image_url": {
                # Despite the field name, the value is an inline base64 data URL.
                "url": f"data:{mime_type};base64,{base64_image}",
              },
            },
          ],
        }
      ],
      max_tokens=max_tokens,
    )

    first_choice = response.choices[0]

    # finish_reason == "length" means the reply hit the max_tokens cap.
    if getattr(first_choice, "finish_reason", None) == "length":
        warnings.warn(
            f"Image description was truncated at max_tokens={max_tokens}; "
            "increase max_tokens to capture the full text.",
            stacklevel=2,
        )

    return first_choice.message.content

In [43]:

import fitz  # PyMuPDF wrapper for simplicity

def extract_images_and_text_from_pdf(pdf_path, output_folder, max_pages=11):
    """
    Extract page text and embedded images from a PDF into one combined string.

    For every processed page, the page text is appended under a "Page N:"
    header. Each image found on the page is written to ``output_folder``,
    base64-encoded with ``encode_image`` and passed to ``describe_image``; the
    returned description is appended under an "[Image: <filename>]" header.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the extracted image files. It is
            created if it does not exist.
        max_pages: Maximum number of leading pages to process. Defaults to 11,
            which reproduces the earlier hard-coded ``page_number > 10`` cut-off
            on the zero-based page index (pages 1 through 11). Pass ``None`` to
            process every page.

    Returns:
        str: The page texts and image descriptions, in document order.
    """
    # The context manager closes the document even if a later step raises
    # (for example a failed API call inside describe_image).
    with fitz.open(pdf_path) as pdf_document:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(output_folder, exist_ok=True)

        # Collect fragments and join once instead of growing a string in a loop
        sections = []

        total_pages = len(pdf_document)
        page_limit = total_pages if max_pages is None else min(total_pages, max_pages)

        for page_number in range(page_limit):
            page = pdf_document.load_page(page_number)

            # Text layer of the current page
            text = page.get_text()
            sections.append(f"\n\nPage {page_number + 1}:\n{text}")

            # Save, encode and describe every image listed for this page
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first entry of each image record is its xref
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = f"page_{page_number+1}_img_{img_index+1}.{image_ext}"
                image_filepath = os.path.join(output_folder, image_filename)

                # Keep a copy of the image on disk
                with open(image_filepath, "wb") as image_file:
                    image_file.write(image_bytes)

                # Encode the saved image and ask the model to describe it
                base64_image = encode_image(image_filepath)
                image_description = describe_image(base64_image)

                sections.append(f"\n\n[Image: {image_filename}]\n{image_description}")

                print(f"Processed {image_filename} on page {page_number + 1}")

    print("Processing complete.")

    return "".join(sections)

# # Example usage
# pdf_path = "PA - Consolidated lecture notes.pdf"
# output_folder = "extracted_images_new"
# combined_text = extract_images_and_text_from_pdf(pdf_path, output_folder)

# # Optionally save the combined text to a file
# with open("combined_text.txt", "w") as text_file:
#     text_file.write(combined_text)

In [44]:
# ! pip install langchain-community

In [45]:
from langchain_community.document_loaders import TextLoader

# Loader for the combined transcript file; presumably the file written by the
# (commented-out) PDF extraction example — verify it exists before running.
loaders = TextLoader(file_path="combined_text.txt")

# Print the text of the first loaded document as a quick sanity check.
print(loaders.load()[0].page_content)



Page 1:
Product Sense -
How to tackle product strategy and business acumen rounds in
interviews?
Lecture Objective:
How to address business acumen questions round :
●
Analyzing a metric change. What’s a metric?
●
Defining metrics to measure performance / success of a new feature / product.
Importance of product strategy & business acumen rounds -
●
Apart from building models, developing dashboards and reporting frameworks -
One of the main responsibilities of a data scientist is to extract insights from
data and work with product managers and engineering teams to deliver
actionable plans to improve the product.
●
Product sense is about understanding all possibilities, not finding one correct
answer.
Example Questions: Product Acumen / Business Acumen
●
Why did Youtube’s traffic drop by 5%?
●
How would you measure the success of the “Save Post” feature on Facebook?
●
What metrics would you define to measure the health of the product search in
Amazon?
●
We have a dashboard tracking our

In [46]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split on blank lines first, then on single newlines; chunk size 500 with an
# overlap of 60 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_size=500,
    chunk_overlap=60,
)

In [47]:
# Re-load the transcript and cut it into chunks with the splitter configured above.
splits = text_splitter.split_documents(documents=loaders.load())

# Number of chunks produced.
len(splits)

47

In [48]:
%pip install -qU langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [49]:
from langchain_openai import AzureOpenAIEmbeddings

In [50]:
# Azure OpenAI addresses models through a deployment name chosen on the Azure
# resource. The FAISS cell that uses these embeddings fails with a
# "DeploymentNotFound" 404, presumably because the resource has no deployment
# named "text-embedding-3-large" (confirm in the Azure portal). The deployment
# name is therefore configurable via AZURE_EMBEDDING_DEPLOYMENT and falls back
# to the previous value when the variable is unset.
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
    azure_deployment=os.getenv("AZURE_EMBEDDING_DEPLOYMENT", "text-embedding-3-large"),
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=api_endpoint,
)

In [51]:

from langchain_community.vectorstores import FAISS

# Embed every chunk and build a FAISS vector store from the results.
db = FAISS.from_documents(documents=splits, embedding=embeddings)

# Report how many vectors the underlying index holds.
print(db.index.ntotal)

NotFoundError: Error code: 404 - {'error': {'code': 'DeploymentNotFound', 'message': 'The API deployment for this resource does not exist. If you created the deployment within the last 5 minutes, please wait a moment and try again.'}}