## Library

In [6]:
%%time

try:
    import PyPDF2
except:
    ! pip install PyPDF2
    import PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/232.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
CPU times: user 94.1 ms, sys: 25.8 ms, total: 120 ms
Wall time: 8.09 s


## Define `read_and_textify`

Reads PDF files and extracts the text of each page, breaking it into overlapping sentence-level segments. The function iterates over a list of PDF file paths, splits each page's text into sentences on `'. '`, and for every sentence emits one segment containing that sentence plus up to `chunk_size` neighbouring sentences on each side. It returns the list of segments together with a matching list of source identifiers (file path, page index, and sentence index).

In [44]:
import PyPDF2
from typing import List, Tuple

def _sentence_windows(text: str, chunk_size: int) -> List[str]:
    """Return one overlapping sentence window per sentence found in `text`."""
    sentences = text.split('. ')
    total = len(sentences)
    windows = []
    for j in range(total):
        # Window covers sentence j plus up to `chunk_size` sentences per side.
        start = max(0, j - chunk_size)
        end = min(total, j + chunk_size + 1)
        # split('. ') stripped the period after every piece except the last
        # one, so restore it only when the window stops before the page's end;
        # otherwise the tail would gain a period the source text never had.
        suffix = '.' if end < total else ''
        windows.append('. '.join(sentences[start:end]) + suffix)
    return windows


def read_and_textify(
    files: List[str], chunk_size: int = 50
) -> Tuple[List[str], List[str]]:
    """
    Reads PDF files and turns every page into overlapping sentence-level segments.

    Each page's text is split into sentences on '. '. For every sentence j one
    segment is emitted that contains sentences j - chunk_size .. j + chunk_size
    (clipped to the page), so consecutive segments overlap. A page with no
    extractable text contributes a single empty-string placeholder segment.

    Args:
    files (List[str]): A list of PDF file paths.
    chunk_size (int): Number of neighbouring sentences included on each side of
        the centre sentence. Must be >= 0. Default is 50.

    Returns:
    Tuple[List[str], List[str]]: A tuple containing two lists of equal length:
        1. The text segments, in file / page / sentence order.
        2. A source identifier per segment, formatted as
           "<file_path>_page_<page index>_chunk_<sentence index>" (both 0-based).

    Raises:
    ValueError: If chunk_size is negative.
    """
    if chunk_size < 0:
        # A negative radius would produce empty windows rendered as bare ".".
        raise ValueError(f"chunk_size must be >= 0, got {chunk_size}")

    text_list = []  # Extracted text segments
    sources_list = []  # Source identifier for each segment

    for file_path in files:
        with open(file_path, "rb") as file:
            pdfReader = PyPDF2.PdfReader(file)
            for i, pageObj in enumerate(pdfReader.pages):
                text = pageObj.extract_text()
                if text:
                    for j, chunk in enumerate(_sentence_windows(text, chunk_size)):
                        text_list.append(chunk)
                        sources_list.append(f"{file_path}_page_{i}_chunk_{j}")
                else:
                    # Keep a placeholder so every page is represented in the output
                    text_list.append("")
                    sources_list.append(f"{file_path}_page_{i}_chunk_0")
                pageObj.clear()  # Drop the page's contents once processed

    return text_list, sources_list

### Load Sample PDF

In [45]:
# Sample PDF to process; the path is kept in a list because the reader
# function takes a list of file paths.
sample_pdf_path = "/content/CL Vapaad Paper by Yin.pdf"
pdf_files = [sample_pdf_path]

### Read and Extract

In [55]:
# Extract segments and their source labels; chunk_size=1 keeps one
# neighbouring sentence on each side of every sentence.
extraction_result = read_and_textify(files=pdf_files, chunk_size=1)
text_segments, sources = extraction_result

### Create DataFrame

In [56]:
import pandas as pd

# Assemble one row per extracted segment: running id, source label, text.
segment_ids = list(range(len(text_segments)))
df = pd.DataFrame(
    {'i': segment_ids, 'source': sources, 'segment': text_segments}
)

# Show the resulting table
df

Unnamed: 0,i,source,segment
0,0,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"Dear Editor , \n \nI hope this letter finds yo..."
1,1,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"Dear Editor , \n \nI hope this letter finds yo..."
2,2,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"I am writing to submit our manuscript titled ""..."
3,3,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,Our \nwork introduces an innovative autoencode...
4,4,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,\n The abstract of our manuscript is as follow...
5,5,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,This study introduces the Vision Augmentation ...
6,6,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"Utilizing the Moving MNIST dataset, we demonst..."
7,7,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"V APAAD combines data augmentation, ConvLSTM2D..."
8,8,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,This methodology not only adheres to human \nc...
9,9,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,The experimental results confirm that V APAAD ...


## Install `OpenAI`

In [None]:
! pip install openai

In [53]:
from google.colab import userdata
from openai import OpenAI

In [54]:
# Build the shared OpenAI client; the key is read from Colab's user secrets
# by name so it never appears in the notebook itself.
openai_secret_name = 'OPENAI_API_KEY'
openai_client = OpenAI(api_key=userdata.get(openai_secret_name))

### Define API Call for GPT Foundation Model

In [58]:
def call_gpt(prompt: str, content: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Asks a chat model a question about a piece of background content.

    The request is framed as a short scripted conversation: a system message,
    a user turn stating the question, an assistant turn asking for the
    background content, and a final user turn supplying that content. The text
    of the first returned choice is handed back to the caller.

    Args:
        prompt (str): The main question or topic that the user wants to address.
        content (str): Additional background information or details relevant to the prompt.
        model (str): Name of the chat model to query. Defaults to "gpt-3.5-turbo",
            the model this function originally hard-coded.

    Returns:
        str: The message content of the first completion choice.

    Note: 'openai_client' is assumed to be an already created and authenticated
          OpenAI client defined at notebook level before this function is called.
    """

    # Scripted conversation that leads the model to treat `content` as context
    messages = [
        # System message defining the assistant's role
        {"role": "system", "content": "You are a helpful assistant."},
        # User message containing the prompt
        {"role": "user", "content": f"I want to ask you a question: {prompt}"},
        # Assistant message asking for background content
        {"role": "assistant", "content": "What is the background content?"},
        # User providing the background content
        {"role": "user", "content": content},
    ]

    response = openai_client.chat.completions.create(model=model, messages=messages)

    # Only the first choice is used
    return response.choices[0].message.content

### Create Related Content-Based Questions

In [61]:
%%time

# Ask the model for one question per extracted segment, in row order.
related_questions = [
    call_gpt(prompt="create question from the content", content=segment_text)
    for segment_text in df["segment"]
]

CPU times: user 459 ms, sys: 30.6 ms, total: 490 ms
Wall time: 54.7 s


In [62]:
# Attach the generated questions to their segments as a new column.
question_column = 'related questions'
df[question_column] = related_questions

### Create Embeddings

When creating embeddings, note that we embed the *related questions*, not the original text segments.

In [64]:
def list_to_nums(
    sentences: List[str], model: str = "text-embedding-3-small"
) -> List[List[float]]:
    """
    Converts a list of sentences into a list of numerical embeddings using an OpenAI embedding model.

    One embeddings request is sent per sentence, and the embedding of the first
    returned item is collected, so the output order matches the input order.

    Args:
    - sentences (List[str]): A list of sentences (strings).
    - model (str): Name of the embedding model to use. Defaults to
      "text-embedding-3-small", the model this function originally hard-coded.

    Returns:
    - List[List[float]]: One embedding vector per input sentence.

    Note: relies on the notebook-level 'openai_client' being created beforehand.
    """
    return [
        openai_client.embeddings.create(input=sentence, model=model).data[0].embedding
        for sentence in sentences
    ]

In [72]:
%%time

# Embed the generated questions (not the raw segments), in row order.
question_texts = df['related questions'].to_list()
generated_embeddings = list_to_nums(question_texts)

CPU times: user 99.4 ms, sys: 171 µs, total: 99.5 ms
Wall time: 2.41 s


In [73]:
# Store each question's embedding vector alongside its row.
embedding_column = 'embeddings'
df[embedding_column] = generated_embeddings

### Produce Final Results

In [78]:
# Preview the first six rows, limited to the first four columns.
df.head(6).iloc[:, :4]

Unnamed: 0,i,source,segment,related questions
0,0,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"Dear Editor , \n \nI hope this letter finds yo...",What is the title of the manuscript you are su...
1,1,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"Dear Editor , \n \nI hope this letter finds yo...",What is the title of the manuscript that the a...
2,2,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,"I am writing to submit our manuscript titled ""...","Based on the content you provided, a possible ..."
3,3,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,Our \nwork introduces an innovative autoencode...,What is the novel framework proposed in the st...
4,4,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,\n The abstract of our manuscript is as follow...,What is the novel approach introduced in the s...
5,5,/content/CL Vapaad Paper by Yin.pdf_page_0_chu...,This study introduces the Vision Augmentation ...,How does the Vision Augmentation Prediction Au...
