In [1]:
! pip install openai PyMuPDF

Collecting openai
  Downloading openai-1.35.3-py3-none-any.whl (327 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.4/327.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K

In [14]:
import fitz
import openai
import pandas as pd
from tqdm import tqdm
from typing import List

class PDFQnAGenerator:
    def __init__(self, pdf_path: str, openai_api_key: str):
        self.pdf_path = pdf_path
        self.openai_api_key = openai_api_key
        self.scraped_content = self.read_pdf_content()
        self.openai_client = openai.OpenAI(api_key=self.openai_api_key)
        self.raw_content_questions = []
        self.raw_content_answers = []

    def read_pdf_content(self) -> List[str]:
        """
        Reads a PDF and returns its content as a list of strings.

        Returns:
        list of str: A list where each element is the text content of a PDF page.
        """
        content_list = []
        with fitz.open(self.pdf_path) as doc:
            for page in doc:
                content_list.append(page.get_text())

        return content_list

    def process_scraped_content(self):
        """
        Process scraped content to replace special characters and split into sentences.
        """
        self.scraped_content = ' '.join(self.scraped_content)
        self.scraped_content = [self.scraped_content.split('. ')[i].replace('\n', '').replace('   ', '').replace('  ', '') for i in range(len(self.scraped_content.split('. ')))]

    def call_chatgpt(self, query: str, model: str = "gpt-3.5-turbo") -> str:
        """
        Generates a response to a query using the specified language model.
        Args:
            query (str): The user's query that needs to be processed.
            model (str, optional): The language model to be used. Defaults to "gpt-3.5-turbo".
        Returns:
            str: The generated response to the query.
        """

        # Prepare the conversation context with system and user messages.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Question: {query}."},
        ]

        # Use the OpenAI client to generate a response based on the model and the conversation context.
        response = self.openai_client.chat.completions.create(
            model=model,
            messages=messages,
        )

        # Extract the content of the response from the first choice.
        content: str = response.choices[0].message.content

        # Return the generated content.
        return content

    def prompt_engineered_api(self, text: str) -> str:
        """
        Generate a question based on the provided text content.
        """
        prompt = f"""
            I have the following content: {text}

            Write one question based on the content above. Just write ONE question in a sentence. No more.
        """

        resp = self.call_chatgpt(prompt)

        return resp

    def generate_questions_answers(self):
        """
        Generate questions and answers from the scraped content.
        """
        for i in tqdm(range(len(self.scraped_content))):
            quest = self.scraped_content[i]
            resp = self.prompt_engineered_api(quest)
            this_sample_question = resp.split("###")[0]
            this_sample_answer = self.scraped_content[i]
            self.raw_content_questions.append(this_sample_question)
            self.raw_content_answers.append(this_sample_answer)

    def convert_to_dataframe(self) -> pd.DataFrame:
        """
        Converts a list of questions and answers into a Pandas DataFrame.

        Returns:
            - Pandas DataFrame: The resulting data frame with columns for each question-answer pair.
        """

        # Convert lists to Series objects for easier indexing
        qns_series = pd.Series([question + "\n" for question in self.raw_content_questions])
        ans_series = pd.Series(self.raw_content_answers)

        # Create a data frame from the Series objects
        df = pd.DataFrame({"Question": qns_series, "Answer": ans_series})

        # Split the question column by \n character and split the answer column by newline characters (\r\n or \n). This ensures that each row contains only one question and its corresponding answer.
        df["Question"] = df["Question"].str.split("\n")
        df["Answer"] = df["Answer"].str.split("\r\n|\n")

        # Reshape the data frame so that it has one row for each question and its corresponding answer. Drop any rows where there are no answers provided.
        df = df.explode("Question").reset_index().dropna()

        # Save a .csv file
        file_path_collapsed = self.pdf_path.replace("/", "_").replace(" ", "_")
        df.to_csv(f"questions_answers__{file_path_collapsed}.csv", index=False)

        return df


In [15]:
from google.colab import userdata

In [17]:
# Example usage:
pdf_path = "/content/1 - Individual Research Program - Syllabus - Amogh.pdf"
openai_api_key = userdata.get('OPENAI_API_KEY')
generator = PDFQnAGenerator(pdf_path, openai_api_key)
generator.process_scraped_content()
generator.generate_questions_answers()
df = generator.convert_to_dataframe()
print(df)


100%|██████████| 35/35 [00:23<00:00,  1.48it/s]

    index                                           Question  \
0       0  What is the motivation behind the project prop...   
1       0                                                      
2       1  What pre-trained model will students use in th...   
3       1                                                      
4       2  What steps will be taken to ensure that studen...   
..    ...                                                ...   
65     32                                                      
66     33  What steps should be taken to ensure all feedb...   
67     33                                                      
68     34  What steps are needed to finalize the submissi...   
69     34                                                      

                                               Answer  
0   [Independent Research Program | Data Science, ...  
1   [Independent Research Program | Data Science, ...  
2   [The motivation for this project is to utilize...  
3   [Th


