## Download

In [None]:
pip install openai

## Import OpenAI

In [78]:
import openai
openai.api_key = "ENTER API KEY HERE"

In [None]:
pip install PyPDF2

## Define `extract_df_text` Function

The following function extract pdf assuming each page is a big string.

In [None]:
from typing import List
from PyPDF2 import PdfReader


def extract_pdf_text(filepath: str) -> List[str]:
    """
    Extracts text from each page of a PDF file using PyPDF2 and returns it as a list of strings.

    Parameters:
    filepath (str): The file path or URL of the PDF file to extract text from.

    Returns:
    List[str]: A list of strings containing the extracted text from each page of the PDF.
    """
    pdf_file = open(filepath, 'rb')
    status = True
    try:
        pdf_reader = PdfReader(pdf_file)
        pages = len(pdf_reader.pages)

        text_list = []
        for page in range(pages):
            pdf_page = pdf_reader.pages[page]
            text = pdf_page.extract_text()
            text_list.append(text)
    except Exception as e:
        status = False

    if status == False:
        try:
            text_list = []
            for i in PdfReader(open(name, 'rb')).pages:
                text_list.append(i.extract_text())
        except Exception as e:
            text_list = "Failed."
    else:
        print(f"Task status: {text_list}")

    pdf_file.close()
    return text_list


The following breaks PDF down and extract by token sizes.

In [117]:
from typing import List
import re
from PyPDF2 import PdfReader


def extract_pdf_text(filepath: str) -> List[str]:
    """
    Extracts text from each page of a PDF file using PyPDF2 and returns it as a list of strings,
    with each string containing approximately 200 tokens.

    Parameters:
    filepath (str): The file path or URL of the PDF file to extract text from.

    Returns:
    List[str]: A list of strings containing the extracted text from the PDF, split into paragraphs
               of approximately 200 tokens.
    """
    def split_text(text: str, token_limit: int = 100) -> List[str]:
        tokens = text.split()
        paragraphs = [' '.join(tokens[i:i+token_limit]) for i in range(0, len(tokens), token_limit)]
        return paragraphs

    pdf_file = open(filepath, 'rb')
    text_list = []

    try:
        pdf_reader = PdfReader(pdf_file)
        for pdf_page in pdf_reader.pages:
            text = pdf_page.extract_text()
            paragraphs = split_text(text)
            text_list.extend(paragraphs)
    except Exception as e:
        try:
            for i in PdfReader(open(name, 'rb')).pages:
                text = i.extract_text()
                paragraphs = split_text(text)
                text_list.extend(paragraphs)
        except Exception as e:
            text_list = ["Failed."]

    pdf_file.close()
    return text_list

### Scrape PDF

In [None]:
%%time
# name = "/content/the-economic-potential-of-generative-ai-the-next-productivity-frontier-vf.pdf"
# name = "/content/2020-2021-Larkin-Street-Youth-Services-Impact-Report.pdf"
# name = "/content/WEF_Adopting_AI_Responsibly_Guidelines_for_Procurement_of_AI_Solutions_by_the_Private_Sector_2023.pdf"
# name = "/content/THV-Flyer-and-application.pdf"
# name = "/content/MAIA_Abstract_18.pdf"
name = "/content/2020-2021-Larkin-Street-Youth-Services-Impact-Report.pdf"
pdf_text_list = extract_pdf_text(f"{name}")
print(pdf_text_list)

In [119]:
pdf_text_list = pdf_text_list[0:3]

### Create DataFrame

Create dataframe

In [120]:
import pandas as pd

In [121]:
df = pd.DataFrame(pdf_text_list)
df.columns = ['context']
df.shape

(3, 1)

In [122]:
df.head()

Unnamed: 0,context
0,ANNUAL IMPACT REPORT
1,WELCOME. CONTENTS IMPACT 5 SCOPE 6 COVID-19 10...
2,"IMPACT YOUTH WE SERVE IMPACTDear Friends, We a..."


In [None]:
df.to_csv("contexts.csv")

### Define Function: `get_questions`

In [137]:
def get_questions(context: str) -> str:
    """
    Given a text context, generates a list of questions using OpenAI's GPT-3 API.

    Args:
    - context: A string representing the context for which questions should be generated.

    Returns:
    - A string containing the question generated by the API.
    """

    try:
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write just ONE question based on the text below\n\nText: {context}\n\nQuestion:\n1.",
            # prompt=f"Write one question based on the text: {context}: Question:",
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        # Extract question text from the response
        question_text = response['choices'][0]['text']
        return question_text
    except:
        # Return an empty string if there was an error
        return ""

Run on real data

In [None]:
%%time
df['questions'] = df.context.apply(get_questions)
# df['questions'] = "1." + df.questions
# print(df[['questions']].values[0][0])

In [None]:
df

### Define Function: `get_answers`

In [112]:
def get_answers(row: pd.DataFrame) -> str:
    """
    Given a dataframe row containing context and questions, generates an answer using OpenAI's GPT-3 API.

    Args:
    - row: A pandas dataframe row containing 'context' and 'questions' columns.

    Returns:
    - A string containing the answer generated by the API.
    """

    try:
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write answer (limit to 1 paragraph) based on the text below\n\nText: {row.context}\n\nQuestion:\n{row.questions}\n\nAnswer:",
            temperature=0,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # Extract answer text from the response
        answer_text = response['choices'][0]['text']
        return answer_text
    except Exception as e:
        # Print the error message and return an empty string if there was an error
        print (e)
        return ""

Run on real data

In [132]:
%%time
df['answers']= df.apply(get_answers, axis=1)
# df['answers'] = "1." + df.answers
df = df.dropna().reset_index().drop('index', axis=1)
print(df[['answers']].values[0][0])



The annual impact report provides an overview of the organization's impact over the past year. It highlights the progress the organization has made and the challenges it has faced. The report also includes information about the organization's finances and how it plans to use its resources in the future.
CPU times: user 23.1 ms, sys: 2.88 ms, total: 26 ms
Wall time: 2.05 s


In [None]:
df.head()

## Save DataFrame

In [None]:
df.to_csv(f'results.csv')

## Post Processing

The following process the code for `GPT4`.

Only run it if you are using `GPT4`.

In [None]:
import numpy as np
import re

In [None]:
questions = pd.DataFrame()
answers = pd.DataFrame()

for i in range(2):
    processed_df_quest = pd.DataFrame()
    processed_df_ans = pd.DataFrame()

    s = df.questions[i]
    split_s = re.split(r'\d+\.', s)
    split_s = [i.strip() for i in split_s if i]
    processed_df_quest['questions'] = split_s
    questions = pd.concat([questions, processed_df_quest])

    s = df.answers[i]
    split_s = re.split(r'\d+\.', s)
    split_s = [i.strip() for i in split_s if i]
    processed_df_ans['answers'] = split_s
    answers = pd.concat([answers, processed_df_ans])

In [None]:
questions['role'] = 'user'
questions = questions[['role', 'questions']]
questions.columns = ['role', 'content']
questions.index = np.arange(1, len(questions) + 1)

In [None]:
answers['role'] = 'assistant'
answers = answers[['role', 'answers']]
answers.columns = ['role', 'content']
answers.index = np.arange(1, len(answers) + 1)

In [None]:
final_messages = []
for i in range(len(questions)):
    final_messages.append(questions.iloc[i, :].to_dict())
    final_messages.append(answers.iloc[i, :].to_dict())

In [None]:
final_messages

In [None]:
def convert_to_list_of_dict_single_pair(df: pd.DataFrame) -> List[Dict[str, str]]:
    questions = pd.DataFrame()
    answers = pd.DataFrame()

    for i in range(df.shape[0]):
        processed_df_quest = pd.DataFrame()
        processed_df_ans = pd.DataFrame()

        s = df.questions[i]
        split_s = re.split(r'\d+\.', s)
        split_s = [i.strip() for i in split_s if i]
        processed_df_quest['questions'] = split_s
        questions = pd.concat([questions, processed_df_quest])

        s = df.answers[i]
        split_s = re.split(r'\d+\.', s)
        split_s = [i.strip() for i in split_s if i]
        processed_df_ans['answers'] = split_s
        answers = pd.concat([answers, processed_df_ans])

    questions['role'] = 'user'
    questions = questions[['role', 'questions']]
    questions.columns = ['role', 'content']
    questions.index = np.arange(1, len(questions) + 1)

    answers['role'] = 'assistant'
    answers = answers[['role', 'answers']]
    answers.columns = ['role', 'content']
    answers.index = np.arange(1, len(answers) + 1)

    final_messages = []
    for i in range(len(questions)):
        final_messages.append(questions.iloc[i, :].to_dict())
        final_messages.append(answers.iloc[i, :].to_dict())

    return final_messages

In [None]:
convert_to_list_of_dict_single_pair(df)

## Post Processing

The following code break the question-answer down into rows.

Only do this if you want to fine-tune your own LLM models.

In [29]:
import pandas as pd
import re

In [16]:
df = pd.read_csv("/content/mckinsey-covid-report.csv")

In [30]:
# Define a regular expression pattern for splitting
pattern = re.compile(r'(?=\n[1-9])')

In [31]:
# Create a sample dataframe
data = df

# Initialize empty lists to store the processed questions for each column
context = []
processed_q = []
processed_a = []

# Iterate over the rows of the dataframe
for index, row in df.iterrows():

    # Process the 'questions' column
    questions = re.split(pattern, row['questions'])
    for quest in questions:
        processed_q.append(quest.strip())

    # Process the 'answers' column
    answers = re.split(pattern, row['answers'])
    for ans in answers:
        processed_a.append(ans.strip())

    # Process the 'context' column
    for ans in answers:
        c = row['context']
        context.append(c)

# Create a new dataframe with the processed questions
new_data = {
    'context': context,
    'questions': processed_q,
    'answers': processed_a
}

In [32]:
len(new_data['context']), len(new_data['questions']), len(new_data['answers'])

(54, 48, 54)

In [None]:
i=3
print(new_data['context'][i])
print(new_data['questions'][i])
print(new_data['answers'][i])

In [None]:
new_df = pd.DataFrame(new_data)

print(new_df)