## Download Library

In [None]:
# %pip installs into the environment of the running kernel (a bare `! pip` may hit another one).
# The notebook uses the `openai.OpenAI` client class, which requires the 1.x SDK.
%pip install "openai>=1.0.0"

In [None]:
import os
from getpass import getpass

import openai

In [None]:
# %pip installs into the environment of the running kernel (a bare `! pip` may hit another one).
%pip install datasets

## Scrape Any PDF

We need the `PyMuPDF` package (imported as `fitz`) to read PDF files in Python, so we install it first.

In [None]:
# %pip installs into the environment of the running kernel (a bare `! pip` may hit another one).
%pip install PyMuPDF

### `read_pdf_content` function

In [None]:
import fitz  # PyMuPDF

def read_pdf_content(pdf_path):
    """
    Extract the text of every page in a PDF file.

    Args:
    pdf_path (str): Location of the PDF on disk.

    Returns:
    list of str: One entry per page, in page order, holding that page's text.
    """
    # All pages are read while the document is still open inside the `with` block.
    with fitz.open(pdf_path) as pdf_document:
        return [pdf_page.get_text() for pdf_page in pdf_document]

In [None]:
%%time

# Load the source PDF as a list with one text string per page.
# NOTE(review): the absolute /content/... path looks Colab-specific — adjust it when running elsewhere.
scraped_content = read_pdf_content("/content/all_ysa_doc.pdf")

In [None]:
len(scraped_content)

In [None]:
len(scraped_content[0])

In [None]:
scraped_content[0]

In [None]:
# Merge the per-page strings into one space-separated string.
# NOTE: this rebinds scraped_content from a list to a str, so the cell is not idempotent —
# re-run the PDF-loading cell first before running it again.
scraped_content = ' '.join(scraped_content)

In [None]:
%%time

# Split the joined text into sentence-like fragments on '. ' and strip newlines and
# runs of spaces from each fragment. The text is split once and the fragments are
# iterated directly, instead of re-splitting the whole document for every index.
# NOTE: this rebinds scraped_content from a str to a list of fragments.
scraped_content = [
    fragment.replace('\n', '').replace('   ', '').replace('  ', '')
    for fragment in scraped_content.split('. ')
]

In [None]:
scraped_content[0]

## API Call to Create Data

Here we wrap the `client.chat.completions.create` function from *OpenAI* in a helper function that we will use to create question-answer pairs.

In [None]:
# Read the key from the OPENAI_API_KEY environment variable and fall back to an
# interactive prompt, so the secret is never stored in the notebook source.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)


def call_chatgpt(query: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Send a single-turn question to an OpenAI chat model and return its reply.
    Args:
        query (str): Text of the user's question; it is sent as "Question: <query>.".
        model (str, optional): Name of the chat model to call. Defaults to "gpt-3.5-turbo".
    Returns:
        str: Message content of the first choice in the completion.
    """
    # A fixed system turn followed by the user's question.
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": f"Question: {query}."}

    completion = openai_client.chat.completions.create(
        model=model,
        messages=[system_turn, user_turn],
    )

    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
resp = call_chatgpt("tell me a joke")

In [None]:
resp

### Prompt Engineering

We use prompt engineering to ensure the content `GPT` gives us is in the same format as the `openassist/guanaco` data.

```python
    ### Human:
    ### Assistant:
```

In [None]:
def prompt_engineered_api(text: str):
    """
    Ask the chat model to write a single question about a passage.

    Args:
        text (str): Passage the question should be based on.
    Returns:
        The reply produced by `call_chatgpt` for the question-writing prompt.
    """
    # The instruction explicitly asks for exactly one question about the passage.
    question_request = f"""
        I have the following content: {text}

        Write one question based on the content above. Just write ONE question in a sentence. No more.
    """
    return call_chatgpt(question_request)

In [None]:
scraped_content[0]

In [None]:
resp = prompt_engineered_api(scraped_content[20])
resp

In [None]:
# The question is the reply text before the first "###" marker; the source passage is its answer.
this_sample_question, this_sample_answer = resp.split("###")[0], scraped_content[20]
print(this_sample_question, this_sample_answer, sep="\n")

In [None]:
len(scraped_content)

## Create `DataDict` Structure

In [None]:
from datasets import Dataset, DatasetDict
from tqdm import tqdm

In [None]:
%%time

# Build parallel lists: one generated question per passage, with the passage itself as the answer.
raw_content_questions, raw_content_answers = [], []
for passage in tqdm(scraped_content):
    generated_reply = prompt_engineered_api(passage)
    # Keep only the text before the first "###" marker as the question.
    raw_content_questions.append(generated_reply.split("###")[0])
    raw_content_answers.append(passage)

In [None]:
i = 20
raw_content_questions[i], raw_content_answers[i]

In [None]:
# Column-oriented data: the generated questions and the passages used as their answers.
train_data = dict(
    questions=raw_content_questions,
    answers=raw_content_answers,
)

# Only a training split is built here.
train_dataset = Dataset.from_dict(train_data)

# Wrap the split in a DatasetDict under the 'train' key.
dataset_dict = DatasetDict({'train': train_dataset})

# Display the structure of the dataset
print(dataset_dict)

## Push to HuggingFace Hub

In [None]:
! huggingface-cli login

In [None]:
from huggingface_hub import HfApi, create_repo

In [None]:
# Hugging Face token used to create the repository: read from the HF_TOKEN environment
# variable, otherwise requested interactively, so it is never stored in the notebook source.
auth_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
repo_name = 'youthless-homeless-shelter-web-scrape-dataset-qa-formatted'  # replace with your desired repository name
username = 'eagle0504' # replace with your Hugging Face username

api = HfApi()
# repo_type="dataset" is required: create_repo makes a *model* repository by default, while
# the data is pushed as a dataset. exist_ok=True lets the cell be re-run without failing
# when the repository already exists.
# Set private=True if you want it to be a private dataset.
create_repo(repo_name, token=auth_token, private=False, repo_type="dataset", exist_ok=True)

In [None]:
# Full Hub repository id in "<username>/<repo_name>" form.
app_id = "/".join([username, repo_name])
print(app_id)

In [None]:
%%time

# Upload the DatasetDict to the Hub repository identified by app_id.
# NOTE(review): no token is passed here, so this presumably relies on the earlier
# `huggingface-cli login` — confirm.
dataset_dict.push_to_hub(app_id)