## Download Library

In [None]:
! pip install openai

In [None]:
import openai

In [None]:
! pip install datasets

## Scrap `Any` PDF

We need the `PyMuPDF` package in python. So, we install it first.

In [None]:
! pip install PyMuPDF

### `read_pdf_content` function

In [None]:
import fitz  # PyMuPDF

def read_pdf_content(pdf_path):
    """
    Reads a PDF and returns its content as a list of strings.

    Args:
    pdf_path (str): The file path to the PDF.

    Returns:
    list of str: A list where each element is the text content of a PDF page.
    """
    content_list = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            content_list.append(page.get_text())

    return content_list

In [None]:
%%time

scraped_content = read_pdf_content("/content/all_ysa_doc.pdf")

In [None]:
len(scraped_content)

In [None]:
len(scraped_content[0])

In [None]:
scraped_content[0]

In [None]:
scraped_content = ' '.join(scraped_content)

In [None]:
%%time

scraped_content = [scraped_content.split('. ')[i].replace('\n', '').replace('   ', '').replace('  ', '') for i in range(len(scraped_content.split('. ')))]

In [None]:
scraped_content[0]

## API Call to Create Data

Here we use the `client.chat.completions.create` function from *OpenAI* as a helper function to assist us to create question answer.

In [None]:
OPENAI_API_KEY = "sk-xxx"
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)


def call_chatgpt(query: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Generates a response to a query using the specified language model.
    Args:
        query (str): The user's query that needs to be processed.
        model (str, optional): The language model to be used. Defaults to "gpt-3.5-turbo".
    Returns:
        str: The generated response to the query.
    """

    # Prepare the conversation context with system and user messages.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Question: {query}."},
    ]

    # Use the OpenAI client to generate a response based on the model and the conversation context.
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )

    # Extract the content of the response from the first choice.
    content: str = response.choices[0].message.content

    # Return the generated content.
    return content

In [None]:
resp = call_chatgpt("tell me a joke")

In [None]:
resp

### Prompt Engineer

We use prompt engineer to ensure the content `GPT` gave us is in the same content as the `openassist/guanaco` data.

```python
    ### Human:
    ### Assistant:
```

In [None]:
def prompt_engineered_api(text: str):

    prompt = f"""
        I have the following content: {text}

        Write one question based on the content above. Just write ONE question in a sentence. No more.
    """

    resp = call_chatgpt(prompt)

    return resp

In [None]:
scraped_content[0]

In [None]:
resp = prompt_engineered_api(scraped_content[20])
resp

In [None]:
this_sample_question = resp.split("###")[0]
this_sample_answer = scraped_content[20]
print(this_sample_question)
print(this_sample_answer)

In [None]:
len(scraped_content)

## Create `DataDict` Structure

In [None]:
from datasets import Dataset, DatasetDict
from tqdm import tqdm

In [None]:
%%time

raw_content_questions = []
raw_content_answers = []
for i in tqdm(range(len(scraped_content))):
    resp = prompt_engineered_api(scraped_content[i])
    this_sample_question = resp.split("###")[0]
    this_sample_answer = scraped_content[i]
    raw_content_questions.append(this_sample_question)
    raw_content_answers.append(this_sample_answer)

In [None]:
i = 20
raw_content_questions[i], raw_content_answers[i]

In [None]:
# Example data - replace these with your actual data
train_data = {
    'questions': raw_content_questions,
    'answers': raw_content_answers
}

# Create Dataset objects for training and testing
train_dataset = Dataset.from_dict(train_data)

# Combine them into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
})

# Display the structure of the dataset
print(dataset_dict)

## Push to HuggingFace Hub

In [None]:
! huggingface-cli login

In [None]:
from huggingface_hub import HfApi, create_repo

In [None]:
# Replace 'your_token_here' with your actual Hugging Face Auth token
# Replace 'youthless-homeless-shelter-web-scrape-dataset' with your desired repository name
auth_token = 'xxx'
repo_name = 'youthless-homeless-shelter-web-scrape-dataset-qa-formatted'
username = 'eagle0504' # replace with your Hugging Face username

api = HfApi()
create_repo(repo_name, token=auth_token, private=False) # Set private=True if you want it to be a private dataset

In [None]:
app_id = f"{username}/{repo_name}"
print(app_id)

In [None]:
%%time

dataset_dict.push_to_hub(app_id)

## Pull Data from HuggingFace

If you are already have a `DataDict` on *HuggingFace*, you can start here and use the following code to load the data in directly to make some queries.

You can use this code directly in a `streamlit` application.

In [None]:
! pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.22-py3-none-any.whl (509 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.109.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.27.1-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.4.1-py2.

In [None]:
import chromadb
from datasets import load_dataset
import numpy as np
import pandas as pd
import string

In [None]:
dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
client = chromadb.Client()
random_number = np.random.randint(low=1e9, high=1e10)
random_string = ''.join(np.random.choice(list(string.ascii_uppercase + string.digits), size=10))
combined_string = f"{random_number}{random_string}"
collection = client.create_collection(combined_string)

# Embed and store the first N supports for this demo
L = len(dataset["train"]['questions'])
collection.add(
    ids=[str(i) for i in range(0, L)],  # IDs are just strings
    documents=dataset["train"]['questions'], # Enter questions here
    metadatas=[{"type": "support"} for _ in range(0, L)],
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:04<00:00, 18.6MiB/s]


In [127]:
question = "What is the main focus of the Youth Spirit Artworks program in Santa Babra, New York?"

In [128]:
results = collection.query(
    query_texts=question,
    n_results=5
)

In [129]:
idx = results["ids"][0]
idx = [int(i) for i in idx]
idx

[0, 63, 108, 118, 84]

In [130]:
ref = pd.DataFrame(
    {
        "idx": idx,
        "question": [dataset["train"]['questions'][i] for i in idx],
        "answers": [dataset["train"]['answers'][i] for i in idx],
        "distances": results["distances"][0]
    }
)
ref

Unnamed: 0,idx,question,answers,distances
0,0,What is the main focus of the Youth Spirit Art...,About YSAYouth Spirit Artworks (YSA) is a prog...,0.314319
1,63,What is the goal of Youth Spirit Artworks' com...,In response to the dire need for youth housing...,0.494221
2,108,What opportunities are available for youth in ...,The youth in the Village are fullyintegrated i...,0.54613
3,118,What services does Youth Spirit Artworks provi...,Youth Spirit Artworks compliments its training...,0.56125
4,84,What is the mission of Youth Spirit Artworks (...,By providing underserved homeless andlow-incom...,0.561282


In [None]:
special_threshold = 0.3
filtered_ref = ref[ref["distances"] < special_threshold]
filtered_ref

Unnamed: 0,idx,question,answers,distances
0,1,What challenges do older homeless and low-inco...,We focus on BIPOC and LGBTQIA+communities who ...,0.0
