In [1]:
!pip install --upgrade openai



In [2]:
import os

# Expose the OpenAI key to later cells through the process environment.
# The value below is only a placeholder: substitute your own key locally and
# never commit a real key with the notebook.
os.environ.update(OPENAI_API_KEY='your_api_key_here')

### Preparing Data

In [2]:
import json

#### Reddit Data

In [2]:
import json
import openai
from tqdm.auto import tqdm

In [3]:
# Read the Reddit posts file and parse its JSON content into memory
with open('/content/reddit_posts.json') as posts_file:
    posts_data = json.loads(posts_file.read())

In [28]:
def _parse_qa_output(output):
    """Split model text shaped like "Question: ... Answer: ..." into its parts.

    Returns a (question, answer) tuple of stripped strings, or None when either
    marker is missing from the text.
    """
    if "Question:" not in output or "Answer:" not in output:
        return None
    # Partition on the FIRST "Answer:" only, so an answer that itself contains
    # the text "Answer:" is kept whole instead of being cut off at that point.
    question_part, _, answer_part = output.partition("Answer:")
    return question_part.replace("Question:", "").strip(), answer_part.strip()


def generate_qa_pairs(text):
    """Ask the completion model for one question/answer pair about `text`.

    Args:
        text: Free-form description (a post body) to build the pair from.

    Returns:
        A (question, answer) tuple. The question ends with " ->" and the answer
        ends with ".\n". If the request fails or the reply does not contain both
        the "Question:" and "Answer:" markers, a fixed placeholder pair is
        returned instead; errors are printed, never raised.
    """
    failure = ("Failed to generate a valid question ->", "No answer could be generated.\n")
    # Define the prompt to generate both question and answer
    prompt_text = f"Given the following description about pregnancy experiences, create a relevant question and provide a detailed answer based on the description:\n\n{text}\n\nGenerate the question and answer:"
    try:
        # openai>=1.0 removed `openai.Completion`; the module-level client
        # exposes `openai.completions.create`, which takes `model=` (not `engine=`).
        response = openai.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt_text,
            max_tokens=700,
            temperature=0.5,
        )
        parsed = _parse_qa_output(response.choices[0].text.strip())
    except Exception as e:
        # Best-effort by design: one failed request must not abort a long batch run
        print(f"An error occurred: {e}")
        return failure

    if parsed is None:
        return failure
    question, answer = parsed
    return question + " ->", answer + ".\n"

In [29]:
# Checkpoint cadence: the output file is updated after this many processed posts
batch_size = 20
# Accumulator for the generated prompt/completion records
training_data = list()

In [30]:
# Generate one Q&A record per post and checkpoint the results to disk.
# At every checkpoint the file is rewritten ('w') with ALL records collected so
# far, so training_data.json is always a single valid JSON array. Appending one
# dump per batch produced back-to-back arrays ("[...][...]", which is not valid
# JSON) and duplicated records whenever the cell was run again.
training_data = []  # start clean so re-running this cell does not duplicate records
for i, post in enumerate(tqdm(posts_data, desc="Generating Q&A pairs")):
    question, answer = generate_qa_pairs(post['body'])
    training_data.append({
        "prompt": question,
        "completion": answer
    })

    # Checkpoint every `batch_size` posts and once more after the final post
    if (i + 1) % batch_size == 0 or (i + 1) == len(posts_data):
        with open('training_data.json', 'w') as f:
            json.dump(training_data, f, indent=4)

Generating Q&A pairs:   0%|          | 0/982 [00:00<?, ?it/s]

KeyboardInterrupt: 

Stopped generating data because we now have enough data for fine-tuning.

In [32]:
import json

def correct_json_file(input_file, output_file):
    """Merge back-to-back JSON values in `input_file` into one valid JSON document.

    The input may hold several JSON arrays written one after another
    (e.g. "[...][...]"). Each top-level value is decoded in turn and the items
    of all arrays are combined into a single list, which is written to
    `output_file` with 4-space indentation. A file that already holds exactly
    one JSON value is rewritten unchanged.

    Args:
        input_file: Path of the file to repair.
        output_file: Path the repaired JSON is written to.

    Returns:
        None. Success and failure are both reported with print(); errors are
        caught and printed rather than raised.
    """
    try:
        # Read the file content as a single string
        with open(input_file, 'r') as file:
            file_data = file.read()

        # Decode value by value instead of text-replacing "][": a plain replace
        # also rewrites "][" inside string values, and breaks on whitespace
        # between arrays or on an empty "[]" chunk.
        decoder = json.JSONDecoder()
        chunks = []
        position = 0
        total = len(file_data)
        while True:
            # raw_decode does not skip leading whitespace on its own
            while position < total and file_data[position].isspace():
                position += 1
            if position >= total:
                break
            value, position = decoder.raw_decode(file_data, position)
            chunks.append(value)

        if not chunks:
            raise ValueError("no JSON content found in " + str(input_file))

        if len(chunks) == 1:
            # Already a single document: keep it exactly as it is
            data = chunks[0]
        else:
            data = []
            for value in chunks:
                if isinstance(value, list):
                    data.extend(value)
                else:
                    data.append(value)

        # Write the corrected JSON back to a new file
        with open(output_file, 'w') as file:
            json.dump(data, file, indent=4)

        print("The JSON data has been corrected and saved to:", output_file)

    except Exception as e:
        print("An error occurred:", e)

# Source file to repair and destination for the repaired copy
original_file, corrected_file = 'training_data.json', 'training_data_corrected.json'

# Repair the source file and write the result to the destination
correct_json_file(input_file=original_file, output_file=corrected_file)

The JSON data has been corrected and saved to: training_data_corrected.json


In [16]:
import json

def _drop_suffix(text, suffix):
    """Return `text` without one trailing `suffix`; unchanged if it is absent."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def convert_to_chat_format(input_file, output_file):
    """Convert prompt/completion records into chat-format JSONL.

    Reads a JSON array of {"prompt", "completion"} objects from `input_file`
    and writes one JSON object per line to `output_file`, each holding a
    "messages" list with a fixed system message, the prompt as the user
    message and the completion as the assistant message.

    Only the trailing " ->" marker of the prompt and the trailing ".\n" marker
    of the completion are removed. (`str.strip('.\n')` treats its argument as a
    set of characters and would also eat leading dots and the answer's own
    final period; `replace(' ->', '')` would remove arrows inside the question.)

    Args:
        input_file: Path of the JSON file with prompt/completion records.
        output_file: Path of the JSONL file to create.

    Returns:
        None. Success and failure are both reported with print(); errors are
        caught and printed rather than raised.
    """
    try:
        # Load the existing data
        with open(input_file, 'r') as file:
            data = json.load(file)

        with open(output_file, 'w') as outfile:
            for item in data:
                user_content = _drop_suffix(item["prompt"], ' ->')
                assistant_content = _drop_suffix(item["completion"], '.\n')
                chat_entry = {
                    "messages": [
                        {"role": "system", "content": "This chatbot is designed to provide information on pregnancy-related topics."},
                        {"role": "user", "content": user_content},
                        {"role": "assistant", "content": assistant_content}
                    ]
                }
                # One JSON object per line (JSONL)
                json.dump(chat_entry, outfile)
                outfile.write('\n')

        print("Data has been successfully converted and saved to:", output_file)

    except Exception as e:
        print(f"An error occurred: {e}")

In [24]:
# Source JSON file and destination JSONL file for the conversion
input_file, output_file = 'training_data_corrected.json', 'chat_formatted_data.jsonl'

# Run the conversion
convert_to_chat_format(input_file=input_file, output_file=output_file)

Data has been successfully converted and saved to: chat_formatted_data.jsonl


### Finetuning

In [26]:
# Path of the chat-format JSONL file that is uploaded for fine-tuning
file_name = "/content/chat_formatted_data.jsonl"

In [22]:
from openai import OpenAI
# No key is passed explicitly; presumably the SDK picks up the OPENAI_API_KEY
# environment variable set at the top of the notebook — confirm.
client = OpenAI()

In [27]:
# Upload the training file for fine-tuning. The file is opened in a `with`
# block so the handle is closed as soon as the upload call returns; a bare
# open() passed inline is never closed explicitly.
with open(file_name, "rb") as training_file:
    upload_response = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )
upload_response

FileObject(id='file-Tm6mmGrsyc5CfTxWGGjX6Qj6', bytes=405302, created_at=1717405210, filename='chat_formatted_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [28]:
# Save file name
file_id = upload_response.id
file_id

'file-Tm6mmGrsyc5CfTxWGGjX6Qj6'

In [29]:
# Create a fine-tuning job that trains on the uploaded file, with gpt-3.5-turbo as the base model
fine_tune_response = client.fine_tuning.jobs.create(
  training_file=file_id,
  model="gpt-3.5-turbo"
)

In [30]:
# Display the job object returned by the create call
fine_tune_response

FineTuningJob(id='ftjob-esS4nQiy7SNpmXbMSLtLT8km', created_at=1717405219, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-ZJPiXBQoLyWrdv0cz81slhOC', result_files=[], seed=252355966, status='validating_files', trained_tokens=None, training_file='file-Tm6mmGrsyc5CfTxWGGjX6Qj6', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [41]:
# List fine-tuning jobs, limited to 10 results
client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-esS4nQiy7SNpmXbMSLtLT8km', created_at=1717405219, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::9VydpLwM', finished_at=1717408300, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-ZJPiXBQoLyWrdv0cz81slhOC', result_files=['file-3d3T89WYDBKrl1hSyJc5mLBa'], seed=252355966, status='succeeded', trained_tokens=217614, training_file='file-Tm6mmGrsyc5CfTxWGGjX6Qj6', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None), FineTuningJob(id='ftjob-rWSiOX0pEUdlaKcF90hyUe8t', created_at=1717404689, error=Error(code='invalid_training_file', message='The job failed due to an invalid training file. Invalid file format. Input file file-hQgPPxv2DLXt1SfyedE1l2E7 is in the prompt-completion format, but the specified model gpt-3.5-turbo-0125 is a cha

In [43]:
# Retrieve the state of a fine-tune
# NOTE(review): the job ID is hardcoded; it is the same ID shown in the output of
# the job-creation cell, so `fine_tune_response.id` would avoid the copy/paste.
client.fine_tuning.jobs.retrieve("ftjob-esS4nQiy7SNpmXbMSLtLT8km")

FineTuningJob(id='ftjob-esS4nQiy7SNpmXbMSLtLT8km', created_at=1717405219, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::9VydpLwM', finished_at=1717408300, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-ZJPiXBQoLyWrdv0cz81slhOC', result_files=['file-3d3T89WYDBKrl1hSyJc5mLBa'], seed=252355966, status='succeeded', trained_tokens=217614, training_file='file-Tm6mmGrsyc5CfTxWGGjX6Qj6', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [39]:
from openai import OpenAI

client = OpenAI()

# Query the fine-tuned model directly: a system message that sets the chatbot's
# role, followed by a single user question.
completion = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:personal::9VydpLwM",
    messages=[
        dict(role="system", content="This chatbot is designed to provide information on pregnancy-related topics."),
        dict(role="user", content="How has the poster's pregnancy experience been impacted by their living situation and budget constraints?"),
    ],
)
# Show the message of the first returned choice
print(completion.choices[0].message)


ChatCompletionMessage(content="The poster's pregnancy experience has been overshadowed by challenges such as not being able to afford a phone charger, sharing a vehicle with their spouse, losing a pair of slippers, utilizing a secondhand breast pump, needing to gain access to their grandparents' home to use a microwave for thawing out breast milk, and living in a condemned mobile home. These challenges have made the pregnancy journey very difficult and have impacted the family's overall well-being", role='assistant', function_call=None, tool_calls=None)


### Building RAG

In [44]:
!pip install pypdf langchain langchain-community faiss-cpu tiktoken

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [

In [48]:
!pip install langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.1.8-py3-none-any.whl (38 kB)
Installing collected packages: langchain-openai
Successfully installed langchain-openai-0.1.8


In [51]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

##### Building VectorDB (Knowledge Base)

In [46]:
# PDF loader for the book that serves as the knowledge base
loader = PyPDFLoader("/content/The Pregnancy Encyclopedia ( PDFDrive ).pdf")

In [47]:
# Load the PDF into documents, then split those documents into chunks
documents = loader.load()
# chunk_size=1000 with no overlap between consecutive chunks
# NOTE(review): presumably chunk_size is measured in characters — confirm against the splitter docs
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [50]:
# Embedding model for the chunks; the API key is read from the OPENAI_API_KEY environment variable
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large", api_key=os.getenv("OPENAI_API_KEY"))

In [52]:
# Build the FAISS vector store from the chunks using the embedding model
db = FAISS.from_documents(docs, embeddings_model)

In [53]:
# Save the vector store locally under the given name
# NOTE(review): the name is spelled "knowlegde_base"; any code that loads it later must use the same spelling
db.save_local("knowlegde_base")

In [54]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

In [56]:
# Retriever over the vector store, using the "mmr" search type
retriever = db.as_retriever(search_type="mmr")
# Prompt template with two placeholders: {question} and {context}
RAG_PROMPT = """\
You are chatbot, designed to provide information on pregnancy-related topics.
Use the following context to answer the user's query. Also, Act like a real human.
Analyze the question and related context correctly to answer question.
Question:
{question}

Context:
{context}
"""
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
# Generation uses the fine-tuned model (the "ft:" model ID)
openai_chat_model = ChatOpenAI(model="ft:gpt-3.5-turbo-0125:personal::9VydpLwM")
# Chain stages:
#   1. take "question" from the input, pipe it into the retriever to produce
#      "context", and pass "question" through unchanged;
#   2. re-assign "context" from itself — NOTE(review): looks like a no-op, confirm;
#   3. return a dict with the model "response" (prompt piped into the model)
#      alongside the retrieved "context".
retrieval_augmented_generation_chain = (
    {"context": itemgetter("question")
      | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

In [57]:
# Sample question to send through the RAG chain
user_input = "How has the poster's pregnancy experience been impacted by their living situation and budget constraints?"

In [58]:
# Run the chain asynchronously with the question; the result is a dict with "response" and "context"
answer = await retrieval_augmented_generation_chain.ainvoke({"question": user_input})

In [60]:
# Display the content of the model's response
answer['response'].content

'The poster\'s pregnancy experience has been impacted by their living situation and budget constraints. As mentioned in the text, the poster is a single parent and is worried about being able to afford having a baby. This is a common concern for new moms, especially if they have limited financial resources and no partner to share the expenses. The poster is advised to get their finances in order and start saving for their child\'s future. They are also encouraged to make a clear financial plan and discuss household finances with their partner, if applicable. Additionally, it is mentioned that having a baby will impact the poster\'s income, especially if they have to consider childcare costs. It is suggested to look for ways to save money and to set up a "baby" account. Overall, the poster\'s pregnancy experience is being affected by the financial responsibilities and challenges that come with being a single parent'