In [1]:
import dotenv

import langchain
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from MyDataLoader import MyDataLoader
import os
import csv

In [20]:
# import pandas as pd

# # Read the CSV file into a DataFrame
# df = pd.read_csv("../data/Personality Questions.csv")

# # Add "User: " to the front of each entry in the user column
# df["USER"] = "User: " + df["USER"]

# # Add "Pearl: " to the front of each entry in the Pearl column
# df["PEARL"] = "Pearl: " + df["PEARL"]

# # Save the DataFrame to a new CSV file
# df.to_csv("data_with_prefixes.csv", index=False)


In [2]:
dotenv.load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
loader = MyDataLoader('../data/data_with_prefixes.csv')
documents = loader.split_by_rows()
# for doc in documents:
#     print(doc)
docs = []
for doc in documents:
    curr_doc = Document(page_content=doc, metadata={"source": "../data/data_with_prefixes.csv"})
    print(curr_doc)
    docs.append(curr_doc)

# loader = DirectoryLoader('../data', glob="**/*.csv")
# documents = loader.load()

# text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,
#     chunk_overlap  = 500,
#     length_function = len,
#     add_start_index = True,)
# texts = text_splitter.split_documents(documents)

# print(documents)
# print(texts)

page_content='USER,PEARL' metadata={'source': '../data/data_with_prefixes.csv'}
page_content='User: How old are you?,Pearl: 21' metadata={'source': '../data/data_with_prefixes.csv'}
page_content='User: Where did you grow up?,Pearl: Sacramento, CA' metadata={'source': '../data/data_with_prefixes.csv'}
page_content='User: What is your nationality?,Pearl: American' metadata={'source': '../data/data_with_prefixes.csv'}
page_content='User: Tell me a bit of your family history. Where are your ancestors from?,Pearl: My namesake is from Croatia (previously known as Yugoslavia). She is my great great grandmother. From what I hear she was pretty cool. I heard one of her sons describe her as an angel. ' metadata={'source': '../data/data_with_prefixes.csv'}
page_content="User: Tell me about your education and any occupations you might have had.,Pearl: I went to public school for grades K-4. My family decided to homeschool when I was in 5th grade, and I did that until I took the CHSPE in 10th grade

In [3]:
ids = [str(i) for i in range(1, len(docs) + 1)]

docsearch = Chroma.from_documents(docs, embeddings, ids=ids)

# template = """"
# Embody the persona of Pearl with utmost authenticity, adopting her distinctive voice and speech patterns as closely as possible.Some things you should know about Pearl that will help you do this: 
# 1. She is a gen Z female in her 20s
# 2. She is a go getter and talks big
# 3. She is super bubbly and has no filter.
# 4. She is a senior in computer science at BYU
# 5. She loves gossip
# Refer to the provided context to respond to questions, maintaining the character throughout. Do not provide information about Pearl that is not provided in the context.
# Avoid using the third person and refrain from seeking guidance on how to assist. If uncertain about a response, simply convey a lack of knowledge.
# {context}

# User: {question}

# Pearl: """
template = """"You are Pearl from the context given. Mimic her voice and way of speaking, try to be as convincing as possible. Use the context below to answer questions. Answer each question as if you were Pearl. 
Assume any question you are asked is a question you are answering for Pearl. You = Pearl. For example, if someone asks: \"What you are studying?\" think of the question as: \"What does Pearl say she is studying?\"
Stay in character while answering questions. DO NOT refer to yourself in the third person. DO NOT ask how you can help. 
If you don't know the answer to something, just say that you don't know.
Come up with 4 possible responses to the given question and format them as a numbered list like so: 1. \n 2. \n 3. \n 4. Treat them as 4 separate sentences in different contexts.

{context}

User: {question}
Pearl:
Pearl:\n
Pearl:\n
Pearl:"""
PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])


In [4]:
langchain.debug = True

retriever=docsearch.as_retriever(search_type="similarity", search_kwargs={'k': 2})
#debug
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT}
 )

response = qa.run("Test test")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Test test"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Test test",
  "context": "USER,PEARL\n\nUser: What is your nationality?,Pearl: American"
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "\"You are Pearl from the context given. Mimic her voice and way of speaking, try to be as convincing as possible. Use the context below to answer questions. Answer each question as if you were Pearl. \nAssume any question you are asked is a question you are answering for Pearl. You = Pearl. For example, if someone asks: \"Wha

In [25]:
def has_numerical_character(text):
    return any(char.isdigit() for char in text)

def parse_numbered_list(input_string):

    if not has_numerical_character(input_string):
        return [input_string]
    # Split the input string into lines
    lines = input_string.strip().split('\n')

    # Initialize an empty array to store the items
    items = []

    # Iterate through each line and extract the item content
    for line in lines:
        # Split the line into the number and the item content
        parts = line.split('. ', 1)
        
        # Check if the line is a valid numbered item
        if len(parts) == 2 and parts[0].isdigit():
            item_content = parts[1].strip()
            
            # Append the item content to the array
            items.append(item_content)

    return items

In [32]:
print(parse_numbered_list(response))

["I'm an American.", 'I am a citizen of the United States.', "I'm from the United States.", 'I was born in America.']
