In [None]:
# Uncomment to install the relevant packages below
#!pip3 install bs4
#!pip3 install openai
#!pip3 install python-dotenv
import openai
import json
import numpy as np
from bs4 import BeautifulSoup
import json
import csv 
import time
import os 
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file


In [None]:
# Relevant functions 

# Hand the OpenAI credential to the SDK; it is read from the process environment
# (None when OPENAI_API_KEY is not set).
openai.api_key = os.environ.get('OPENAI_API_KEY')

def ask_davinci(question):
    """Send ``question`` as a prompt to the ``text-davinci-003`` completion model.

    Parameters
    ----------
    question
        Prompt text, forwarded unchanged as ``prompt``.

    Returns
    -------
    The ``text`` of the first returned choice, with surrounding whitespace stripped.
    """
    request_options = {
        "model": "text-davinci-003",
        "prompt": question,
        "temperature": 0.1,
        "max_tokens": 2000,
        "n": 1,
    }
    result = openai.Completion.create(**request_options)
    first_choice = result.choices[0]
    return first_choice.text.strip()


def ask_gpt3(question):
    """Ask ``gpt-3.5-turbo`` a single-turn question through the chat completion API.

    ``question`` is sent as the only message, with role ``user``. The content of the
    first choice's message is returned exactly as received (it is not stripped).
    """
    conversation = [{"role": "user", "content": question}]
    reply = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=conversation)
    first_message = reply.choices[0].message
    return first_message.content

def ask_character(post_list, question, model="gpt-4"):
    """Answer ``question`` in the voice of the writer YFJ, using ``post_list`` as context.

    Parameters
    ----------
    post_list
        Iterable of dicts, each providing ``'title'`` and ``'content'``. Every post
        is sent as its own ``user`` message, in the order given.
    question
        The question to answer; it is appended after the ``=====`` delimiter in the
        final instruction message.
    model
        Chat model name passed to ``openai.ChatCompletion.create``. Defaults to
        ``"gpt-4"``, so existing two-argument calls behave as before; pass e.g.
        ``"gpt-3.5-turbo"`` to switch models without editing this function.

    Returns
    -------
    The content of the first choice's message, unmodified.
    """
    prompt = [
        {
            "role": "system",
            "content": "You are an assistant answering questions based on what the writer YFJ wrote",
        },
        {
            "role": "user",
            "content": "Here are some articles the writer YFJ wrote:",
        },
    ]

    # One message per article: "<title>: <content> " (the trailing space is part of the prompt).
    prompt.extend(
        {"role": "user", "content": f"{post['title']}: {post['content']} "}
        for post in post_list
    )

    # The instruction goes last so it follows all of the reference material.
    prompt.append({
        "role": "user",
        "content": f"The writer YFJ was asked the question after the delimiter =====. Please answer the question as if you are the writer YFJ. Start the answer with the word I. Say I don't know or I don't have an opinion if there's no relevant writing from the writer.  ===== {question}"
    })

    completion = openai.ChatCompletion.create(model=model, messages=prompt)
    return completion.choices[0].message.content

def get_embedding(input):
    """Return the ``text-embedding-ada-002`` embedding for ``input``.

    ``input`` is forwarded unchanged to ``openai.Embedding.create``; the
    ``embedding`` of the first entry in the response's ``data`` is returned.
    (The parameter name shadows the ``input`` builtin inside this function; it is
    kept because it is part of the function's signature.)
    """
    response = openai.Embedding.create(input=input, model="text-embedding-ada-002")
    first_entry = response.data[0]
    return first_entry.embedding



def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length vectors.

    Parameters
    ----------
    vec1, vec2
        Array-likes accepted by ``np.dot`` / ``np.linalg.norm``.

    Returns
    -------
    ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either vector has
    zero L2 norm.
    """
    # Product of the L2 (Euclidean) norms.
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # A zero vector has no direction. Dividing would yield NaN (0/0), and NaN is
    # sorted to the end by np.argsort, i.e. it would rank as the *best* match when
    # the scores are ranked. Report "no similarity" instead.
    if norm_product == 0:
        return 0.0

    return np.dot(vec1, vec2) / norm_product


In [None]:


# Generate the content database from substack exports
#
# Reads <post_dir>/posts.csv, keeps the rows whose `is_published` column is 'true',
# extracts the text of every <p> element from <post_dir>/posts/<post_id>.html and
# writes the resulting list of {title, post_id, subtitle, content} dicts to greatreset.json.

post_dir = './greatreset2022_substack' # change this to your dir
post_list_csv = f'{post_dir}/posts.csv'

parsed_data = []

# newline='' is what the csv module asks for so that newlines embedded inside quoted
# fields are parsed correctly. The encoding is explicit so decoding does not depend
# on the platform's default locale.
# NOTE(review): assumes the export files are UTF-8 -- confirm against a real export.
with open(post_list_csv, 'r', encoding='utf-8', newline='') as file:
    for row in csv.DictReader(file):
        # Skip drafts / unpublished posts
        if row['is_published'] != 'true':
            continue

        post_file = f'{post_dir}/posts/{row["post_id"]}.html'
        with open(post_file, 'r', encoding='utf-8') as f2:
            # Parse the exported HTML of this post
            soup = BeautifulSoup(f2.read(), 'html.parser')

        parsed_data.append({
            "title": row['title'],
            "post_id": row['post_id'],
            "subtitle": row['subtitle'],
            # Body text: the text of each paragraph, joined with " \n"
            "content": " \n".join([p.text for p in soup.find_all('p')]),
        })

# Persist the post database as JSON for the later cells
with open('greatreset.json', 'w') as outfile:
    json.dump(parsed_data, outfile)


In [None]:


# Generating Q&A from an existing post
#
# For every post in greatreset.json, ask the chat model for a JSON array of
# {"question", "answer"} dicts and collect the parsed result per post_id.
articles = []

with open('greatreset.json', 'r') as infile:
    # Load the post database
    articles = json.load(infile)


qna_list = []
max_attempts = 3

for post in articles:
    post_id = post['post_id']
    content = f'{post["title"]} {post["subtitle"]} {post["content"]}'
    prompt = f''' generate questions and answers based on the following article delimited by. =====. 
        Make sure the questions and answers are semantic with the writer's style intact.  
        Output the list of questions and answers in the json format where the json represents an array 
        where each element in the array is a dictionary with keys "question" and "answer".  
        ===== {content} ====='''
    print(post["title"])

    # Reset per post: without this, a post whose attempts all fail would either
    # raise NameError (first post) or silently reuse the previous post's Q&A.
    answer_json = None
    for attempt in range(max_attempts):
        try:
            # Both the API call and the JSON parse can fail; either one triggers a retry
            answer = ask_gpt3(prompt)
            answer_json = json.loads(answer)
            break
        except Exception as e:
            # Best effort: report the failure and try again
            print(f'Attempt {attempt + 1} failed for {post_id} with error: {e}')
            if attempt + 1 < max_attempts:
                # Pause before retrying; no point waiting after the final attempt
                time.sleep(5)
    else:
        # Loop finished without `break`: every attempt failed, so record nothing for this post
        print(f'Skipping {post_id}: no valid JSON after {max_attempts} attempts')
        continue

    qna_list.append({
        "post_id": post_id,
        "qa_list": answer_json
    })

with open('greatreset-processed.json', 'w') as outfile:
    # Write the collected Q&A data to the file
    json.dump(qna_list, outfile, indent=4)



In [None]:
# Sanitize the gpt generated output. Ignore if the output is malformed
#
# Keeps entries whose `qa_list` is a list. A dict with exactly one key is treated
# as a wrapper around the real list and unwrapped. Anything else is reported and dropped.
with open('greatreset-processed.json', 'r') as infile:
    # Load the raw Q&A data
    qna_list = json.load(infile)

sanitized_list = []
for item in qna_list:
    qa_list = item['qa_list']

    # Unwrap a single-key object to the value it wraps
    if isinstance(qa_list, dict) and len(qa_list) == 1:
        qa_list = next(iter(qa_list.values()))

    # Only a list is usable downstream (strings, numbers, null and multi-key
    # objects are all malformed for our purposes)
    if isinstance(qa_list, list):
        item['qa_list'] = qa_list
        sanitized_list.append(item)
    else:
        print(f"""{item['post_id']} ignored""")

with open('greatreset-sanitized.json', 'w') as outfile:
    # Write the sanitized data to the file. json.dump returns None, so its result
    # is deliberately not assigned to anything.
    json.dump(sanitized_list, outfile, indent=4)


In [None]:
# Generate embedding list
#
# Builds a list of [embedding, post_id] pairs: first one per Q&A pair, then one per
# full article, and stores it in greatreset-embeddings.json.
with open('greatreset-sanitized.json', 'r') as infile:
    # Read the sanitized Q&A data
    qna_list = json.load(infile)
with open('greatreset.json', 'r') as infile2:
    # Read the post database
    post_list = json.load(infile2)

embedding_tuples = []

# Pass 1: embed each Q&A pair (serialized as JSON text), tagged with its source post
for entry in qna_list:
    post_id, pairs = entry['post_id'], entry['qa_list']
    print(f'''generating embedding for {post_id} from qa ''')
    for pair in pairs:
        embedding_tuples.append([get_embedding(json.dumps(pair)), post_id])

# Pass 2: embed each article as "<title> <subtitle> <content>"
for article in post_list:
    post_id = article['post_id']
    print(f'''generating embedding for {post_id} from article ''')
    full_text = f'{article["title"]} {article["subtitle"]} {article["content"]}'
    embedding_tuples.append([get_embedding(full_text), post_id])

with open('greatreset-embeddings.json', 'w') as outfile:
    # Write the embedding list to the file
    json.dump(embedding_tuples, outfile, indent=4)


    
        


In [None]:
# Start here if you don't want to regenerate the embeddings
# Load the embedding data and post database into memory
with open('greatreset-embeddings.json', 'r') as infile:
    # Read the [embedding, post_id] pairs
    embedding_tuples = json.load(infile)
print(len(embedding_tuples))

with open('greatreset.json', 'r') as infile2:
    # Read the post database
    post_list = json.load(infile2)
print(len(post_list))

# Index the posts by post_id so a retrieved id can be mapped back to its full record
post2content = {entry['post_id']: entry for entry in post_list}


In [None]:
#  ask a question, get an answer
#
# Embeds the question, ranks every stored embedding by cosine similarity, collects
# the posts behind the N best matches and asks the model to answer from those posts.

# Example questions:
#question = "What do you think of Tesla's stock price?" 
#question = "What do you think of Meta's Metaverse?"
#question = "Do you think the Fed does a good job?"
#question = "What are your stock picks for 2023?"
#question = "Do you think there will be another great depression?"
#question = "Will there be IPOs in 2023?"
#question = "What will happen to venture capital in 2023?"
#question = "Do you think the Fed will lower interest rates in 2023?"
#question = "Do you think AI is a danger to humanity?"
#question = "Do you think there will be more banks going out of business?"
#question = "What do you think of tether?" 
question = "What do you think of the 2028 LA olympics?" # plug in your question here 
emq = get_embedding(question)

# Each entry of embedding_tuples is [embedding, post_id]
post_ids = [et[1] for et in embedding_tuples]
scores = [cosine_similarity(emq, et[0]) for et in embedding_tuples]

# Number of best-matching embeddings to keep
N = 10
scores_array = np.array(scores)

# argsort is ascending: the last N indices are the N highest scores, and [::-1]
# reverses them so the best match comes first.
top_N_indices = np.argsort(scores_array)[-N:][::-1]

print(top_N_indices)
print(scores_array[top_N_indices])

# Several top hits can belong to the same post. De-duplicate while keeping
# best-match-first order (dict keys preserve insertion order); a set would lose
# the ranking and make the order of articles in the prompt vary between runs.
relevant_posts = list(dict.fromkeys(post_ids[i] for i in top_N_indices))
print(relevant_posts)

# Full post records, most relevant first
input_list = [post2content[p] for p in relevant_posts]
result = ask_character(input_list, question)

print("=======================")
print(f"Q: {question}")
print(f"YFJ Bot: {result}") 



