In [32]:
!pip install langchain==0.1.8
!pip install langchain-openai==0.0.6
!pip install python-dotenv==1.0.0
!pip install chromadb==0.4.18



In [34]:
import os
import json
import yaml
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain_community.vectorstores import Chroma

In [156]:
load_dotenv()

True

In [158]:
llm = ChatOpenAI(model="gpt-4-turbo",
                 temperature=0.7,
                 model_kwargs={
                    "frequency_penalty": 0.0,
                     "presence_penalty": 0.0,
                     "top_p": 1.0,
                  })

In [159]:
def write_to_file(filename, text):
    try:
        directory = os.path.dirname(filename)
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(filename, 'a') as file:
            file.write(text)
        print("Text successfully written to", filename)
    except Exception as e:
        print("An error occurred:", str(e))

In [160]:
def save_dict_to_json(data, filename):
    directory = os.path.dirname(filename)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

In [161]:
def load_results_from_json(int):
    if int == 1:
        with open("./sources/json/first_cycle_results.json", 'r') as file:
            data = json.load(file)
            return data
    if int == 2:
        with open("./sources/json/second_cycle_results.json", 'r') as file:
            data = json.load(file)
            return data
    if int == 3:
        with open("./sources/json/third_cycle_results.json", 'r') as file:
            data = json.load(file)
            return data

In [162]:
first_cycle_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("user", """
    Read the following transcript and extract all the arguments made about AI safety. Make sure they are self-contained.

  You must stick as close as possible to the transcript - use the author's own words and tone of voice.

  You must write each argument in a valid YAML format, surrounded with backticks.
  You must separate each argument with a new line.


  The simplest possible argument must at least contain three claims:

  ```yaml
  claim: "Top-level claim"
  premises:
    - claim: "First independent premise supporting the top-level claim"
    - claim: "Second independent premise supporting the top-level claim"
  ```

  And here's an example of a more complex argument, which also includes examples to illustrate lower-level claims:

  ```yaml
  claim: "Top-level claim"
  premises:
    - claim: "First independent premise supporting the top-level claim"
    - claim: "Supporting premise for the top-level claim"
      example: "Example supporting this premise"
    - claim: "Another supporting premise for the top-level claim"
    - claim: "Second independent premise supporting the top-level claim"
      premises:
        - claim: "Supporting premise for the second independent premise"
        - claim: "Another supporting premise for the second independent premise"
        - claim: "Independent premise supporting the second independent premise"
          example: "Example supporting this independent premise"
  ```

  Here's how to read this structure:

  The top-level claim is the main argument.
  Directly nested under the claim are independent premises. These provide justification independently of other premises.
  An example can be nested directly under a claim to provide further context or support.
  Just like the top-level claim, each premise can itself be supported by further individual premises, or examples, creating a nested structure.

  # Here is the transcript:

  {transcript}
  """)
])

In [163]:
second_cycle_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("user", """
  Based on the following transcript, make the arguments clear and distinct. 
  You may need to merge similar arguments to create a better, more logical argument.

  Create the best, strongest possible version of the arguments, here's what to do:

  - Make sure the arguments is self-contained
  - Make the arguments understandable on their own, out-of-context
  - Remember, arguments are not a description or an explanation
  - Premise must always give a reason to believe the claim above
  - Avoid using pronouns in premises
  - A claim can have a maximum of two child claims (premises), rewrite if needed

  # Argument format 

  You must write each argument in valid YAML format, surrounded with backticks.
  Separate each argument with new line.
  You must stick as closely as possible to the transcript. 
  Above all, you must express the argument in the words of the author, stick as close as possible to the tone of voice and phrases used in the transcript.



  Here are some examples:

  A simple argument might look like this: 

  ```yaml
  claim: "Top-level claim"
  premises:
    - claim: "First independent premise supporting the top-level claim"
    - claim: "Second independent premise supporting the top-level claim"
  ```

  And here's an example of a more complex argument, which also includes examples to illustrate lower-level claims:

  ```yaml
  claim: "Top-level claim"
  premises:
    - claim: "First independent premise supporting the top-level claim"
    - claim: "Supporting premise for the top-level claim"
      example: "Example supporting this premise"
    - claim: "Another supporting premise for the top-level claim"
    - claim: "Second independent premise supporting the top-level claim"
      premises:
        - claim: "Supporting premise for the second independent premise"
        - claim: "Another supporting premise for the second independent premise"
        - claim: "Independent premise supporting the second independent premise"
          example: "Example supporting this independent premise"
  ```

  Here's how to read this structure:

  The top-level claim is the main argument.
  Directly nested under the claim are independent premises. These provide justification independently of other premises.
  An example can be nested directly under a claim to provide further context or support.
  Just like the top-level claim, each premise can itself be supported by further individual premises, or examples, creating a nested structure.


  # Arguments to improve:

  {all_arguments}

  # Transcript

  {transcript}
  """)
])

In [164]:
third_cycle_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a world expert at creating accessible, persuasive explanations."),
    ("user", """
  Based on your own knowledge and the transcript, create a structured explanation for the following argument. Use the context from the transcript for the explanation.

  # Argument to use for the explanation

  {argument}


  The structured explanation must be directly based on the argument. You can also use the provided transcript for context.

  You must follow this YAML format:

  ```yaml
  counteragument_to: (what would be the argument, to which this argument is a counterargument? use your own knowledge. use bullet points)

  strongest_objection: (what is the strongest, good-faith, honest objection that a thoughful person might have? use bullet points)
  consequences_if_true: (if true, what would be the consequences? write in causal language,  use bullet points, max 3)

  link_to_ai_safety: (how is this linked to AI safety? 1 sentence.)

  simple_explanation: (explain this clearly to a college student in max. 4 sentences, speak persuasively as the author of this argument. don't use bullet points)

  examples: (max 3 examples, use bullet points)

  ```

  # Here is the transcript:

  {transcript}
  """)
])

In [85]:
def load_text_from_sources_and_make_chunks(directory):
    folder_names = []
    raw_texts = []
    for entry in os.listdir(directory):
        folder_names.append(entry)
        print(folder_names)
    for folder_name in folder_names:
        loader = DirectoryLoader(f'./sources/{folder_name}/source', glob="**/*.txt")
        text_splitter = CharacterTextSplitter(chunk_size=10000,
        chunk_overlap=0,
        length_function=len)
        raw_text = loader.load()
        entire_text_from_a_source = ""
        for doc in raw_text:
            entire_text_from_a_source += doc.page_content
        chunks = text_splitter.create_documents([entire_text_from_a_source])
        formatted_chunks = []
        for chunk in chunks:
            formatted_chunks.append(chunk.page_content)
        temp = {
            "name" : folder_name,
            "path": f'./sources/{folder_name}',
            "chunks": formatted_chunks,
        }
        raw_texts.append(temp)
    return raw_texts

In [86]:
sources_dicts = load_text_from_sources_and_make_chunks("./sources")

['david_deutsch_transcripts']
['david_deutsch_transcripts', 'json']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [87]:
def first_cycle_of_extracting_arguments(dicts):
    first_cycle_chain = first_cycle_prompt | llm
    for dict in dicts:
        dict["arguments"] = []
        for chunk in dict["chunks"]:
            first_cycle_response = first_cycle_chain.invoke({"transcript": chunk})
            dict["arguments"].append(first_cycle_response.content)
        print("over")
        text = '\n\n'.join(dict["arguments"])
        filename = f"{dict['path']}/steps/first_step.md"
        write_to_file(filename, text)
    return dicts

In [90]:
dicts_with_extracted_args = first_cycle_of_extracting_arguments(sources_dicts)

over
Text successfully written to ./sources/david_deutsch_transcripts/steps/first_step.md
over
Text successfully written to ./sources/json/steps/first_step.md


In [92]:
save_dict_to_json(dicts_with_extracted_args, "./sources/json/first_cycle_results.json")

In [93]:
def second_cycle_of_extracting_arguments(dicts):
    second_cycle_chain = second_cycle_prompt | llm
    for dict in dicts:
        dict["improved_arguments"] = []
        for i, chunk in enumerate(dict["chunks"]):
            second_cycle_response = second_cycle_chain.invoke({"all_arguments": dict["arguments"][i], "transcript": chunk })
            dict["improved_arguments"].append(second_cycle_response.content)
        print("over")
        text = '\n\n'.join(dict["improved_arguments"])
        filename = f"{dict['path']}/steps/second_step.md"
        write_to_file(filename, text)
    return dicts

In [94]:
dicts_with_extracted_and_improved_args = second_cycle_of_extracting_arguments(dicts_with_extracted_args)

over
Text successfully written to ./sources/david_deutsch_transcripts/steps/second_step.md
over
Text successfully written to ./sources/json/steps/second_step.md


In [95]:
def split_improved_arguments_in_dict(dicts):
    for dict in dicts:
        dict["isolated_arguments"] = []
        for improved_arg in dict["improved_arguments"]:
            if improved_arg:
                splitted_args = improved_arg.split("```yaml")
                splitted_args_cleaned = []
                for arg in splitted_args:
                    arg_clean = arg.split("```")[0]
                    if (arg_clean != ""):
                        splitted_args_cleaned.append(arg_clean.strip())   
                if splitted_args_cleaned != None:
                    dict["isolated_arguments"].append(splitted_args_cleaned)
    return dicts

In [103]:
dicts_with_isolated_improved_arguments = split_improved_arguments_in_dict(dicts_with_extracted_and_improved_args)

In [135]:
save_dict_to_json(dicts_with_isolated_improved_arguments, "./sources/json/second_cycle_results.json")

In [149]:
def create_embeddings_with_smaller_chunks(directory):
    folder_names = []
    all_chunks = []
    dbs = []
    for entry in os.listdir(directory):
        if entry != "json":
            folder_names.append(entry)
    for folder_name in folder_names:
        loader = DirectoryLoader(f'./sources/{folder_name}/source', glob="**/*.txt")
        text_splitter = TokenTextSplitter(chunk_size=1000,
        chunk_overlap=100,
        length_function=len)
        raw_text = loader.load()
        entire_text_from_a_source = ""
        for doc in raw_text:
            entire_text_from_a_source += doc.page_content
        chunks = text_splitter.create_documents([entire_text_from_a_source])
        for chunk in chunks:
            chunk.metadata = {"folder": folder_name}
            all_chunks.append(chunk)
        embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
        db = Chroma.from_documents(all_chunks, embeddings)
        dbs.append(db)
    return dbs

In [150]:
dbs = create_embeddings_with_smaller_chunks("./sources")

In [170]:
def third_cycle_of_extracting_arguments(dicts):
    third_cycle_chain = third_cycle_prompt | llm
    for i, dict in enumerate(dicts):
        dict["explanations"] = []
        if i < len(dbs):  
         db = dbs[i]
        else:
         db = None
        for i, chunk in enumerate(dict["chunks"]):
            explanations = []
            for arg in dict["isolated_arguments"][i]:
                docs = db.similarity_search(arg, k=3)
                print(docs)
                context = ""
                for doc in docs:
                    context += "\n" + doc.page_content
                explanation = third_cycle_chain.invoke({"argument": arg, "transcript": context})
                explanations.append(explanation.content.split("```yaml\n")[1].split("```")[0].strip())
            dict["explanations"].append(explanations)
    return dicts

In [171]:
dicts_with_arguments_and_explanations = third_cycle_of_extracting_arguments(dicts_with_isolated_improved_arguments)

[Document(page_content=' and step by step. True, the atoms in the brain would be emulated by metal cogs and levers rather than organic material — but in the present context, inferring anything substantive from that distinction would be rank racism.\n\nDespite their best efforts, Babbage and Lovelace failed almost entirely to convey their enthusiasm about the Analytical Engine to others. In one of the great might-have-beens of history, the idea of a universal computer languished on the back burner of human thought. There it remained until the 20th century, when Alan Turing arrived with a spectacular series of intellectual tours de force, laying the foundations of the classical theory of computation, establishing the limits of computability, participating in the building of the first universal classical computer and, by helping to crack the Enigma code, contributing to the Allied victory in the Second World War.\n\nTuring fully understood universality. In his 1950 paper ‘Computing Machin

Failed to batch ingest runs: LangSmithError("Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPSConnectionPool(host='api.smith.langchain.com', port=443): Read timed out. (read timeout=10.0)\n")


[Document(page_content=' perform that the other could not in principle perform too?\n\nOne immediate difference between them was that the sequence of elementary steps (of counting, adding, multiplying by 10, and so on) that the Difference Engine used to compute a given function did not mirror those of the human ‘computers’. That is to say, they used different algorithms. In itself, that is not a fundamental difference: the Difference Engine could have been modified with additional gears and levers to mimic the humans’ algorithm exactly. Yet that would have achieved nothing except an increase in the error rate, due to increased numbers of glitches in the more complex machinery. Similarly, the humans, given different instructions but no hardware changes, would have been capable of emulating every detail of the Difference Engine’s method — and doing so would have been just as perverse. It would not have copied the Engine’s main advantage, its accuracy, which was due to hardware not softwa

In [173]:
save_dict_to_json(dicts_with_arguments_and_explanations, "./sources//json/third_cycle_results.json")

In [174]:
def save_third_step(dicts):
    for dict in dicts:
        dict["final_arguments"] = []
        for i, isolated_argument_group in enumerate(dict["isolated_arguments"]):
            for j, single_isolated_arg in enumerate(isolated_argument_group):
                final_arg = "\n".join([single_isolated_arg, dict["explanations"][i][j]])
                dict["final_arguments"].append(final_arg)
                final_arg_yaml_format = " ```yaml\n" + final_arg + "\n```\n\n"
                filename = f"{dict['path']}/steps/third_step.md"
                write_to_file(filename, final_arg_yaml_format)
    return dicts

In [175]:
final_dicts = save_third_step(dicts_with_arguments_and_explanations)

Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/steps/third_step.md
Text successfully written to ./sources/david_deutsch_transcripts/

In [176]:
def create_directory_structure_for_chatbots(directory_name):
    script_directory = os.getcwd()
    new_directory_path = os.path.join(script_directory, directory_name)
    if not os.path.exists(new_directory_path):
        os.makedirs(new_directory_path)
        yaml_file_path = os.path.join(new_directory_path, "metadata.yaml")
        with open(yaml_file_path, "w") as yaml_file:
            with open(yaml_file_path, "w") as yaml_file:
                yaml.dump({}, yaml_file)
        prompts_folder_path = os.path.join(new_directory_path, "prompts")
        knowledge_base_folder_path = os.path.join(new_directory_path, "knowledge_base")
        for folder_path in [prompts_folder_path, knowledge_base_folder_path]:
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
                system_prompt_file_path = os.path.join(prompts_folder_path, "system_prompt.md")
                with open(system_prompt_file_path, "w") as system_prompt_file:
                    system_prompt_file.write("System Prompt")
    if os.path.exists(new_directory_path):
        yaml_file_path = os.path.join(new_directory_path, "metadata.yaml")
        with open(yaml_file_path, "w") as yaml_file:
            yaml.dump({"name": None, #add your chatbot name here don't use quotes when naming chatbot 
                       "tags": None, #tag your chatbot can be #optimistic or #pessimistic
                       "based_on": None #provide the uri to source of the raw_text used for arguments extraction, add it in new line with dash in front uri (- https://uri_to_source.com) 
                      }, yaml_file, sort_keys=False)
        prompts_folder_path = os.path.join(new_directory_path, "prompts")
        knowledge_base_folder_path = os.path.join(new_directory_path, "knowledge_base")
        for folder_path in [prompts_folder_path, knowledge_base_folder_path]:
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)

In [177]:
def create_chatbots(dicts):
    for dict in dicts:
        create_directory_structure_for_chatbots(dict["name"])
        prompt_filename = f'./{dict["name"]}/prompts/system_prompt.md'
        with open(prompt_filename, 'w') as file:
            file.write("Use arguments provided to answer the question.\n\nArguments:\n\n{arguments}")
        for i, final_arg in enumerate(dict["final_arguments"]):
            filename = f'./{dict["name"]}/knowledge_base/{dict["name"]}-{str(i + 1)}.md'
            with open(filename, 'w') as file:
                file.write(final_arg)

In [178]:
create_chatbots(final_dicts)

In [179]:
dicts_with_isolated_improved_arguments = load_results_from_json(2)
dicts_with_arguments_and_explanations = load_results_from_json(3)