In [31]:
import json
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

In [32]:
# Read a local .env file into the process environment before building the
# embeddings client.
# NOTE(review): OpenAIEmbeddings() receives no explicit credentials here, so it
# presumably reads OPENAI_API_KEY from the environment -- confirm.
load_dotenv()
embeddings = OpenAIEmbeddings()

In [2]:
# Parse the raw JSON export for inspection.
# The encoding is pinned to UTF-8 so the parse does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
# NOTE: `data` is rebound further down by `data = loader.load()`, and no use of
# this raw parse is visible in between -- it appears to be exploratory only.
with open('title_url_text.json', encoding='utf-8') as f:
    data = json.load(f)

In [25]:
def make_metadata(data, default_metadata):
    """Copy the record's ``title`` and ``url`` into the metadata dict.

    Args:
        data: One record (mapping) that must contain ``'title'`` and ``'url'``.
        default_metadata: Metadata dict to extend; it is modified in place.

    Returns:
        The same ``default_metadata`` object, now holding ``title`` and ``url``.

    Raises:
        KeyError: If ``data`` lacks ``'title'`` or ``'url'``.
    """
    # Copy the fields one at a time, in this order, so a missing key fails at
    # exactly the same point as a pair of plain assignments would.
    for field in ('title', 'url'):
        default_metadata[field] = data[field]
    return default_metadata


# Turn each item selected by the jq expression `.[]` into a document: the
# item's `text` field is the content, and `make_metadata` attaches its
# `title` and `url`.
# For a trial run, point `file_path` at 'title_url_text_test.json' instead
# (presumably a smaller sample file -- confirm it exists).
loader = JSONLoader(
    file_path='title_url_text.json',
    jq_schema='.[]',
    content_key='text',
    text_content=True,
    metadata_func=make_metadata,
)

# NOTE: this rebinds `data`, which an earlier cell assigned from json.load();
# from here on `data` is whatever loader.load() returns.
data = loader.load()

In [33]:
# Split the loaded documents into overlapping chunks.
# NOTE(review): with from_tiktoken_encoder the size/overlap values are
# presumably measured in tiktoken tokens rather than characters -- confirm
# against the installed langchain version.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=500,
)
docs = text_splitter.split_documents(data)

# Show the number of chunks produced as the cell output.
len(docs)

3699

In [30]:
# Rough word count for the first three chunks.
# Periods are turned into spaces and the text is split on single spaces, so
# consecutive spaces yield empty tokens that are counted too, and newlines
# are not treated as separators -- read these numbers as an approximation.
for doc in docs[:3]:
    tokens = doc.page_content.replace('.', ' ').split(' ')
    print(len(tokens))

281
332
397


In [34]:
# Embed every chunk and build the FAISS index, then persist it to the
# "faiss_index" folder so later cells can reload it.
db = FAISS.from_documents(documents=docs, embedding=embeddings)
db.save_local(folder_path="faiss_index")

In [38]:
# Reload the index written by db.save_local("faiss_index"), using the same
# embeddings object for query-time encoding.
# NOTE(review): langchain's FAISS loader is understood to unpickle the stored
# docstore -- only load index folders you created yourself; confirm for the
# installed version.
new_db = FAISS.load_local(
    folder_path="faiss_index",
    embeddings=embeddings,
)

In [39]:
query = "perfect goals of \'provable\' alignment, nor total alignment of superintelligences on exact human values"

In [45]:
# Fetch the 10 nearest chunks for the probe query.
# NOTE: despite the variable name, the displayed result below is a plain list
# of Document objects with no scores attached.
# NOTE(review): similarity_search_with_score is presumably the call to use if
# scores are wanted -- confirm.
docs_and_scores = new_db.similarity_search(query=query, k=10)

# Show how many hits came back as the cell output.
len(docs_and_scores)

10

In [46]:
# Display the full list of the 10 hits; each Document carries its whole
# page_content, so this output is long.
docs_and_scores

[Document(page_content='2\n-2.  When I say that alignment is lethally difficult, I am not talking about ideal or perfect goals of \'provable\' alignment, nor total alignment of superintelligences on exact human values, nor getting AIs to produce satisfactory arguments about moral dilemmas which sorta-reasonable humans disagree about, nor attaining an absolute certainty of an AI not killing everyone.  When I say that alignment is difficult, I mean that in practice, using the techniques we actually have, "please don\'t disassemble literally everyone with probability roughly 1" is an overly large ask that we are not on course to get.  So far as I\'m concerned, if you can get a powerful AGI that carries out some pivotal superhuman engineering task, with a less than fifty percent change of killing more than one billion people, I\'ll take it.  Even smaller chances of killing even fewer people would be a nice luxury, but if you can get as incredibly far as "less than roughly certain to kill e