In [1]:
from dotenv import load_dotenv
# Load environment variables from `../.env` (path is relative to the current
# working directory).
# NOTE(review): presumably this provides the credentials used by the OpenAI and
# Google Search clients created in later cells — confirm against the .env file.
load_dotenv(dotenv_path='../.env')

True

In [1]:
from langchain import LLMChain, PromptTemplate
from langchain.llms import OpenAI

# Baseline without retrieval: the question goes straight to the LLM.
# NOTE(review): temperature=0 is presumably chosen to keep answers as
# deterministic as possible — confirm.
llm = OpenAI(temperature=0)

# Prompt with a single placeholder, `{question}`, declared in input_variables.
template = """You are an assistant that answers the following question correctly and honestly: {question}\n\n"""
prompt_template = PromptTemplate(input_variables=['question'], template=template)

# Build the prompt -> LLM chain and run it on one question. The recorded output
# names Fast & Furious 9 (2021), whereas the search results printed by a later
# cell already mention Fast X (2023).
question_chain = LLMChain(llm=llm, prompt=prompt_template)
question_chain.run("what is the latest fast and furious movie?")

'\nThe latest Fast and Furious movie is Fast & Furious 9, which was released in May 2021.'

In [5]:
from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper

# Google search client used by `top_n_results` below.
# NOTE(review): presumably picks up its API key / search-engine id from the
# environment loaded in the first cell — confirm.
search = GoogleSearchAPIWrapper()
# Number of hits requested from `search.results` for each query.
TOP_N_RESULTS = 10

def top_n_results(query):
    """Run ``query`` through the Google search wrapper.

    Returns whatever ``search.results`` yields when asked for
    ``TOP_N_RESULTS`` hits.
    """
    hits = search.results(query, TOP_N_RESULTS)
    return hits

# Expose the search helper as a LangChain tool. The description is built from
# TOP_N_RESULTS so the advertised hit count cannot drift from the number that
# `top_n_results` actually requests.
tool = Tool(
    name="Google Search",
    description=f"Searches Google for the top {TOP_N_RESULTS} results",
    func=top_n_results,
)

# Same question that was put to the LLM-only chain in the earlier cell.
query = "what is the latest fast and furious movie?"

# Run the query through the search tool and print the raw hits; the recorded
# output is a list of dicts with 'title', 'link' and 'snippet' keys.
results = tool.run(query)
print(results)

[{'title': 'Fast & Furious movies in order | chronological and release order ...', 'link': 'https://www.radiotimes.com/movies/fast-and-furious-order/', 'snippet': 'Mar 22, 2023 ... Fast & Furious Presents: Hobbs & Shaw (2019); F9 (2021); Fast and Furious 10 (2023). Tokyo Drift also marks the first appearance of Han Lue, a\xa0...'}, {'title': 'FAST X | Official Trailer 2 - YouTube', 'link': 'https://www.youtube.com/watch?v=aOb15GVFZxU', 'snippet': 'Apr 19, 2023 ... FAST X | Official Trailer 2 · Comments10K.'}, {'title': 'Fast & Furious - Wikipedia', 'link': 'https://en.wikipedia.org/wiki/Fast_%26_Furious', 'snippet': 'Fast X was designed to be the final movie of the franchise which later evolved in becoming a two part finale. Justin Lin was brought back to direct both movies\xa0...'}, {'title': "Here's How to Watch Every Fast and Furious Movie In Order ...", 'link': 'https://www.menshealth.com/entertainment/a36716650/fast-and-furious-movies-in-order/', 'snippet': "Sep 18, 2023 ... Furio

In [6]:
import newspaper

# Download and parse every search hit, keeping only the pages that yield text.
# Each kept entry is {"url": <hit link>, "text": <extracted article text>}.
pages_content = []

for result in results:
    url = result.get("link")
    if not url:
        # Entry without a link (NOTE(review): e.g. a "no results" placeholder
        # from the search wrapper — confirm); nothing to download.
        continue

    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()
    except Exception as exc:
        # Best-effort scraping: one failing page must not abort the loop.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, and the failure is reported instead of hidden.
        print(f"Skipping {url}: {exc}")
        continue

    if article.text:
        pages_content.append({"url": url, "text": article.text})

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Splitter configured for 4000-character chunks with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=100)

# One Document per chunk of every scraped page, tagged with the page URL it
# came from under the "source" metadata key.
docs = [
    Document(page_content=piece, metadata={"source": page["url"]})
    for page in pages_content
    for piece in text_splitter.split_text(page["text"])
]

In [9]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client configured with the "text-embedding-ada-002" model.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Embed the text of every chunk and the search query; the next cell compares
# the two to rank the chunks.
docs_embeddings = embeddings.embed_documents([doc.page_content for doc in docs])
query_embedding = embeddings.embed_query(query)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_indices(list_of_doc_vectors, query_vector, top_k):
    """Return the indices of the document vectors most similar to the query.

    Args:
        list_of_doc_vectors: Sequence of document vectors, one per document.
        query_vector: The query vector; reshaped to a single row for comparison.
        top_k: Maximum number of indices to return. Must be non-negative.

    Returns:
        A 1-D NumPy array of indices into ``list_of_doc_vectors``, ordered from
        highest to lowest cosine similarity and truncated to ``top_k`` entries.
        Empty when there are no document vectors.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would not fail; it would silently return
        # "everything except the last |top_k| entries", which is never wanted.
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    doc_matrix = np.asarray(list_of_doc_vectors)
    if doc_matrix.size == 0:
        # No documents (e.g. every page failed to download): an empty list
        # becomes a 1-D array that the similarity routine rejects, so answer
        # with an empty index array instead of crashing.
        return np.empty(0, dtype=np.intp)

    query_row = np.asarray(query_vector).reshape(1, -1)

    # One similarity score per document, as a flat vector.
    similarities = cosine_similarity(query_row, doc_matrix).flatten()

    # argsort is ascending, so reverse it to rank best-first.
    ranked_indices = np.argsort(similarities)[::-1]

    return ranked_indices[:top_k]

# How many of the most similar chunks to keep.
top_k = 2
best_indexes = get_top_k_indices(docs_embeddings, query_embedding, top_k)
# Index `docs` directly with the ranked indices so the selected chunks stay
# ordered from most to least similar. Filtering `docs` by membership would
# return them in their original order and rescan `best_indexes` for every chunk.
best_k_documents = [docs[i] for i in best_indexes]