In [27]:
# Dependencies
import pandas as pd
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import VectorDBQA

import chromadb

# Publish the OpenAI API key, read from the local config module, as the
# OPENAI_API_KEY environment variable so the OpenAI wrappers created further
# down without an explicit key can pick it up.
from config import openai_api_key
os.environ.update(OPENAI_API_KEY=openai_api_key)


POPULATE THE CHROMA DATABASE OUTSIDE THE FUNCTION;
USE SIMILARITY SEARCH WITH METADATA INSIDE THE FUNCTION.

In [2]:
# Create a Chroma client with default settings; per the message it logs, this
# runs on embedded DuckDB without persistence, so its data is transient.
chroma_client = chromadb.Client()

Using embedded DuckDB without persistence: data will be transient


In [3]:
# Create a Chroma collection named "nfl-prospects-1" on chroma_client.
# No embedding_function is passed, so Chroma falls back to its default
# (SentenceTransformerEmbeddingFunction, per the logged message).
# NOTE(review): `collection` is not referenced by the cells visible below —
# confirm whether it is still needed.
collection = chroma_client.create_collection(name="nfl-prospects-1")

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [4]:
# Load the dataset from CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's working directory.
data = pd.read_csv('data/dataset.csv')

In [5]:
# Create the loader, specifying the column that holds the document text:
# "Player Bio" supplies each Document's page_content.
# NOTE(review): the remaining columns appear to end up as Document metadata
# (see the retrieved Documents further down) — confirm against the
# DataFrameLoader documentation.
loader = DataFrameLoader(data, page_content_column="Player Bio")

In [6]:
# Load the data: turn the DataFrame handed to the loader into Documents.
docs = loader.load()

In [7]:
# Define the embeddings: an OpenAIEmbeddings instance with default settings.
# NOTE(review): the cell that builds the vector store creates its own
# `embedding` instance, and `embeddings` is not referenced in the visible
# cells — confirm whether this one is still needed.
embeddings = OpenAIEmbeddings()

In [8]:
persist_directory = 'db'

embedding = OpenAIEmbeddings()

# Chroma.from_documents embeds every document and adds it to the store kept
# under persist_directory. Re-running it against a directory that already
# holds a persisted store would re-embed (and pay for) the whole dataset and
# add the same bios to the store again, so the store is built only when
# nothing is on disk yet; otherwise the existing one is reopened.
# NOTE: delete the 'db' directory to force a rebuild after the dataset changes.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(
        documents=docs, embedding=embedding, persist_directory=persist_directory)

Using embedded DuckDB with persistence: data will be stored in: db


In [9]:
# Write the vector store to disk, then drop the in-memory reference so that
# the following cell has to reload the store from persist_directory.
vectordb.persist()
vectordb = None

In [10]:
# Re-open the store that was persisted to disk, wiring in the same embedding
# object so that queries are embedded the same way as the stored documents.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

Using embedded DuckDB with persistence: data will be stored in: db


In [60]:
# Build the question-answering chain: an OpenAI LLM wrapped in a "stuff"
# QA chain.
# NOTE(review): temperature=0.7 is non-zero, so answers may vary between runs.
llm = OpenAI(
    temperature=0.7,
)
chain = load_qa_chain(
    llm=llm,
    chain_type="stuff",
)

In [69]:
# Question posed to the QA chain; the same string also drives the similarity
# search that selects the supporting documents.
query = "Who are your top 3 WR prospects in 2023, explain why?"

In [70]:
# Retrieve the k=3 stored bios closest to the query. Each returned Document
# already carries its metadata, so no extra flag is needed: the former
# `include_metadata=True` is not a Chroma.similarity_search parameter and was
# silently swallowed by **kwargs.
# NOTE(review): to restrict results by metadata (e.g. draft class), pass
# `filter={...}` here — confirm the key and value against the dataset.
docsearch = vectordb.similarity_search(query, k=3)

In [71]:
# Run the QA chain, passing the retrieved documents as input_documents and the
# query as the question; as the cell's last expression, the answer is displayed.
chain.run(input_documents=docsearch, question=query)


" I don't know."

In [72]:
# Display the retrieved documents to inspect what context the chain was given.
docsearch

[Document(page_content='Polished, quarterback-friendly, sure-handed possession receiver with a flare for the highlight-reel catch. Could be an effective No. 3 option, capable of lining up as a "Z" or slot, working short-to-intermediate and beating zone coverage.', metadata={'Player': 'Jarvis Landry', 'Player Grades': 6.1, 'Draft Class': 2014, 'Link': 'https://www.nfl.com/prospects/jarvis-landry/32004c41-4e16-3029-78da-4985470f8aab'}),
 Document(page_content='Athletic, smooth-moving, polished receiver who runs crisp routes, has terrific hands and boasts NFL bloodlines. Balanced skill set, dependability and versatility will be valued at the next level, where he could be a productive No. 2 or No. 3 receiver in a sophisticated passing system. Arrow pointing up.', metadata={'Player': 'T.J. Jones', 'Player Grades': 5.9, 'Draft Class': 2014, 'Link': 'https://www.nfl.com/prospects/t-j-jones/32004a4f-4e73-1255-22f2-c41bfb1a40f3'}),
 Document(page_content="Traits-based prospect with outstanding 