In [1]:
!pip install "weaviate-client>=3.26.7,<4.0.0"
!pip install langchain
!pip install langchain_community
!pip install openai==0.28



In [2]:
opn_k = "opn_k"
wea_k = "wea_k"
wea_c = "wea_c"

## Data Reading

In [3]:
# !mkdir data

In [4]:
!pip install unstructured
!pip install "unstructured[pdf]"



In [5]:
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader("./data",glob = "**/*.pdf")
data = loader.load()

In [6]:
data

[Document(metadata={'source': 'data/yolov7paper.pdf'}, page_content='2 2 0 2\n\nl u J\n\n6\n\n]\n\nV C . s c [\n\n1 v 6 9 6 2 0 . 7 0 2 2 : v i X r a\n\nYOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors\n\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1 1Institute of Information Science, Academia Sinica, Taiwan kinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\n\nAbstract\n\nYOLOv7 surpasses all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS and has the highest accuracy 56.8% AP among all known real-time object detectors with 30 FPS or higher on GPU V100. YOLOv7-E6 object detector (56 FPS V100, 55.9% AP) outperforms both transformer-based detector SWIN- L Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by 509% in speed and 2% in accuracy, and convolutional- based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6 FPS A100, 55.2% AP) by 551% in speed and 0.7% AP in accuracy, as we

## Text Spliting

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(data)

In [8]:
docs

[Document(metadata={'source': 'data/yolov7paper.pdf'}, page_content='2 2 0 2\n\nl u J\n\n6\n\n]\n\nV C . s c [\n\n1 v 6 9 6 2 0 . 7 0 2 2 : v i X r a\n\nYOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors\n\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1 1Institute of Information Science, Academia Sinica, Taiwan kinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\n\nAbstract'),
 Document(metadata={'source': 'data/yolov7paper.pdf'}, page_content='Abstract\n\nYOLOv7 surpasses all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS and has the highest accuracy 56.8% AP among all known real-time object detectors with 30 FPS or higher on GPU V100. YOLOv7-E6 object detector (56 FPS V100, 55.9% AP) outperforms both transformer-based detector SWIN- L Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by 509% in speed and 2% in accuracy, and convolutional- based detector ConvNeXt-XL Cascade-Ma

In [9]:
len(docs)

90

## Embedding Convertion

In [10]:
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key= opn_k)

  embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)


In [11]:
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, async_client=None, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

## Vector Database Storage

In [16]:
from weaviate.client import Client
from weaviate.auth import AuthApiKey

auth_config = AuthApiKey(api_key=wea_k)

# Initialize the Weaviate client
client = Client(
    url=wea_c,
    auth_client_secret=auth_config,
    additional_headers={"X-OpenAI-Api-Key": opn_k},
    startup_period = 90
)

# Check connection
if client.is_ready():
    print("Connected to Weaviate successfully!")
else:
    print("Failed to connect to Weaviate.")

Connected to Weaviate successfully!


In [17]:
from langchain.vectorstores import Weaviate
# define input structure
client.schema.delete_all()
client.schema.get()
schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

client.schema.create(schema)
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])

In [18]:
# load text into the vectorstore
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]
texts, meta = list(zip(*text_meta_pair))
vectorstore.add_texts(texts, meta)

['f852b1aa-f384-4dd6-974b-4faaff710bcc',
 '6e953902-bb3f-43c2-bab5-a1ae698b6511',
 '4be93b72-1eb3-45d7-9284-de3de51e99d1',
 '297eee53-9fa5-497c-b3cb-0fec30077023',
 'a4839427-7a29-4c33-b045-e2e5402ad875',
 'eecf946c-fee4-41bc-a738-31ceba3b8d3f',
 '1f031776-6d47-45a1-a1a4-1c2d145936c6',
 'f956d990-35d4-4ef2-955b-f6fe94513e75',
 'f5749460-f99c-4513-806e-e0dc0acfee22',
 '12d59a97-ff6c-4538-bca7-5aa4cdf99ff5',
 'bfd29ff9-162c-4be3-937e-f1f9301abb5e',
 '7e77ef24-0e27-44c7-97e3-81ef9837d04e',
 '2c6d901d-47f2-4701-ad0a-8c61ff88e713',
 '52ad7f22-7712-4980-aa59-49461388cba6',
 '3a534b0d-48f9-4ea6-8234-d8574fe834b7',
 'e5c7f1de-1668-4dfd-af49-585a9b8c6c34',
 'f415fb62-655b-4e46-8e62-9cf529f56f53',
 'ff4ec2d6-89b2-4920-8367-f5bca7ff5e3f',
 '9d3e5fdf-0b83-49c1-9a09-140b9254a98a',
 '0a2639bc-fb7f-4c68-8952-1a1973ea6b69',
 '0658ab96-4523-4a8f-b513-65d96a0eae8c',
 '36f7fb09-0ccc-434b-afed-d3501bd0bf9b',
 '7c1b904d-ca8d-4c9b-9f92-c29da6b95f90',
 'f8930509-3647-4037-afa6-3c49d7e2851f',
 '302c187b-6da3-

## Similarity Measurement

In [19]:
query = "what is a yolo?"

# retrieve text related to the query
docs = vectorstore.similarity_search(query, top_k=20)

In [20]:
docs

[Document(metadata={'source': 'data/yolov7paper.pdf'}, page_content='69.6% -\n\nYOLOv5-L (r6.1) [23] YOLOv5-M6 (r6.1) [23] YOLOR-CSP-X [81] YOLOv7-W6 YOLOv5-X (r6.1) [23] YOLOX-M [21] PPYOLOE-L [85] YOLOR-P6 [81] YOLOX-L [21] YOLOR-W6 [81] YOLOv5-L6 (r6.1) [23]\n\n46.5M 35.7M 96.9M 70.4M 86.7M 25.3M 52.2M 37.2M 54.2M 79.8G 76.8M\n\n109.1G 200.0G 226.8G 360.0G 205.7G 73.8G 110.1G 325.6G 155.6G 453.2G 445.6G\n\n640 1280 640 1280 640 640 640 1280 640 1280 1280\n\n99 90 87 84 83 81 78 76 69 66 63\n\n/ 49.0% - / 51.3% 71.4% 57.9% 53.0% / 52.7% 54.9% / 54.6% 72.6% 60.1%\n\n-\n\n-\n\n/ 50.7% 47.2% / 46.9% 51.4% / 50.9% 53.9% / 53.5% 50.1% / 49.7% 55.2% / 54.8% 72.7% 60.5%\n\n- 68.9% 71.4% -\n\n- 55.6% 58.9% -\n\n/ 53.7%\n\n\n\n\n\nYOLOX-X [21] YOLOv7-E6 YOLOR-E6 [81] PPYOLOE-X [85] YOLOv7-D6 YOLOv5-X6 (r6.1) [23] YOLOv7-E6E YOLOR-D6 [81]\n\n99.1M 97.2M 115.8M 98.4M 154.7M 140.7M 151.7M 151.7M\n\n281.9G 515.2G 683.2G 206.6G 806.8G 839.2G 843.2G 935.6G\n\n640 1280 1280 640 1280 1280 1280 1280\n

In [21]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [22]:
# define chain
chain = load_qa_chain(
    OpenAI(openai_api_key = opn_k,temperature=0),
    chain_type="stuff")

  OpenAI(openai_api_key = OPENAI_API_KEY,temperature=0),
stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(


In [23]:
# create answer
chain.run(input_documents=docs, question=query)

  chain.run(input_documents=docs, question=query)


' YOLO (You Only Look Once) is a real-time object detection algorithm that uses a single neural network to predict bounding boxes and class probabilities for objects in an image. It was first introduced in 2016 by Joseph Redmon et al. and has since been improved upon with newer versions such as YOLOv2, YOLOv3, and YOLOv4. YOLO is known for its speed and accuracy, making it a popular choice for applications such as self-driving cars and surveillance systems.'