In [1]:
import transformers
import langchain
import os
from langchain.llms import OpenAI

In [2]:
# Take the OpenAI key from the environment and prompt for it only when it is
# missing. This keeps the secret out of the notebook source and avoids
# overwriting an already-configured key with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# NOTE(review): confirm that 'text-davinci-003' is still served by the API.
llm = OpenAI(model='text-davinci-003', temperature=0)
text = "Explain fine tuning with Large Language Models"
# Send the prompt to the completion model and show the generated text.
output = llm(text)
print(output)



Fine tuning with large language models is a process of using a pre-trained language model to improve the performance of a downstream task. This is done by taking a large language model, such as BERT or GPT-2, and fine-tuning it on a specific task. This involves adjusting the weights of the model to better fit the task at hand. This process can be used to improve the accuracy of a model on a specific task, such as sentiment analysis or question answering. Fine tuning with large language models can also be used to improve the generalization of a model, allowing it to better handle unseen data.


In [3]:
from sentence_transformers import SentenceTransformer
from langchain import PromptTemplate, LLMChain
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI

In [20]:
# Path of the PDF that is loaded and indexed in the next cell (relative path).
file_path = "VectorDBs.pdf"

In [21]:
# Load the PDF. load() is used instead of load_and_split() so that the text is
# chunked exactly once, by the splitter configured below, rather than being
# pre-split with a default splitter and then split a second time.
loader = PyPDFLoader(file_path)
pages = loader.load()
# Split the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.split_documents(pages)
# Create the Hugging Face embeddings.
embeddings = HuggingFaceEmbeddings(
 model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Embed each chunk and load it into the FAISS vector store.
faiss_db = FAISS.from_documents(documents, embeddings)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [22]:
# Question-answering chain backed by the FAISS index: the chat model answers
# from the 3 chunks returned by the retriever ("stuff" chain type).
chat_model = ChatOpenAI(temperature=0)
faiss_retriever = faiss_db.as_retriever(search_kwargs={"k": 3})
qa = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=faiss_retriever,
    return_source_documents=True,
    verbose=True,
)

In [23]:
# Run the chain on one question and print only the answer text; the chain's
# response is a mapping whose 'result' entry holds the answer.
question = "Give me the important points in the document"
answer = qa(question)
print(answer["result"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
- Vector databases are becoming increasingly popular in the field of data science and artificial intelligence.
- They are used to store and manage embeddings, which are numerical representations of data points.
- Vector databases are designed to efficiently handle high-dimensional data and enable fast similarity searches.
- They are particularly useful in applications such as search engines, recommendation systems, and generative AI.
- Weaviate is a popular vector database that provides a beginner-friendly interface for search operations.
- Managed vector databases offer a cutting-edge approach to managing embeddings and can simplify the deployment and maintenance process.
- Vector databases can be used to augment LLM (Language Model) apps, improving their performance and accuracy.
- Langchain is a platform that utilizes vector databases to enhance its functionality.
- Vector databases play a crucial role in compa

In [28]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

In [29]:
# Folder holding the PDF files to load.
# NOTE(review): absolute path, presumably a Colab working directory - adjust for other environments.
pdf_folder_path = '/content/test'

In [34]:
# Show the entries of the PDF folder to check what is available to load.
os.listdir(pdf_folder_path)

['Vector Databases Are Getting Popular. This Is How They Compare Embedding.pdf',
 'What are Vector Databases and How Langchain uses Vector DBs  by Mehul Gupta.pdf',
 'How LangChain Makes Large Language Models More Powerful by Renu Khandelwal.pdf',
 'Not All Vector Databases Are Made Equal by Dmitry Kan.pdf']

In [47]:
# Build one loader per PDF in the folder. Entries that are not PDFs (other
# files, hidden folders such as notebook checkpoints) are skipped, and the
# names are sorted because os.listdir returns entries in arbitrary order.
loaders = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn))
    for fn in sorted(os.listdir(pdf_folder_path))
    if fn.lower().endswith(".pdf")
]

In [48]:
# Display the list of loader objects.
loaders

[<langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7bab4d2abc10>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7bab4d2ab730>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7bab4d2abc70>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7bab4d2ab430>]

In [4]:
from langchain.vectorstores import Chroma

In [9]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
import magic
import nltk
from langchain.document_loaders import PyPDFDirectoryLoader

In [10]:
# Load every PDF found in the folder configured in `pdf_folder_path`, rather
# than repeating the literal path, so the folder is defined in a single place.
loader = PyPDFDirectoryLoader(pdf_folder_path)
documents = loader.load()

In [12]:
# Chunk the loaded documents (chunk_size=1000, chunk_overlap=100).
splitter_options = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.split_documents(documents)

# Hugging Face sentence-transformers model used to embed the chunks.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Embed the chunks and store them in a Chroma vector store.
chroma_db = Chroma.from_documents(documents, embeddings)

In [14]:
# Same question-answering setup as before, now retrieving from the Chroma
# store: the top 3 chunks are passed to the chat model ("stuff" chain type).
qa_settings = {
    "llm": ChatOpenAI(temperature=0),
    "chain_type": "stuff",
    "retriever": chroma_db.as_retriever(search_kwargs={"k": 3}),
    "return_source_documents": True,
    "verbose": True,
}
qa = RetrievalQA.from_chain_type(**qa_settings)

In [15]:
# Query the Chroma-backed chain and print the answer text from its response.
query = "what are vector databases? give me their relevance to Large language models"
answer = qa(query)
print(answer["result"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Vector databases are a type of database that are specifically designed to store and retrieve high-dimensional, unstructured data. They are particularly relevant to large language models because they can efficiently handle the complex and diverse data that these models require.

Traditional databases typically rely on predefined schemas and distinct attributes to organize and retrieve data. However, large language models deal with unstructured data, such as documents, images, audio, and videos, which do not fit neatly into predefined schemas. Vector databases, on the other hand, can store and retrieve this unstructured data without the need for a predefined schema.

Vector databases use indexing techniques that are specifically optimized for high-dimensional data. These techniques allow for efficient searching and retrieval of data based on similarity or distance metrics. This is particularly useful for large langu