In [None]:
!pip install google-cloud-aiplatform --upgrade --user --quiet
!pip install langchain==0.0.229 chromadb==0.3.26 pydantic==1.10.8 typing-inspect==0.8.0 typing_extensions==4.5.0 pandas datasets google-api-python-client pypdf faiss-cpu transformers config --upgrade --user --quiet

In [5]:
# Everything this cell needs, imported up front.
import vertexai
from google.cloud import aiplatform
from google.colab import auth as google_auth
from langchain.llms import VertexAI

# Colab form fields: the Google Cloud project and region used by both SDK inits below.
PROJECT_ID = "iron-decorator-297513" # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Run the Colab authentication flow before the SDKs are initialised.
google_auth.authenticate_user()

# Point both the aiplatform and the vertexai SDK at the same project/region.
aiplatform.init(project=PROJECT_ID, location=LOCATION)
vertexai.init(project=PROJECT_ID, location=LOCATION)


In [9]:
# Utility functions for Embeddings API with rate limiting
from langchain.embeddings import VertexAIEmbeddings
import langchain
from pydantic import BaseModel
import time
from typing import List

def rate_limit(max_per_minute):
    """Generator used as a throttle: advance it once per request.

    Each ``next()`` resumes the generator, which measures how long it was
    suspended and, if that was shorter than ``60 / max_per_minute`` seconds,
    sleeps for the remainder before yielding again.

    Prints "Waiting" once when first advanced and a "." every time it sleeps.

    Args:
        max_per_minute: Maximum number of ``next()`` calls to allow per minute.

    Yields:
        None, forever.
    """
    min_interval = 60 / max_per_minute
    print("Waiting")
    while True:
        suspended_at = time.time()
        yield
        # Time still owed to keep consecutive resumes at least min_interval apart.
        remaining = min_interval - (time.time() - suspended_at)
        if remaining > 0:
            print(".", end="")
            time.sleep(remaining)


class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    """VertexAIEmbeddings variant that embeds documents in small, throttled batches."""

    # Maximum number of embedding requests per minute (handed to rate_limit).
    requests_per_minute: int
    # Number of texts sent in a single get_embeddings request.
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        """Embed ``texts`` batch by batch, pausing after each request.

        Args:
            texts: The documents to embed.

        Returns:
            A list with the ``values`` of each embedding result, in the same
            order as ``texts``.

        Raises:
            ValueError: If ``num_instances_per_batch`` or
                ``requests_per_minute`` is smaller than 1. A batch size below 1
                used to loop forever, and a rate of 0 caused a
                ZeroDivisionError after the first request.
        """
        batch_size = self.num_instances_per_batch
        if batch_size < 1:
            raise ValueError(
                f"num_instances_per_batch must be >= 1, got {batch_size}"
            )
        if self.requests_per_minute < 1:
            raise ValueError(
                f"requests_per_minute must be >= 1, got {self.requests_per_minute}"
            )

        limiter = rate_limit(self.requests_per_minute)
        docs = list(texts)
        results = []

        # Working in batches because (per the original note) the API accepts
        # a maximum of 5 documents per request to get embeddings.
        # Stepping by index avoids re-copying the remaining list every batch.
        for start in range(0, len(docs), batch_size):
            batch = docs[start : start + batch_size]
            results.extend(self.client.get_embeddings(batch))
            # Throttle after every request, including the last one, so that
            # back-to-back calls of this method stay within the limit too.
            next(limiter)

        return [r.values for r in results]

In [56]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import CTransformers
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import sys
from langchain.document_loaders.csv_loader import CSVLoader

In [57]:
### LLM and rate-limited embedding setup
# Text-generation LLM, created through the LangChain VertexAI wrapper
llm = VertexAI(
    model_name="text-bison@001", max_output_tokens=512,
    temperature=0, top_p=0.8, top_k=40,
    verbose=True,
)

# Chat model (currently disabled)
#chat = ChatVertexAI()

# Embedding client: the two constants below are forwarded to
# CustomVertexAIEmbeddings as its request rate and per-request batch size.
EMBEDDING_QPM = 15
EMBEDDING_NUM_BATCH = 2
embeddings = CustomVertexAIEmbeddings(
    num_instances_per_batch=EMBEDDING_NUM_BATCH, requests_per_minute=EMBEDDING_QPM,
)

In [16]:
embeddings

CustomVertexAIEmbeddings(client=<vertexai.preview.language_models._PreviewTextEmbeddingModel object at 0x7cc1b45b7a00>, model_name='textembedding-gecko', temperature=0.0, max_output_tokens=128, top_p=0.95, top_k=40, stop=None, project=None, location='us-central1', credentials=None, request_parallelism=5, max_retries=6, requests_per_minute=15, num_instances_per_batch=2)

In [17]:
# Mount Google Drive under /content/drive so files can be written to it from Colab.
from google.colab import drive
drive.mount('/content/drive')
# Directory on the mounted Drive; passed later as persist_directory when the Chroma store is built.
vector_save_directory = '/content/drive/MyDrive/Colab Notebooks/datafiles/chroma_vector_db/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [59]:
# Load a single CSV file with the CSVLoader provided by LangChain.
# NOTE(review): a later cell defines a custom class that is also named CSVLoader and
# takes file_paths=... instead of file_path=...; once that cell has run, re-running
# this cell will fail. Confirm the intended execution order.
import csv
loader = CSVLoader(file_path="/content/data1.csv", encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

In [24]:
# Used this for multiple CSV files. It did not help the LLM arrive at the right answer.
from langchain.document_loaders.base import BaseLoader
from typing import Optional
from typing import Dict
from langchain.docstore.document import Document


class CSVLoader(BaseLoader):
    """Load one or more CSV files, producing one Document per data row.

    NOTE: this class reuses the name of the LangChain ``CSVLoader`` imported
    earlier in the notebook and replaces it in the namespace once this cell
    has run. Unlike that loader it takes ``file_paths`` (several files), not
    ``file_path``.
    """

    def __init__(
        self,
        file_paths: List[str],
        source_column: Optional[str] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """Store the loader configuration.

        Args:
            file_paths: Paths of the CSV files to read. A single path given as
                a plain string is accepted and treated as a one-element list.
            source_column: Column whose value becomes each document's
                ``source`` metadata; when None the file path is used.
            csv_args: Extra keyword arguments forwarded to ``csv.DictReader``.
            encoding: Encoding passed to ``open``.
        """
        # A bare string would otherwise be iterated character by character in load().
        if isinstance(file_paths, str):
            file_paths = [file_paths]
        self.file_paths = list(file_paths)
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}

    @staticmethod
    def _clean(item) -> str:
        """Return a stripped string for a DictReader key or value.

        ``csv.DictReader`` uses None as the value of fields missing from a
        short row, and None as the key (with a list value) for surplus fields
        of a long row; calling ``.strip()`` on those directly would raise.
        """
        if item is None:
            return ""
        if isinstance(item, list):
            return ",".join(str(part).strip() for part in item)
        return str(item).strip()

    def load(self) -> List[Document]:
        """Read every configured file and return its rows as Documents.

        Each row becomes one Document whose ``page_content`` is a
        newline-joined list of ``column: value`` pairs. The metadata holds the
        ``source`` and the ``row`` index, which restarts at 0 for every file.

        Raises:
            ValueError: If ``source_column`` is set but missing from a row.
        """
        docs = []
        for file_path in self.file_paths:
            with open(file_path, newline="", encoding=self.encoding) as csvfile:
                csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
                for i, row in enumerate(csv_reader):
                    content = "\n".join(
                        f"{self._clean(k)}: {self._clean(v)}" for k, v in row.items()
                    )
                    if self.source_column is None:
                        source = file_path
                    else:
                        try:
                            source = row[self.source_column]
                        except KeyError as exc:
                            raise ValueError(
                                f"Source column '{self.source_column}' not found in CSV file."
                            ) from exc
                    metadata = {"source": source, "row": i}
                    docs.append(Document(page_content=content, metadata=metadata))
        return docs

In [31]:
# Load three CSV files in one go; the file_paths= keyword matches the custom
# CSVLoader class defined in this notebook. This rebinds `data`.
import csv
loader = CSVLoader(file_paths=["/content/data1.csv","/content/data2.csv","/content/data3.csv"], encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

In [55]:
#multiple CSV
data

[Document(page_content='vehicle_id: 1\nvehicle_number: ABC123\nsmoke_emission: 0.3\ntemperature: 32', metadata={'source': '/content/data1.csv', 'row': 0}),
 Document(page_content='vehicle_id: 2\nvehicle_number: DEF456\nsmoke_emission: 0.5\ntemperature: 28', metadata={'source': '/content/data1.csv', 'row': 1}),
 Document(page_content='vehicle_id: 3\nvehicle_number: GHI789\nsmoke_emission: 0.1\ntemperature: 37', metadata={'source': '/content/data1.csv', 'row': 2}),
 Document(page_content='vehicle_id: 4\nvehicle_number: JKL012\nsmoke_emission: 0.8\ntemperature: 25', metadata={'source': '/content/data1.csv', 'row': 3}),
 Document(page_content='vehicle_id: 5\nvehicle_number: MNO345\nsmoke_emission: 0.2\ntemperature: 34', metadata={'source': '/content/data1.csv', 'row': 4}),
 Document(page_content='vehicle_id: 6\nvehicle_number: PQR678\nsmoke_emission: 0.7\ntemperature: 27', metadata={'source': '/content/data1.csv', 'row': 5}),
 Document(page_content='vehicle_id: 7\nvehicle_number: STU901\ns

In [60]:
#single CSV
data

[Document(page_content='vehicle_id: 1\nvehicle_number: ABC123\nsmoke_emission: 0.3\ntemperature: 32', metadata={'source': '/content/data1.csv', 'row': 0}),
 Document(page_content='vehicle_id: 2\nvehicle_number: DEF456\nsmoke_emission: 0.5\ntemperature: 28', metadata={'source': '/content/data1.csv', 'row': 1}),
 Document(page_content='vehicle_id: 3\nvehicle_number: GHI789\nsmoke_emission: 0.1\ntemperature: 37', metadata={'source': '/content/data1.csv', 'row': 2}),
 Document(page_content='vehicle_id: 4\nvehicle_number: JKL012\nsmoke_emission: 0.8\ntemperature: 25', metadata={'source': '/content/data1.csv', 'row': 3}),
 Document(page_content='vehicle_id: 5\nvehicle_number: MNO345\nsmoke_emission: 0.2\ntemperature: 34', metadata={'source': '/content/data1.csv', 'row': 4}),
 Document(page_content='vehicle_id: 6\nvehicle_number: PQR678\nsmoke_emission: 0.7\ntemperature: 27', metadata={'source': '/content/data1.csv', 'row': 5}),
 Document(page_content='vehicle_id: 7\nvehicle_number: STU901\ns

In [33]:
# Split the loaded documents into chunks of at most 500 characters with a 20-character overlap.
# NOTE(review): `data` is whatever the most recently executed loader cell produced — confirm which one.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = text_splitter.split_documents(data)

In [51]:
# A cell only displays its final expression, so the bare len(...) call was
# silently discarded; print the chunk count explicitly, then show one sample chunk.
print(len(text_chunks))
text_chunks[10]

Document(page_content='vehicle_id: 11\nvehicle_number: EFGH789\nsmoke_emission: 0.5\ntemperature: 26', metadata={'source': '/content/data1.csv', 'row': 10})

In [34]:
from langchain.vectorstores import Chroma

# Embed every chunk and index it in a Chroma vector store that persists
# under vector_save_directory. The embeddings API is rate limited, so this
# cell may take a while.
chroma_db = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory=vector_save_directory,
)

Waiting
..................................................

In [34]:
chroma_db.persist()
# This is the step where the vectors/embeddings are saved to the directory.

In [36]:
# Re-open the persisted Chroma store so retrieval reads what was saved to disk.
# `vector_read_from_db` was used here without being defined anywhere in the
# visible notebook, which raises NameError on a fresh kernel.
vector_read_from_db = Chroma(
    persist_directory=vector_save_directory,
    embedding_function=embeddings,
)
# Similarity search returning the 2 closest chunks per query.
retriever = vector_read_from_db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [52]:

from langchain.chains import RetrievalQA

# Question-answering chain: the retriever supplies chunks from the search
# index and the LLM (Vertex PaLM Text API) synthesizes the answer from them.
# The retrieved source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm, retriever=retriever, chain_type="stuff", return_source_documents=True,
)

In [None]:
# Ask the QA chain a three-part question and print the raw result dict.
# NOTE(review): the documents shown for data1.csv have no Top_speed or Mileage
# fields — presumably those come from the other CSV files; confirm.
query = "Give me the vehicle ID for each of the following scenario 1.Maximum Temperature 2.Maximum Top_speed 3.Minimum Mileage"

result = qa({"query": query})
print(result)

In [42]:
qa("how many records are present in each csv")

{'query': 'how many records are present in each csv',
 'result': '10000',
 'source_documents': []}

In [79]:
#single CSV
# NOTE(review): `data` is whatever the most recently executed loader cell
# produced — confirm it holds the single-CSV documents before running this.
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = text_splitter.split_documents(data)

# NOTE(review): the directory name says FAISS but a Chroma store is persisted
# here, and a later cell saves a FAISS index to this same path — confirm intent.
vector_save_directory = '/content/drive/MyDrive/Colab Notebooks/datafiles/FAISS_vector_db_single/'
chroma_db = Chroma.from_documents(text_chunks,
                                  embeddings,
                                  persist_directory=vector_save_directory)
chroma_db.persist()

# Retrieve from the store built in this cell. The original read from
# `vector_read_from_db`, which is not defined anywhere in the visible notebook
# and in any case would not be the index created above.
retriever = chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Uses LLM to synthesize results from the search index.
# We use Vertex PaLM Text API for LLM
qa = RetrievalQA.from_chain_type(
              llm=llm,
              chain_type="stuff",
              retriever=retriever,
              return_source_documents=True
)


In [62]:
# Read the same CSV into a DataFrame so the chain's answers can be checked against the raw data.
import pandas as pd
df = pd.read_csv("/content/data1.csv")
df.head()

Unnamed: 0,vehicle_id,vehicle_number,smoke_emission,temperature
0,1,ABC123,0.3,32
1,2,DEF456,0.5,28
2,3,GHI789,0.1,37
3,4,JKL012,0.8,25
4,5,MNO345,0.2,34


In [68]:
# Ask the QA chain for the record count and print the raw result dict.
# The captured output for this cell shows an empty 'source_documents' list.
query = "how many records are present in the file "

result = qa({"query": query})
print(result)

{'query': 'how many records are present in the file ', 'result': '1000', 'source_documents': []}


In [71]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=ad9e1ef2c1c166f4236342b89e3ad444cf10820a73e26ff960fa8b7bb3a4f9f0
  Stored in directory: 

In [72]:
## try it with the sentence encoder embeddings for a single file
# NOTE: this rebinds `embeddings`, which previously held the CustomVertexAIEmbeddings
# instance; every later cell that uses `embeddings` gets the HuggingFace model instead.


embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [80]:
# Re-split the documents, index the chunks in a FAISS store using the current
# `embeddings` object, and save the index locally.
from langchain.vectorstores import FAISS
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = text_splitter.split_documents(data)
docsearch = FAISS.from_documents(text_chunks, embeddings)
# Saved to whatever vector_save_directory was last assigned to by an earlier cell.
docsearch.save_local(vector_save_directory)

In [82]:
# Query the FAISS index directly (no LLM involved) and print the 3 most similar chunks.
query = "vehicle id with maximum_temperature "
docs = docsearch.similarity_search(query, k=3)
print("Result", docs)

Result [Document(page_content='vehicle_id: 10\nvehicle_number: ABCD456\nsmoke_emission: 0.1\ntemperature: 36', metadata={'source': '/content/data1.csv', 'row': 9}), Document(page_content='vehicle_id: 17\nvehicle_number: 123ABC\nsmoke_emission: 0.6\ntemperature: 21', metadata={'source': '/content/data1.csv', 'row': 16}), Document(page_content='vehicle_id: 1\nvehicle_number: ABC123\nsmoke_emission: 0.3\ntemperature: 32', metadata={'source': '/content/data1.csv', 'row': 0})]


In [88]:
df.temperature.max()

38

In [93]:
df.loc[df['temperature']==38]

Unnamed: 0,vehicle_id,vehicle_number,smoke_emission,temperature
15,16,WXYZ234,0.4,38


In [91]:
# Rebinds `qa` to a conversational chain backed by the FAISS index's default retriever.
qa = ConversationalRetrievalChain.from_llm(llm, retriever=docsearch.as_retriever())

In [94]:
# Interactive question loop; type 'exit' to stop.
# The history lives outside the loop and grows after every turn. Previously it
# was re-created as an empty list on each iteration and never appended to, so
# the conversational chain never saw any earlier exchange.
chat_history = []
while True:
    query = input("Input Prompt: ").strip()
    if query == 'exit':
        print('Exiting')
        # Leave the loop instead of calling sys.exit(), which raises SystemExit
        # inside the notebook kernel.
        break
    if query == '':
        continue
    result = qa({"question":query, "chat_history":chat_history})
    print("Response: ", result['answer'])
    # The chain takes the history as (question, answer) pairs.
    chat_history.append((query, result['answer']))

Input Prompt: vehicle_id with maximum_temperature 
Response:  18
Input Prompt: vehicle_number with minimum smoke emission
Response:  123EFGH


KeyboardInterrupt: ignored

In [95]:
df.loc[df['smoke_emission']==df['smoke_emission'].max()]

Unnamed: 0,vehicle_id,vehicle_number,smoke_emission,temperature
8,9,YZ123,0.9,22
17,18,456DEF,0.9,35
25,26,890ABCD,0.9,27
