In [None]:
# Install/upgrade the notebook's dependencies quietly: LangChain and its Google
# GenAI integration, FAISS (CPU build), PDF parsers and sentence-transformers.
%pip install -U --quiet langchain-google-genai langchain faiss-cpu pypdf sentence-transformers PyPDF2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.4/116.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [None]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from google.colab import userdata
import google.generativeai as genai

In [None]:
# Read the Gemini API key from Colab's `userdata` store (imported from google.colab).
api_key = userdata.get('GOOGLE_API_KEY')
if not api_key:
    # The key is looked up via Colab userdata, not the process environment,
    # so the message points the user at the place that actually needs fixing.
    raise ValueError(
        "Missing GOOGLE_API_KEY: add it to this notebook's Colab secrets (userdata)"
    )

# Hand the key to the google.generativeai client.
genai.configure(api_key=api_key)

In [None]:
# Mirror the key into the process environment as well.
# NOTE(review): presumably for the LangChain Google wrappers, which are
# constructed below without an explicit key -- confirm.
os.environ.update({"GOOGLE_API_KEY": api_key})

In [None]:
class pdfQA:
    """Interactive question answering over PDF files.

    Text is extracted from the PDFs, split into chunks, embedded with
    ``GoogleGenerativeAIEmbeddings`` and stored in a FAISS index saved on
    disk. Questions are answered by a LangChain "stuff" QA chain backed by a
    ``ChatGoogleGenerativeAI`` model, using the chunks returned by a
    similarity search as context.
    """

    # Class-level defaults so the attributes exist even before __init__ runs.
    index_path = "faiss_index"
    vector_store = None

    # Kept byte-for-byte (including its indentation and trailing whitespace)
    # so the text sent to the model is exactly what it was before.
    _PROMPT_TEMPLATE = """
      Answer the question with full context details. If you don't know the answer say "The context doesn't tell you about".\n\n
      Context:\n {context}?\n
      Question: \n{question}\n

      Answer:
      """

    def __init__(self, model_path="models/embedding-001", index_path="faiss_index"):
        """Create the embedding client and the QA chain.

        Args:
            model_path: Name of the embedding model. The same client is used
                both to build and to query the index, so the two always agree.
            index_path: Directory where the FAISS index is saved and loaded.
        """
        self.index_path = index_path
        self.embeddings = GoogleGenerativeAIEmbeddings(model=model_path)
        self.vector_store = None  # loaded or created lazily, then reused
        self.chain = self._get_conversational_chain()

    @staticmethod
    def _parse_pdf_paths(raw_paths):
        """Split a comma-separated string into a list of clean, non-empty paths."""
        return [path.strip() for path in raw_paths.split(",") if path.strip()]

    @staticmethod
    def _upload_pdfs():
        """Ask the user to upload files through Colab and return them as byte streams."""
        # Local imports: only needed for the upload path, and google.colab is
        # only importable inside a Colab runtime.
        import io
        from google.colab import files

        uploaded = files.upload()  # maps file name -> file content (bytes)
        return [io.BytesIO(content) for content in uploaded.values()]

    def _get_pdf_text(self, pdf_docs):
        """Extract and concatenate the text of every page of every PDF.

        Args:
            pdf_docs: Iterable of paths or binary streams accepted by ``PdfReader``.

        Returns:
            The concatenated page text; an empty string if there are no pages.
        """
        page_texts = []
        for pdf in pdf_docs:
            pdf_reader = PdfReader(pdf)
            # Guard against pages for which no text is returned (e.g. scanned
            # images), so one such page cannot abort the whole extraction.
            page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)
        return "".join(page_texts)

    def _get_text_chunks(self, text):
        """Split text into overlapping chunks for embedding."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
        return text_splitter.split_text(text)

    def _create_vector_store(self, text_chunks):
        """Embed the chunks into a FAISS index, save it and keep it in memory.

        Raises:
            ValueError: If ``text_chunks`` is empty (nothing to index).
        """
        if not text_chunks:
            raise ValueError("No text could be extracted from the PDFs; nothing to index.")
        vector_store = FAISS.from_texts(text_chunks, embedding=self.embeddings)
        vector_store.save_local(self.index_path)
        self.vector_store = vector_store

    def _load_vector_store(self):
        """Return the FAISS index, reading it from disk only on first use."""
        if self.vector_store is None:
            # SECURITY: allow_dangerous_deserialization unpickles the saved
            # index; only load index directories this notebook created itself.
            self.vector_store = FAISS.load_local(
                self.index_path,
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True,
            )
        return self.vector_store

    def _get_conversational_chain(self):
        """Build the "stuff" question-answering chain from the prompt and chat model."""
        model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
        prompt = PromptTemplate(
            template=self._PROMPT_TEMPLATE,
            input_variables=["context", "question"],
        )
        return load_qa_chain(model, chain_type="stuff", prompt=prompt)

    def _answer_user_question(self, user_question):
        """Answer a question using the chunks most similar to it as context.

        Returns:
            The chain's ``output_text`` for the question.
        """
        docs = self._load_vector_store().similarity_search(user_question)
        response = self.chain(
            {"input_documents": docs, "question": user_question},
            return_only_outputs=True,
        )
        return response["output_text"]

    def _build_index_from_user_input(self):
        """Ask for PDFs (paths or upload), then build and save the index."""
        print("Vector store not found. Processing PDFs to create it...")
        raw_choice = input("Enter paths to your PDF files separated by commas (,) or 'upload' to upload from local machine: ")
        if raw_choice.strip().lower() == "upload":
            pdf_docs = self._upload_pdfs()
        else:
            pdf_docs = self._parse_pdf_paths(raw_choice)
        if not pdf_docs:
            raise ValueError("No PDF files were provided.")

        raw_text = self._get_pdf_text(pdf_docs)
        text_chunks = self._get_text_chunks(raw_text)
        self._create_vector_store(text_chunks)
        print("Vector store created.")

    def main(self):
        """Run the interactive session: build the index if needed, then answer questions."""
        print("Chat with PDF using Gemini")

        # Reuse an index saved by a previous run; otherwise build one now.
        if not os.path.exists(self.index_path):
            self._build_index_from_user_input()

        print("Ask a Question from the PDF Files (or 'quit')\n")
        while True:
            user_question = input("User: ").strip()
            if user_question.lower() == 'quit':
                break
            if not user_question:
                continue  # nothing to ask; prompt again
            answer = self._answer_user_question(user_question)
            print(f"\nGemini: \n{answer}\n\n")


In [None]:
if __name__ == "__main__":
    # Build the QA system (embedding client + QA chain), then hand control
    # to its interactive loop.
    qa_system = pdfQA()
    qa_system.main()

Chat with PDF using Gemini
Ask a Question from the PDF Files (or 'quit')

User: summarize the case study and results

Gemini: 
The case study evaluated the performance of different forecasting models for one-step-ahead forecasting of wind speed. Five time series from different cities in Brazil were used, and the models included linear models (AR and ARMA), neural networks (MLP, RBF, ELM, and ESN), hybrid models (AR+ANN, ARMA+ANN), and ensemble models. The results showed that the ensemble models generally performed better than the single models, with the Median Ensemble of all single models except the RBF being the best overall predictor. The ARMA model performed well for series with lower coefficient of variation, while the Ensemble Median SM-RBF performed better for series with higher coefficient of variation and magnitudes. The study highlights the importance of considering different forecasting models and ensemble approaches to improve the accuracy of wind speed forecasting for rene