In [None]:
import os
import sys

from dotenv import load_dotenv
import PyPDF2
from PyPDF2 import PdfReader
import gradio as gr
import re

from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain_openai import OpenAIEmbeddings

In [None]:
def process_text(text: str) -> str:
    """Collapse letter-spaced text into normally spaced text.

    Scanning left to right, every pair of consecutive spaces becomes one
    space and every remaining single space is removed. Presumably this
    targets PDF extractions where letters are separated by one space and
    words by two -- confirm against the source documents.

    The substitution is done in a single pass, so no placeholder string is
    inserted into the text and input that happens to contain any particular
    marker word is left intact.

    Args:
        text: Raw text to clean.

    Returns:
        The text with double spaces reduced to one space and single spaces
        removed.
    """
    # The alternation tries the two-space branch first, so pairs are consumed
    # before a lone space can match.
    return re.sub(r"  | ", lambda m: " " if m.group() == "  " else "", text)

## "Stripe-2022" text extraction and cleaning

In [None]:
# stripe-2022-update
# Clean each page with process_text and join the pages with newlines while
# the PDF file is still open.
with open('stripe-2022-update.pdf', 'rb') as f:
    reader = PdfReader(f)
    text_stripe = '\n'.join(
        process_text(page.extract_text()) for page in reader.pages
    )

print(text_stripe)

## "Coffee_Manual" text extraction and cleaning

In [None]:
# CoffeeB_Manual Globe_EN_10.08.2022
# Concatenate the raw text of every page, with no separator between pages.
with open('CoffeeB_Manual Globe_EN_10.08.2022.pdf', 'rb') as f:
    reader = PdfReader(f)
    text_manual = "".join(page.extract_text() for page in reader.pages)

# Cleaning passes, applied in this order:
#   1. drop every capital "X";
#   2. drop each "RZ_CoffeeB_Cover_BDA_Globe_DE.indd ... <digits>:<digits>"
#      span (presumably a layout-file stamp left in the PDF -- confirm);
#   3. drop every " -" (space followed by a hyphen).
text_manual = text_manual.replace('X', '')
indd_stamp_pattern = re.compile(r"RZ_CoffeeB_Cover_BDA_Globe_DE\.indd.*\d+:\d+")
text_manual = indd_stamp_pattern.sub('', text_manual)
text_manual = text_manual.replace(' -', '')

print(text_manual)

In [None]:
# Persist each cleaned document to its own UTF-8 text file; the TextLoader
# cell further down reads these same file names back from disk.
texts = {"data_stripe.txt": text_stripe, "data_manual.txt": text_manual}

for filename, text in texts.items():
    # Mode "w" truncates, so re-running this cell replaces earlier output.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text + "\n")

print("Text added to files (existing files will be overwritten).")


## Setting up the API key

In [None]:
# Read OPENAI_API_KEY via load_dotenv() and the process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    # os.environ only accepts str values, so assigning None below would raise
    # an opaque TypeError. Fail here with an actionable message instead.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "before running this notebook."
    )

os.environ["OPENAI_API_KEY"] = api_key

In [None]:
# NOTE(review): `query` is not used by any cell visible here. Inside a
# notebook kernel sys.argv presumably holds the kernel's launch arguments
# rather than a user question -- confirm whether this is still needed.
# Guarded so a missing argument no longer raises IndexError.
query = sys.argv[1] if len(sys.argv) > 1 else ""

# The text files are written with encoding="utf-8" earlier in this notebook;
# read them back with the same encoding instead of the platform default.
loader1 = TextLoader('data_stripe.txt', encoding='utf-8')
loader2 = TextLoader('data_manual.txt', encoding='utf-8')

# Pass the embedding model explicitly (OpenAIEmbeddings is already imported
# at the top of the notebook) rather than relying on the creator's default.
index = VectorstoreIndexCreator(embedding=OpenAIEmbeddings()).from_loaders(
    [loader1, loader2]
)

In [None]:
import gradio as gr

def ask_question(user_query):
    """Answer a question from the UI using the document index.

    Leading and trailing whitespace is ignored. Blank (or missing) input is
    answered locally without querying the index, and the word "quit" in any
    letter case returns an exit message.

    Args:
        user_query: Question text entered by the user; may be empty or None.

    Returns:
        A prompt to enter a question, the exit message, or whatever
        ``index.query`` returns for the cleaned question.
    """
    cleaned_query = (user_query or "").strip()

    # Nothing to ask: avoid sending an empty query to the index.
    if not cleaned_query:
        return "Please enter a question."

    if cleaned_query.lower() == "quit":
        return "You have chosen to exit."

    return index.query(cleaned_query)

# Two-line text box in which the user types the question.
question_box = gr.Textbox(
    lines=2,
    placeholder="Enter your question here",
    label="question",
)

# Wire ask_question into a text-in / text-out Gradio interface.
iface = gr.Interface(
    fn=ask_question,
    inputs=question_box,
    outputs="text",
    title="Test",
    description="Enter your question based on 2 given docs.",
)

# NOTE(review): share=True presumably exposes a public Gradio link --
# confirm that this is intended for an app backed by a private API key.
iface.launch(share=True)
