In [10]:
import logging
from telegram import Update
from telegram.constants import ParseMode
from telegram.ext import Updater, CommandHandler, MessageHandler, filters, ApplicationBuilder, ContextTypes 
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ollama import Client
import nest_asyncio
from telegram.helpers import escape_markdown
import asyncio


In [11]:
nest_asyncio.apply()

# Replace 'YOUR_API_TOKEN' with the token you received from the BotFather
API_TOKEN = 'YOUR_API_TOKEN'


# Set up logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

client = Client(host='http://localhost:11434')

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=512,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

In [12]:
embeddings = HuggingFaceEmbeddings(model_name= 'BAAI/bge-large-en')


2024-04-24 13:58:06,750 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: BAAI/bge-large-en
2024-04-24 13:58:07,962 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cuda


In [13]:
def load_and_process_documents(file_path):

    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split(text_splitter=text_splitter)
    for page in pages:
        page.page_content = page.page_content.replace('\n',' ')
    return pages


In [14]:
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handler for the /start command"""
    user_id = update.effective_user.id
    if user_id not in context.bot_data:
        context.bot_data[user_id] = {}
    await update.message.reply_text('Welcome! Please send me the PDF documents you want to process.')

In [19]:
async def document_handler(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handler for receiving PDF documents"""
    user_id = update.effective_user.id
    
     # Initialize user-specific data if it doesn't exist
    if user_id not in context.bot_data:
        context.bot_data[user_id] = {}
        
    document = update.message.document
    if document.mime_type == 'application/pdf':
        file_id = document.file_id
        new_file = await context.bot.get_file(file_id)
        file_path = f"{file_id}.pdf"
        await new_file.download_to_drive(file_path)
        
        pages = load_and_process_documents(file_path)
        if 'vectordb' not in context.bot_data[user_id]:
            vectordb = Chroma.from_documents(pages, embeddings)
            context.bot_data[user_id]['vectordb'] = vectordb
        else:
            vectordb = context.bot_data[user_id]['vectordb']
            vectordb.add_documents(pages)
        
        await update.message.reply_text('PDF document received and processed. You can now ask questions about the content.')
    else:
        await update.message.reply_text(f"Unsupported file type: {document.mime_type}. Skipping this file.")

In [16]:
def get_prompt(question, vectordb):
    
    documents = vectordb.similarity_search(question, k=10)
    context = '\n'.join(doc.page_content for doc in documents)
    
    prompt = f"""Using only the context below, answer the following question:
    context : {context}
    question: {question}"""
    
    return prompt

In [17]:
async def question_handler(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handler for answering questions based on the processed documents"""
    user_id = update.effective_user.id
    question = update.message.text
    vectordb = context.bot_data.get(user_id, {}).get('vectordb')
    if vectordb:
        prompt = get_prompt(question, vectordb)
        
        response = client.chat(model='mistral:instruct', messages=[
          {
            'role': 'user',
            'content': prompt,
          },
        ])

        await context.bot.send_message(chat_id=update.effective_chat.id, text=escape_markdown(response['message']['content']))
    else:
        await update.message.reply_text('No processed documents found. Please send PDF documents first.')

In [None]:
def main():
    """Main function to run the bot"""
 
    application = ApplicationBuilder().token(API_TOKEN).build()

    # Register command and message handlers
    
    application.add_handler(CommandHandler("start", start))
    application.add_handler(MessageHandler(filters.Document.ALL, document_handler))
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, question_handler))

    # Start the bot
    application.run_polling()
   # application.idle()

if __name__ == '__main__':
    main()

2024-04-24 14:00:44,622 - httpx - INFO - HTTP Request: POST https://api.telegram.org/bot6831169398:AAHv0s4tjGtkPWOs0hBUIztzGOkneP3IiMI/getMe "HTTP/1.1 200 OK"
2024-04-24 14:00:44,733 - httpx - INFO - HTTP Request: POST https://api.telegram.org/bot6831169398:AAHv0s4tjGtkPWOs0hBUIztzGOkneP3IiMI/deleteWebhook "HTTP/1.1 200 OK"
2024-04-24 14:00:44,735 - telegram.ext.Application - INFO - Application started
2024-04-24 14:00:55,087 - httpx - INFO - HTTP Request: POST https://api.telegram.org/bot6831169398:AAHv0s4tjGtkPWOs0hBUIztzGOkneP3IiMI/getUpdates "HTTP/1.1 200 OK"
2024-04-24 14:00:55,876 - httpx - INFO - HTTP Request: POST https://api.telegram.org/bot6831169398:AAHv0s4tjGtkPWOs0hBUIztzGOkneP3IiMI/getUpdates "HTTP/1.1 200 OK"
2024-04-24 14:00:57,390 - httpx - INFO - HTTP Request: POST https://api.telegram.org/bot6831169398:AAHv0s4tjGtkPWOs0hBUIztzGOkneP3IiMI/getFile "HTTP/1.1 200 OK"
2024-04-24 14:00:57,610 - httpx - INFO - HTTP Request: GET https://api.telegram.org/file/bot6831169398%3A