In [9]:
# !rm -rf ~/.cache/huggingface

In [None]:
import fitz
import re
import torch
import faiss
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import os
from huggingface_hub import login
from dotenv import load_dotenv
from omegaconf import OmegaConf
import yaml
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)
file_path = config.get('FILE_PATH', '')
load_dotenv()

login(token=os.getenv("TOKEN"))

class PDFProcessor:
    def __init__(self, pdf_path, model_embedd, tokenizer_embedd, model_name, tokenizer_name):
        self.pdf_path = pdf_path
        self.tokenizer_embedd = AutoTokenizer.from_pretrained(tokenizer_embedd)
        self.model_embedd = AutoModel.from_pretrained(model_embedd)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.text_chunks = []
        self.index = None

    def extract_text_from_pdf(self):
        text = ""
        doc = fitz.open(self.pdf_path)
        for page in doc:
            text += page.get_text()
        return text

    def clean_text(self, text):
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text_by_sentence(self, text, max_chunk_size=500, overlap_size=50):
        sentences = re.split(r'(?<=[.!?]) +', text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length <= max_chunk_size:
                current_chunk.append(sentence)
                current_length += sentence_length
            else:
                chunks.append(' '.join(current_chunk))
                overlap_text = ' '.join(current_chunk)[-overlap_size:]
                current_chunk = [overlap_text, sentence]
                current_length = len(overlap_text) + sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def embed_text(self, text):
        inputs = self.tokenizer_embedd(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model_embedd(**inputs).last_hidden_state.mean(dim=1)
        return embeddings

    def build_faiss_index(self):
        chunk_embeddings = []
        for chunk in self.text_chunks:
            chunk_embedding = self.embed_text(chunk)
            chunk_embeddings.append(chunk_embedding)

        embedding_matrix = torch.vstack(chunk_embeddings).numpy()
        self.index = faiss.IndexFlatL2(embedding_matrix.shape[1])
        self.index.add(embedding_matrix)

    def process_pdf(self):
        raw_text = self.extract_text_from_pdf()
        cleaned_text = self.clean_text(raw_text)
        self.text_chunks = self.chunk_text_by_sentence(cleaned_text)
        self.build_faiss_index()

    def query_database(self, query_text, top_k=1):
        query_embedding = self.embed_text(query_text).numpy()
        distances, indices = self.index.search(query_embedding, top_k)
        return indices[0]

    def generate_response(self, retrieved_chunks):
        input_text = " ".join(retrieved_chunks)
        inputs = self.tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(device)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=100,
                do_sample=True,  
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                pad_token_id=self.tokenizer.eos_token_id,
                num_beams=5,
                repetition_penalty=2.0,
                early_stopping=True,
                no_repeat_ngram_size=2
            )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

    def retrieve_and_generate_response(self, query_text):
        retrieved_indices = self.query_database(query_text)
        retrieved_chunks = [self.text_chunks[i] for i in retrieved_indices]
        response = self.generate_response(retrieved_chunks)
        return response

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
pdf_path = file_path
processor = PDFProcessor(pdf_path, 'bert-base-multilingual-cased', 'bert-base-multilingual-cased', 'jtatman/orca-tau-4k-persian-alpaca-f32', 'jtatman/orca-tau-4k-persian-alpaca-f32')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/764 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [None]:
processor.process_pdf()
query = "چیست؟ Artificial Intelligence لطفاً به زبان فارسی توضیح دهید که هوش مصنوعی یا"
response = processor.retrieve_and_generate_response(query)
print(response)

هوش مصنوعی (AI یا Artificial Intelligence) شاخه ای از علوم کامپیوتر است که به طراحی و توسعه سیستم ها و برنامه هایی می پردازد که می توانند وظایف را با نوعی هوش و استدالل مشابه انسان انجام دهند. این سیستم ها با استفاده از الگوریتم ها و مدل های ریاضی، قادرند از داده ها یاد بگیرند، تصمیم بگیرند، و مسائل پیچیده را حل کنند. کاربردهای هوش مصنوعی: 1. پردازش زبان طبیعی (NLP): توانایی درک و تولید زبان انسانی توسط کامپیوتر.2. تشخیص گفتار: آزمایش دقیق و مقرون به صرفه برای نتایج خودکار.3. ماشین‌های ذخیره‌سازی حافظه: فرآیند مدیریت و پشتیبانی منابع حذفی جدید.4. تجزیه و التحلیل: عملکرد انعطاف‌پذیرتر و مؤثرتر بر اطلاعات.5. خدمات مشترک: ارائه محصولات و خدمات به اشتراک‌گذاری کسب‌وکارت.6.
