In [2]:
import os
import re
import requests
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
import faiss
import torch
from transformers import BertTokenizer, BertModel
from gpt4all import GPT4All

In [3]:
def download_pdf_from_url(url, save_path, timeout=30):
    """Download the resource at ``url`` and write its raw bytes to ``save_path``.

    Args:
        url: Address fetched with an HTTP GET request.
        save_path: Destination file path; opened in binary write mode, so an
            existing file at that path is overwritten.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving the server's error
    # page to disk as if it were the requested PDF.
    response.raise_for_status()
    with open(save_path, 'wb') as out_file:
        out_file.write(response.content)

In [4]:
def extract_text_from_pdf(pdf_path, delete_after=True):
    """Extract the text of a PDF file using pdfminer.

    Args:
        pdf_path: Path of the PDF file to read.
        delete_after: When true (the default), ``pdf_path`` is removed from
            disk once extraction has succeeded. Pass ``False`` to keep the
            file. The file is never removed if extraction raises.

    Returns:
        The extracted text as a string, or ``None`` when the extraction
        produced an empty string.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())

    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

        # Read the buffer before it is closed in the ``finally`` clause.
        text = fake_file_handle.getvalue()
    finally:
        # Release the converter and the in-memory buffer even when opening
        # or parsing the PDF fails.
        converter.close()
        fake_file_handle.close()

    if delete_after:
        os.remove(pdf_path)

    return text if text else None

In [5]:
def preprocess_text(text):
    """Normalise whitespace in ``text`` and split it into sentences.

    Every run of whitespace is collapsed to a single space and the ends are
    trimmed. The result is then split wherever a '.', '!' or '?' is followed
    by one or more spaces; the punctuation stays attached to the sentence it
    ends.

    Args:
        text: The raw string to clean up and split.

    Returns:
        A list of sentence strings, with blank pieces dropped.
    """
    boundary = re.compile(r'(?<=[.!?]) +')

    flattened = re.sub(r'\s+', ' ', text)
    flattened = flattened.strip()

    pieces = boundary.split(flattened)
    # Discard pieces that are empty or whitespace-only (e.g. for empty input).
    return list(filter(lambda piece: piece.strip(), pieces))

In [6]:
def embed_sentences(sentences, use_gpu=False, batch_size=32):
    """Embed sentences with ``bert-base-uncased``.

    Each sentence is represented by the final-layer hidden state at
    position 0 of its token sequence (the slot of the leading special token
    added by the tokenizer).

    Args:
        sentences: Iterable of strings to embed.
        use_gpu: Run on CUDA when requested and available; otherwise CPU.
        batch_size: Number of sentences sent through the model per forward
            pass. Batching bounds peak memory for long documents.

    Returns:
        A tensor of shape ``(len(sentences), hidden_size)`` on the selected
        device. An empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased', resume_download=False)

    model.to(device)
    # Inference only: make sure dropout is disabled.
    model.eval()

    sentences = list(sentences)
    if not sentences:
        # Nothing to encode; return an empty matrix of the right width
        # instead of failing on an empty batch.
        return torch.empty((0, model.config.hidden_size), device=device)

    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            # Let the tokenizer pad the batch so it also returns the matching
            # attention mask. Without the mask the model attends to padding
            # tokens and a sentence's embedding changes with the length of
            # the longest sentence in its batch.
            encoded = tokenizer(
                sentences[start:start + batch_size],
                add_special_tokens=True,
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors='pt',
            ).to(device)

            outputs = model(**encoded)
            # Keep only the hidden state at position 0 of every sequence.
            batch_embeddings.append(outputs.last_hidden_state[:, 0, :])

    return torch.cat(batch_embeddings, dim=0)

In [7]:
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing the given embeddings.

    Args:
        embeddings: A torch tensor whose last dimension is the vector size;
            presumably shaped ``(num_vectors, dim)`` — confirm against the
            caller.

    Returns:
        A ``faiss.IndexFlatL2`` with every embedding added to it.
    """
    # Hand faiss a detached, CPU-resident, contiguous float32 array: sliced
    # tensors can be non-contiguous views, and ``.numpy()`` raises on tensors
    # that still require grad.
    vectors = embeddings.detach().cpu().contiguous().float().numpy()

    index = faiss.IndexFlatL2(vectors.shape[-1])
    index.add(vectors)
    return index