In [None]:
# Install the third-party packages for this notebook into the kernel's environment.
# NOTE(review): no versions are pinned, so the installed releases depend on when this cell is run.
%pip install nltk transformers torch annoy seaborn matplotlib scikit-learn PyPDF2 plotly


In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
from annoy import AnnoyIndex
import PyPDF2
import os
import pickle
from tqdm import tqdm


# Fetch the NLTK resources the notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer data behind word_tokenize. Recent NLTK
#     releases look up 'punkt_tab' rather than 'punkt', so both are requested
#     to keep tokenization working whichever NLTK version pip resolved.
#   - 'stopwords': the English stop-word list used when filtering tokens.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [None]:
from huggingface_hub import snapshot_download

# Download only the files matching 'papers/*' from the dataset repository
# into ./PapersDirectory, with local-dir symlinks turned off.
snapshot_download(
    repo_type='dataset',
    repo_id='PromptSystematicReview/Prompt_Systematic_Review_Dataset',
    allow_patterns=['papers/*'],
    local_dir='./PapersDirectory',
    local_dir_use_symlinks=False,
)


In [39]:
#LOADING THE FILE TEXT IN

def read_pdf(file_path):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of all pages joined by single spaces. Pages for which
        ``extract_text()`` returns ``None`` are skipped.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Call extract_text() exactly once per page so the (potentially slow)
        # extraction is not repeated just to perform the None check.
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        text = " ".join(page_text for page_text in page_texts if page_text is not None)
    return text





In [40]:
#PREPROCESSING THE TEXT

# Build the English stop-word set once; preprocess_text reuses it on every call.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize ``text`` and drop English stop words.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of the tokens produced by ``word_tokenize``, in their original
        order and casing, excluding every token whose lowercase form is in
        ``stop_words``. No other filtering is applied.
    """
    # Rely on the module-level set rather than re-reading the stop-word
    # corpus (and shadowing the global) on each call.
    return [word for word in word_tokenize(text) if word.lower() not in stop_words]


In [41]:
from collections import defaultdict
import torch
from torch.nn import DataParallel
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT tokenizer and encoder. Hidden states are requested
# because the embedding helper reads them from the model output.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# DataParallel distributes each batch across all available GPUs. The model is
# placed on CUDA when it is available and left on the CPU otherwise, so this
# cell still loads on a machine without a GPU.
model = DataParallel(model).to('cuda' if torch.cuda.is_available() else 'cpu')


# Embedding function that processes a whole batch of words per forward pass
def embed_words_batch(words):
    """Embed a batch of words with the module-level ``tokenizer`` and ``model``.

    Each word is tokenized as its own (padded, truncated) sequence. The vector
    returned for a word is taken from the last entry of the model's
    ``hidden_states`` at position 0 of that sequence.

    Args:
        words: List of strings to embed.

    Returns:
        A NumPy array with one row per input word, in the same order.
    """
    inputs = tokenizer(words, padding=True, return_tensors='pt', truncation=True)
    # Send the inputs to whichever device holds the model instead of
    # hard-coding CUDA, so inputs and weights always live on the same device.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Inference only: no_grad stops autograd from recording the forward pass,
    # which would otherwise hold on to activation memory for every batch.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy()

# Creating embeddings with caching and batch processing
def create_embedding_dictionary_batch(file_path, batch_size=10):
    """Embed every distinct preprocessed word of one PDF.

    The PDF is read with ``read_pdf``, tokenized with ``preprocess_text``, and
    its distinct words are embedded in batches via ``embed_words_batch``.

    Args:
        file_path: Path of the PDF to process.
        batch_size: Number of words passed to ``embed_words_batch`` per call;
            must be at least 1.

    Returns:
        A dict mapping each distinct word to
        ``{'embedding': <vector>, 'file': file_path}``, ordered by the word's
        first appearance in the document.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    words = preprocess_text(read_pdf(file_path))
    # dict.fromkeys de-duplicates while keeping first-appearance order, which
    # makes the batches (and the result order) reproducible between runs;
    # iterating a set of strings gives no such guarantee.
    unique_words = list(dict.fromkeys(words))

    embeddings_dict = {}
    for start in range(0, len(unique_words), batch_size):
        batch_words = unique_words[start:start + batch_size]
        batch_embeddings = embed_words_batch(batch_words)
        # Each word is embedded exactly once, so the result can be filled in
        # directly without an intermediate cache or a pass over duplicates.
        for word, embedding in zip(batch_words, batch_embeddings):
            embeddings_dict[word] = {
                'embedding': embedding,
                'file': file_path
            }

    return embeddings_dict


def process_multiple_pdfs(folder_path, n, batch=10):
    """Embed the words of the first ``n`` PDFs in a folder, one file at a time.

    Args:
        folder_path: Directory to scan for files whose name ends in ``.pdf``.
        n: Maximum number of PDFs to process, taken from the sorted file list.
        batch: Batch size forwarded to ``create_embedding_dictionary_batch``.

    Returns:
        A single dict merging the per-file results. When the same word occurs
        in several files, the entry from the file processed last is kept.
        Files that raise during processing are reported and skipped.
    """
    # os.listdir yields names in arbitrary order; sorting makes "the first n
    # files" the same selection on every run and every machine.
    all_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pdf'))
    processed_files = all_files[:n]  # Process only the first n files

    all_embeddings = {}
    for file in tqdm(processed_files, desc="Processing PDFs"):
        file_path = os.path.join(folder_path, file)
        try:
            embeddings_dict = create_embedding_dictionary_batch(file_path, batch)
        except Exception as e:
            # Best-effort run: one bad file must not abort the rest, but the
            # reason is reported because the failure may come from reading,
            # tokenizing, or embedding.
            print(f"Error processing PDF: {file} ({e}). Skipping this file.")
            continue
        all_embeddings.update(embeddings_dict)

    return all_embeddings




In [None]:

from multiprocessing import Pool
import os

def preprocess_and_read(file_path):
    """Read one PDF and tokenize it, never raising.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A ``(words, file_path)`` tuple. ``words`` is the output of
        ``preprocess_text`` on the file's text, or an empty list when reading
        or preprocessing raised (the error is printed).
    """
    try:
        tokens = preprocess_text(read_pdf(file_path))
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        tokens = []
    return tokens, file_path

def process_pdfs_in_parallel(folder_path, n, num_workers=8):
    """Read and tokenize the first ``n`` PDFs of a folder in worker processes.

    Args:
        folder_path: Directory to scan for files whose name ends in ``.pdf``.
        n: Maximum number of PDFs to process, taken from the sorted file list.
        num_workers: Upper bound on the number of worker processes. ``None``
            is passed through to ``Pool`` unchanged.

    Returns:
        A list of ``(words, file_path)`` tuples, one per processed file, in
        the same order as the sorted file list. Empty when no PDF is selected.
    """
    # os.listdir yields names in arbitrary order; sorting makes "the first n
    # files" the same selection on every run.
    all_files = sorted(
        os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')
    )
    processed_files = all_files[:n]
    if not processed_files:
        # Nothing to do, so avoid starting a pool at all.
        return []

    # Starting more processes than there are files only adds start-up cost.
    worker_count = num_workers if num_workers is None else min(num_workers, len(processed_files))
    with Pool(worker_count) as p:
        # imap preserves input order; the list() drains it while the pool is still open.
        results = list(tqdm(p.imap(preprocess_and_read, processed_files), total=len(processed_files)))

    return results


def create_embeddings_from_preprocessed_data(preprocessed_data, batch_size=50):
    """Embed the distinct words of each preprocessed file in batches.

    Args:
        preprocessed_data: Iterable of ``(words, file_path)`` tuples. An empty
            ``words`` list contributes nothing.
        batch_size: Number of words passed to ``embed_words_batch`` per call;
            must be at least 1.

    Returns:
        A dict mapping each word to ``{'embedding': <vector>, 'file': file_path}``.
        When a word occurs in several files, the entry from the file that
        comes last in ``preprocessed_data`` is kept.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_embeddings = {}

    for words, file_path in preprocessed_data:
        # dict.fromkeys de-duplicates while keeping first-appearance order, so
        # the batches are reproducible between runs (set order is not).
        unique_words = list(dict.fromkeys(words))

        for start in range(0, len(unique_words), batch_size):
            batch_words = unique_words[start:start + batch_size]
            batch_embeddings = embed_words_batch(batch_words)

            for word, embedding in zip(batch_words, batch_embeddings):
                all_embeddings[word] = {
                    'embedding': embedding,
                    'file': file_path
                }

    return all_embeddings

In [None]:
# Folder holding the PDFs: the download cell's local_dir plus its 'papers/' sub-folder.
papers_path = './PapersDirectory/papers'
num_papers_to_process = 1  # How many PDFs from the folder to process; raise this for a fuller run

# Step 1: Parallel preprocessing — read and tokenize each PDF in worker
# processes, producing a list of (words, file_path) tuples.
preprocessed_data = process_pdfs_in_parallel(papers_path, num_papers_to_process)

# Step 2: Batch embedding — embed each file's distinct words, producing
# {word: {'embedding': ..., 'file': ...}}.
embeddings_dict = create_embeddings_from_preprocessed_data(preprocessed_data)

In [44]:
# Persist embeddings_dict (built by the embedding step) to vector_embeddings.pkl
# in the current working directory.
# NOTE: unpickling can execute arbitrary code, so only load this file back from a trusted source.
with open('vector_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_dict, f)
