In [None]:
%pip install nltk transformers torch annoy seaborn matplotlib scikit-learn PyPDF2 plotly

In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import word_tokenize, sent_tokenize
from multiprocessing import Pool
from huggingface_hub import snapshot_download
from tqdm import tqdm
from collections import defaultdict
import torch
from torch.nn import DataParallel
from paper_processing_for_embeddings import preprocess_and_read, read_pdf

# Fetch the NLTK resources needed for tokenization.
nltk.download('punkt')
# NLTK 3.9 and later load the sentence-tokenizer data from 'punkt_tab' instead of
# the pickled 'punkt' models, so sent_tokenize raises LookupError without it.
nltk.download('punkt_tab')
# NOTE(review): 'stopwords' is not referenced in the visible cells; presumably it is
# needed by paper_processing_for_embeddings — confirm before removing.
nltk.download('stopwords')


In [None]:

# Fetch only the files matching `papers/*` from the dataset repository into
# ./PapersDirectory, passing local_dir_use_symlinks=False to ask for real files
# rather than symlinks.
snapshot_download(
    repo_id='PromptSystematicReview/Prompt_Systematic_Review_Dataset',
    repo_type='dataset',
    allow_patterns=['papers/*'],
    local_dir='./PapersDirectory',
    local_dir_use_symlinks=False,
)


In [5]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained uncased BERT tokenizer and encoder. The encoder is
# configured with output_hidden_states=True.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# With more than one visible GPU, wrap the encoder in DataParallel first;
# in both cases the (possibly wrapped) model is then moved to `device`.
if torch.cuda.device_count() > 1:
    model = DataParallel(model).to(device)
else:
    model = model.to(device)

In [6]:
def process_pdfs_in_parallel(folder_path, n, num_workers=8):
    """Run `preprocess_and_read` over the first `n` PDFs of a folder in a worker pool.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose names
            end in '.pdf'.
        n: Upper bound on the number of PDFs to process. It is applied as the
            slice `[:n]` to the name-sorted file list, so `None` selects all.
        num_workers: Number of worker processes in the pool.

    Returns:
        A list holding one `preprocess_and_read` result per selected file, in
        the order of the sorted file paths. Empty when no file is selected.
    """
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # `[:n]` subset and the result order reproducible between runs.
    # The isfile check keeps the selection consistent with
    # count_pdfs_in_directory, which ignores directories named '*.pdf'.
    pdf_paths = []
    for name in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, name)
        if name.endswith('.pdf') and os.path.isfile(path):
            pdf_paths.append(path)

    processed_files = pdf_paths[:n]
    if not processed_files:
        # Nothing to do: skip spawning worker processes altogether.
        return []

    with Pool(num_workers) as p:
        # imap preserves input order, so results line up with processed_files.
        results = list(tqdm(p.imap(preprocess_and_read, processed_files), total=len(processed_files)))

    return results

def count_pdfs_in_directory(directory_path):
    """Return how many regular files directly inside `directory_path` have names ending in '.pdf'."""
    pdf_count = 0
    for entry_name in os.listdir(directory_path):
        # The suffix match is case-sensitive: only a lowercase '.pdf' ending counts.
        if not entry_name.endswith('.pdf'):
            continue
        # Skip anything that is not a file (e.g. a directory named 'x.pdf').
        if os.path.isfile(os.path.join(directory_path, entry_name)):
            pdf_count += 1
    return pdf_count

In [7]:
# Embedding Functions
def embed_text_batch(text_list, batch_size=25):
    """Embed each text as the mean of its last-layer token vectors.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        text_list: Sequence of strings to embed.
        batch_size: Number of texts sent through the model per forward pass;
            must be at least 1.

    Returns:
        A list with one numpy vector per input text, in input order. An empty
        input yields an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i + batch_size]
        # Texts longer than 512 tokens are truncated; shorter ones are padded
        # to the longest text in the batch.
        inputs = tokenizer(batch_texts, padding=True, return_tensors='pt', truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            token_states = outputs.last_hidden_state
            # Average over real tokens only. A plain mean over the sequence axis
            # would also average the padding positions, making a text's vector
            # depend on how long the other texts in its batch happen to be.
            mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            pooled = (token_states * mask).sum(dim=1) / token_counts
        all_embeddings.extend(pooled.cpu().numpy())
    return all_embeddings

# Main Processing Function
def create_embeddings(folder_path, embedding_type='word', num_papers=None, num_workers=8):
    """Preprocess the PDFs in a folder and embed their words or sentences.

    Args:
        folder_path: Directory containing the PDF files.
        embedding_type: 'word' to embed each distinct word of a paper, or
            'sentence' to embed the sentences `sent_tokenize` finds in the
            space-joined words.
        num_papers: Number of PDFs to process. `None` uses the count reported
            by `count_pdfs_in_directory`.
        num_workers: Worker-process count forwarded to
            `process_pdfs_in_parallel`.

    Returns:
        A dict mapping each text unit to `{'embedding': ..., 'file': ...}`.
        Keys are the unit strings themselves, so a unit that occurs in several
        papers keeps only the entry of the paper processed last.

    Raises:
        ValueError: If `embedding_type` is neither 'word' nor 'sentence'.
    """
    # Validate before the expensive PDF processing; an unknown type would
    # otherwise surface only later as an unbound `text_units` variable.
    if embedding_type not in ('word', 'sentence'):
        raise ValueError(
            f"embedding_type must be 'word' or 'sentence', got {embedding_type!r}"
        )

    num_papers_to_process = count_pdfs_in_directory(folder_path) if num_papers is None else num_papers
    preprocessed_data = process_pdfs_in_parallel(folder_path, num_papers_to_process, num_workers)

    embeddings_dict = {}
    # assumes every preprocess_and_read result is a (words, file_path) pair — TODO confirm
    for words, file_path in preprocessed_data:
        if embedding_type == 'word':
            # De-duplicate while keeping first-seen order; set() iteration order
            # for strings varies between interpreter runs.
            text_units = list(dict.fromkeys(words))
        else:
            text_units = sent_tokenize(' '.join(words))

        embeddings = embed_text_batch(text_units)
        for unit, embedding in zip(text_units, embeddings):
            embeddings_dict[unit] = {'embedding': embedding, 'file': file_path}

    return embeddings_dict

In [None]:
# Folder that holds the downloaded PDF papers.
papers_path = './PapersDirectory/papers'

# NOTE(review): neither `batch_size` nor `num_papers_to_process` is passed to the
# create_embeddings call below. That call runs with its default num_papers, and
# embed_text_batch is invoked inside it with its own default batch size.
batch_size = 40
num_papers_to_process = count_pdfs_in_directory(papers_path)

# Build sentence-level embeddings for the papers in the folder.
embeddings_dict = create_embeddings(folder_path=papers_path, embedding_type='sentence')

In [9]:
# Persist the embeddings dictionary to disk with pickle.
with open('sentence_embeddings.pkl', 'wb') as pickle_file:
    pickle.dump(embeddings_dict, pickle_file)
