In [None]:
# Install third-party dependencies into the active kernel's environment
# (%pip targets the running kernel, unlike a plain shell `pip`).
%pip install nltk transformers torch annoy seaborn matplotlib scikit-learn PyPDF2 plotly

In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import PyPDF2
import os
import pickle
from tqdm import tqdm
from huggingface_hub import snapshot_download
import os
import numpy as np
import PyPDF2



from collections import defaultdict
from tqdm import tqdm
from multiprocessing import Pool
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torch.nn import DataParallel
import torch
from paper_processing_for_embeddings import preprocess_and_read 

# Fetch the NLTK 'punkt' and 'stopwords' resources (presumably needed by the
# word_tokenize / stopwords imports above -- confirm where they are actually used).
nltk.download('punkt')
nltk.download('stopwords')


In [17]:
# Run on a CUDA device when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
# output_hidden_states=True makes the forward pass expose the hidden states,
# which embed_words_batch reads via outputs.hidden_states.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# With two or more visible GPUs, wrap the model in DataParallel first,
# then move whichever object we ended up with onto the chosen device.
if torch.cuda.device_count() >= 2:
    model = DataParallel(model)
model = model.to(device)

In [18]:
def process_pdfs_in_parallel(folder_path, n=None, num_workers=8):
    """Run ``preprocess_and_read`` over the PDFs in a folder using a process pool.

    Only regular files whose name ends in ``.pdf`` are considered (the same
    filter ``count_pdfs_in_directory`` applies). Paths are sorted before the
    first ``n`` are taken, because ``os.listdir`` returns entries in arbitrary
    order and an unsorted slice would select a different subset from run to run.

    Args:
        folder_path: Directory to scan (not recursive).
        n: Maximum number of PDFs to process; ``None`` processes all of them.
        num_workers: Number of worker processes handed to ``Pool``.

    Returns:
        A list with one ``preprocess_and_read`` result per selected file, in
        the same (sorted) order as the files. Empty if no PDF was selected.
    """
    pdf_paths = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith('.pdf') and os.path.isfile(os.path.join(folder_path, name))
    )
    selected_paths = pdf_paths if n is None else pdf_paths[:n]

    # Nothing to do: avoid spawning worker processes for an empty job list.
    if not selected_paths:
        return []

    with Pool(num_workers) as pool:
        # imap yields results in input order, so tqdm can report progress
        # while the output stays aligned with selected_paths.
        return list(tqdm(pool.imap(preprocess_and_read, selected_paths),
                         total=len(selected_paths)))


In [1]:
def embed_words_batch(words):
    """Embed a batch of words with the notebook-level ``tokenizer`` and ``model``.

    Every word is tokenized as its own sequence (padded to the longest sequence
    in the batch, truncated by the tokenizer when necessary). The embedding of
    a word is the last hidden layer's vector at sequence position 0.

    Args:
        words: Sequence of strings to embed in one forward pass.

    Returns:
        A NumPy array with one row per input word, in input order.
    """
    inputs = tokenizer(words, padding=True, return_tensors='pt', truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the appropriate device
    # Inference only: disable autograd so no computation graph is built and
    # held in memory for each batch.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.hidden_states[-1][:, 0, :].cpu().numpy()

def create_embeddings_from_preprocessed_data(preprocessed_data, batch_size):
    """Build a ``word -> {'embedding', 'file'}`` mapping for all preprocessed papers.

    Each distinct word is embedded once, the first time it is encountered.
    ``embed_words_batch`` embeds every word as its own sequence, so re-embedding
    a word for each later file would only repeat the same work. When a word
    shows up again in a later file, only its ``'file'`` entry is updated, so the
    mapping still records the last file that contained the word.

    Args:
        preprocessed_data: Iterable of ``(words, file_path)`` pairs, where
            ``words`` is an iterable of hashable tokens.
        batch_size: Number of words sent to ``embed_words_batch`` per call;
            must be at least 1.

    Returns:
        Dict keyed by word. Keys appear in first-occurrence order, which makes
        the result and the batch composition reproducible between runs.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative value would
            otherwise silently produce an empty result).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    all_embeddings = {}

    for words, file_path in preprocessed_data:
        # dict.fromkeys de-duplicates while keeping first-occurrence order
        # (a set would iterate in a run-dependent order for strings).
        unique_words = list(dict.fromkeys(words))

        # Split into words that still need an embedding and words we already
        # have; the latter only get their source file refreshed.
        pending_words = []
        for word in unique_words:
            known_entry = all_embeddings.get(word)
            if known_entry is None:
                pending_words.append(word)
            else:
                known_entry['file'] = file_path

        for start in range(0, len(pending_words), batch_size):
            batch_words = pending_words[start:start + batch_size]
            batch_embeddings = embed_words_batch(batch_words)

            for word, embedding in zip(batch_words, batch_embeddings):
                all_embeddings[word] = {
                    'embedding': embedding,
                    'file': file_path
                }

    return all_embeddings


In [None]:

# Download only the files matching 'papers/*' from the dataset repository into
# ./PapersDirectory (the PDFs are then read from ./PapersDirectory/papers below).
# NOTE(review): local_dir_use_symlinks=False presumably stores real files rather
# than symlinks -- confirm it is still honoured by the installed huggingface_hub.
snapshot_download(repo_id='PromptSystematicReview/Prompt_Systematic_Review_Dataset', 
                  repo_type='dataset', 
                  local_dir='./PapersDirectory', 
                  allow_patterns=['papers/*'], 
                  local_dir_use_symlinks=False)


In [None]:
def count_pdfs_in_directory(directory_path):
    """Return how many regular files directly inside ``directory_path`` end in ``.pdf``.

    The suffix check is case-sensitive and sub-directories are not descended into.
    """
    pdf_count = 0
    for entry_name in os.listdir(directory_path):
        if not entry_name.endswith('.pdf'):
            continue
        # Skip anything that merely looks like a PDF by name (e.g. a directory).
        if os.path.isfile(os.path.join(directory_path, entry_name)):
            pdf_count += 1
    return pdf_count

# Directory the dataset download above places the paper PDFs in.
papers_path = './PapersDirectory/papers'
# Number of words sent to the embedding model per forward pass.
batch_size = 25

# Count the PDFs on disk so that every available paper is processed.
num_papers_to_process = count_pdfs_in_directory(papers_path)

# Preprocess the PDFs in parallel, then embed the words returned for each paper.
# Each preprocessing result is consumed downstream as a (words, file_path) pair.
preprocessed_data = process_pdfs_in_parallel(papers_path, num_papers_to_process)
embeddings_dict = create_embeddings_from_preprocessed_data(preprocessed_data, batch_size)


In [23]:
# Persist the word -> {'embedding', 'file'} mapping built above to
# word_embeddings.pkl in the current working directory.
with open('word_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_dict, f)
