# Disease

In [None]:
import json
import pandas as pd
import nltk
import os
import torch
from transformers import BertTokenizer, BertModel, BertConfig
from tqdm import tqdm

# Fetch NLTK's 'punkt' resource when this cell runs; nltk.word_tokenize is
# used below. NOTE(review): recent NLTK releases may need 'punkt_tab' instead — confirm.
nltk.download('punkt')

# Hugging Face identifier of the PubMedBERT checkpoint used for every embedding.
_PUBMEDBERT_NAME = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

# Filled on first use so repeated process_files calls share one loaded model
# instead of reloading the checkpoint for every input file.
_pubmedbert_cache = {}


def _load_pubmedbert():
    """Return the (tokenizer, model) pair, loading it only on the first call."""
    if 'pair' not in _pubmedbert_cache:
        config = BertConfig.from_pretrained(_PUBMEDBERT_NAME, output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained(_PUBMEDBERT_NAME)
        model = BertModel.from_pretrained(_PUBMEDBERT_NAME, config=config)
        model.eval()
        _pubmedbert_cache['pair'] = (tokenizer, model)
    return _pubmedbert_cache['pair']


def process_files(json_file_path, csv_file_path, output_directory):
    """Write PubMedBERT embeddings for the disease annotations of one file pair.

    An annotation phrase from the CSV is kept when at least one of its NLTK
    tokens also occurs in the article texts of the JSON file. Every distinct
    kept phrase is embedded and written to five text files in
    ``output_directory``: one holding the sum of the last four hidden layers
    and one per individual layer (named ``Layer_-4`` to ``Layer_-1``).

    Args:
        json_file_path: Path to a JSON file containing a list of entries; the
            text is taken from ``entry["ARTICLE"]["TEXT"]`` when present.
        csv_file_path: Path to a CSV file with an ``annotation_text`` column.
        output_directory: Directory that receives the output text files.
    """
    from contextlib import ExitStack

    # Load JSON data (explicit UTF-8 so the result does not depend on the platform default)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # Load disease annotations from CSV
    annotations_df = pd.read_csv(csv_file_path)

    # Convert non-string values in 'annotation_text' column to strings
    annotations_df['annotation_text'] = annotations_df['annotation_text'].astype(str)

    # Extract unique words from the JSON data
    all_words = set()
    for entry in json_data:
        text = entry.get("ARTICLE", {}).get("TEXT", "")
        all_words.update(nltk.word_tokenize(text))

    # Tokenize each entry in the 'annotation_text' column
    annotations_df['annotation_words'] = annotations_df['annotation_text'].apply(nltk.word_tokenize)

    # Keep annotations sharing at least one token with the article texts.
    # astype(bool) keeps the mask boolean even when the CSV has no rows, so it
    # is always applied as a row filter.
    keep_mask = annotations_df['annotation_words'].apply(
        lambda words: any(word in all_words for word in words)
    ).astype(bool)
    filtered_annotations = annotations_df[keep_mask]

    # Output names reuse the CSV base name with the annotation prefix stripped
    suffix = os.path.basename(csv_file_path).replace('disease_annotations', '').replace('.csv', '.txt')
    output_file_sum = os.path.join(output_directory, f"PubMedBERT_Disease_Sum{suffix}")
    output_file_last4 = [
        os.path.join(output_directory, f"PubMedBERT_Disease_Layer_{i}{suffix}")
        for i in range(-4, 0)
    ]

    tokenizer, model = _load_pubmedbert()

    # Sorted so the line order of the output files is reproducible between runs
    phrases = sorted(set(filtered_annotations['annotation_text']))

    # ExitStack closes all five files even if embedding a phrase raises
    with ExitStack() as stack:
        text_file_sum = stack.enter_context(open(output_file_sum, 'w', encoding='utf-8'))
        text_files_last4 = [
            stack.enter_context(open(file_name, 'w', encoding='utf-8'))
            for file_name in output_file_last4
        ]

        for phrase in tqdm(phrases, desc="Processing phrases"):
            # Tokenize and encode the phrase
            inputs = tokenizer(phrase, return_tensors='pt', truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)

            # Get the hidden states for the last 4 layers
            hidden_states = outputs.hidden_states[-4:]

            # Sum the last 4 layers, drop the leading dimension, then sum over dim 0.
            # NOTE(review): this vector is a sum over dim 0 while the per-layer
            # vectors below use a mean — confirm the asymmetry is intended.
            sum_embedding = torch.sum(torch.stack(hidden_states), dim=0).squeeze(0)
            sum_embedding = torch.sum(sum_embedding, dim=0).tolist()
            embedding_str_sum = ', '.join(map(str, sum_embedding))
            text_file_sum.write(f"Phrase:{phrase}, Embedding: [{embedding_str_sum}]\n")

            # Save each of the last 4 layers individually
            for text_file, hidden_state in zip(text_files_last4, hidden_states):
                layer_embedding = torch.mean(hidden_state.squeeze(0), dim=0).tolist()
                embedding_str_layer = ', '.join(map(str, layer_embedding))
                text_file.write(f"Phrase:{phrase}, Embedding: [{embedding_str_layer}]\n")

    print(f"Summation embeddings saved in {output_file_sum}")
    print(f"Individual layer embeddings saved in {output_directory}")


## Chemical

In [None]:
import json
import pandas as pd
import nltk
import os
import torch
from transformers import BertTokenizer, BertModel, BertConfig
from tqdm import tqdm

# Fetch NLTK's 'punkt' resource when this cell runs; nltk.word_tokenize is
# used below. NOTE(review): recent NLTK releases may need 'punkt_tab' instead — confirm.
nltk.download('punkt')

# Hugging Face identifier of the PubMedBERT checkpoint used for every embedding.
_PUBMEDBERT_NAME = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

# Filled on first use so repeated process_files calls share one loaded model
# instead of reloading the checkpoint for every input file.
_pubmedbert_cache = {}


def _load_pubmedbert():
    """Return the (tokenizer, model) pair, loading it only on the first call."""
    if 'pair' not in _pubmedbert_cache:
        config = BertConfig.from_pretrained(_PUBMEDBERT_NAME, output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained(_PUBMEDBERT_NAME)
        model = BertModel.from_pretrained(_PUBMEDBERT_NAME, config=config)
        model.eval()
        _pubmedbert_cache['pair'] = (tokenizer, model)
    return _pubmedbert_cache['pair']


def process_files(json_file_path, csv_file_path, output_directory):
    """Write PubMedBERT embeddings for the chemical annotations of one file pair.

    An annotation phrase from the CSV is kept when at least one of its NLTK
    tokens also occurs in the article texts of the JSON file. Every distinct
    kept phrase is embedded and written to five text files in
    ``output_directory``: one holding the sum of the last four hidden layers
    and one per individual layer (named ``Layer_-4`` to ``Layer_-1``).

    Args:
        json_file_path: Path to a JSON file containing a list of entries; the
            text is taken from ``entry["ARTICLE"]["TEXT"]`` when present.
        csv_file_path: Path to a CSV file with an ``annotation_text`` column.
        output_directory: Directory that receives the output text files.
    """
    from contextlib import ExitStack

    # Load JSON data (explicit UTF-8 so the result does not depend on the platform default)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # Load chemical annotations from CSV
    annotations_df = pd.read_csv(csv_file_path)

    # Convert non-string values in 'annotation_text' column to strings
    annotations_df['annotation_text'] = annotations_df['annotation_text'].astype(str)

    # Extract unique words from the JSON data
    all_words = set()
    for entry in json_data:
        text = entry.get("ARTICLE", {}).get("TEXT", "")
        all_words.update(nltk.word_tokenize(text))

    # Tokenize each entry in the 'annotation_text' column
    annotations_df['annotation_words'] = annotations_df['annotation_text'].apply(nltk.word_tokenize)

    # Keep annotations sharing at least one token with the article texts.
    # astype(bool) keeps the mask boolean even when the CSV has no rows, so it
    # is always applied as a row filter.
    keep_mask = annotations_df['annotation_words'].apply(
        lambda words: any(word in all_words for word in words)
    ).astype(bool)
    filtered_annotations = annotations_df[keep_mask]

    # Output names reuse the CSV base name with the annotation prefix stripped
    suffix = os.path.basename(csv_file_path).replace('chemical_annotations', '').replace('.csv', '.txt')
    output_file_sum = os.path.join(output_directory, f"PubMedBERT_Chemical_Sum{suffix}")
    output_file_last4 = [
        os.path.join(output_directory, f"PubMedBERT_Chemical_Layer_{i}{suffix}")
        for i in range(-4, 0)
    ]

    tokenizer, model = _load_pubmedbert()

    # Sorted so the line order of the output files is reproducible between runs
    phrases = sorted(set(filtered_annotations['annotation_text']))

    # ExitStack closes all five files even if embedding a phrase raises
    with ExitStack() as stack:
        text_file_sum = stack.enter_context(open(output_file_sum, 'w', encoding='utf-8'))
        text_files_last4 = [
            stack.enter_context(open(file_name, 'w', encoding='utf-8'))
            for file_name in output_file_last4
        ]

        for phrase in tqdm(phrases, desc="Processing phrases"):
            # Tokenize and encode the phrase
            inputs = tokenizer(phrase, return_tensors='pt', truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)

            # Get the hidden states for the last 4 layers
            hidden_states = outputs.hidden_states[-4:]

            # Sum the last 4 layers, drop the leading dimension, then sum over dim 0.
            # NOTE(review): this vector is a sum over dim 0 while the per-layer
            # vectors below use a mean — confirm the asymmetry is intended.
            sum_embedding = torch.sum(torch.stack(hidden_states), dim=0).squeeze(0)
            sum_embedding = torch.sum(sum_embedding, dim=0).tolist()
            embedding_str_sum = ', '.join(map(str, sum_embedding))
            text_file_sum.write(f"Phrase:{phrase}, Embedding: [{embedding_str_sum}]\n")

            # Save each of the last 4 layers individually
            for text_file, hidden_state in zip(text_files_last4, hidden_states):
                layer_embedding = torch.mean(hidden_state.squeeze(0), dim=0).tolist()
                embedding_str_layer = ', '.join(map(str, layer_embedding))
                text_file.write(f"Phrase:{phrase}, Embedding: [{embedding_str_layer}]\n")

    print(f"Summation embeddings saved in {output_file_sum}")
    print(f"Individual layer embeddings saved in {output_directory}")

# Input and output locations for the chemical embedding run
json_dir = "split_pubtator"
csv_dir = "csvfiles"
output_directory = "3gen_wordembeddings/PubMedBERT/Chemical"
os.makedirs(output_directory, exist_ok=True)

# Walk the JSON directory: every "*.json" entry is paired with its chemical
# annotations CSV, and the pair is processed only when that CSV exists.
for file_name in tqdm(os.listdir(json_dir), desc="Processing files"):
    if not file_name.endswith(".json"):
        continue

    # Identifier shared by the JSON file and its CSV counterpart
    stem = file_name.replace('proper_pubtator_', '').replace('.json', '')
    csv_file_name = f"chemical_annotations_{stem}.csv"
    json_file_path = os.path.join(json_dir, file_name)
    csv_file_path = os.path.join(csv_dir, csv_file_name)

    if not os.path.exists(csv_file_path):
        continue

    process_files(json_file_path, csv_file_path, output_directory)


## Gene

In [None]:
import json
import pandas as pd
import nltk
import os
import torch
from transformers import BertTokenizer, BertModel, BertConfig
from tqdm import tqdm

# Fetch NLTK's 'punkt' resource when this cell runs; nltk.word_tokenize is
# used below. NOTE(review): recent NLTK releases may need 'punkt_tab' instead — confirm.
nltk.download('punkt')

# Hugging Face identifier of the PubMedBERT checkpoint used for every embedding.
_PUBMEDBERT_NAME = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

# Filled on first use so repeated process_files calls share one loaded model
# instead of reloading the checkpoint for every input file.
_pubmedbert_cache = {}


def _load_pubmedbert():
    """Return the (tokenizer, model) pair, loading it only on the first call."""
    if 'pair' not in _pubmedbert_cache:
        config = BertConfig.from_pretrained(_PUBMEDBERT_NAME, output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained(_PUBMEDBERT_NAME)
        model = BertModel.from_pretrained(_PUBMEDBERT_NAME, config=config)
        model.eval()
        _pubmedbert_cache['pair'] = (tokenizer, model)
    return _pubmedbert_cache['pair']


def process_files(json_file_path, csv_file_path, output_directory):
    """Write PubMedBERT embeddings for the gene annotations of one file pair.

    An annotation phrase from the CSV is kept when at least one of its NLTK
    tokens also occurs in the article texts of the JSON file. Every distinct
    kept phrase is embedded and written to five text files in
    ``output_directory``: one holding the sum of the last four hidden layers
    and one per individual layer (named ``Layer_-4`` to ``Layer_-1``).

    Args:
        json_file_path: Path to a JSON file containing a list of entries; the
            text is taken from ``entry["ARTICLE"]["TEXT"]`` when present.
        csv_file_path: Path to a CSV file with an ``annotation_text`` column.
        output_directory: Directory that receives the output text files.
    """
    from contextlib import ExitStack

    # Load JSON data (explicit UTF-8 so the result does not depend on the platform default)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # Load gene annotations from CSV
    annotations_df = pd.read_csv(csv_file_path)

    # Convert non-string values in 'annotation_text' column to strings
    annotations_df['annotation_text'] = annotations_df['annotation_text'].astype(str)

    # Extract unique words from the JSON data
    all_words = set()
    for entry in json_data:
        text = entry.get("ARTICLE", {}).get("TEXT", "")
        all_words.update(nltk.word_tokenize(text))

    # Tokenize each entry in the 'annotation_text' column
    annotations_df['annotation_words'] = annotations_df['annotation_text'].apply(nltk.word_tokenize)

    # Keep annotations sharing at least one token with the article texts.
    # astype(bool) keeps the mask boolean even when the CSV has no rows, so it
    # is always applied as a row filter.
    keep_mask = annotations_df['annotation_words'].apply(
        lambda words: any(word in all_words for word in words)
    ).astype(bool)
    filtered_annotations = annotations_df[keep_mask]

    # Output names reuse the CSV base name with the annotation prefix stripped
    suffix = os.path.basename(csv_file_path).replace('gene_annotations', '').replace('.csv', '.txt')
    output_file_sum = os.path.join(output_directory, f"PubMedBERT_Gene_Sum{suffix}")
    output_file_last4 = [
        os.path.join(output_directory, f"PubMedBERT_Gene_Layer_{i}{suffix}")
        for i in range(-4, 0)
    ]

    tokenizer, model = _load_pubmedbert()

    # Sorted so the line order of the output files is reproducible between runs
    phrases = sorted(set(filtered_annotations['annotation_text']))

    # ExitStack closes all five files even if embedding a phrase raises
    with ExitStack() as stack:
        text_file_sum = stack.enter_context(open(output_file_sum, 'w', encoding='utf-8'))
        text_files_last4 = [
            stack.enter_context(open(file_name, 'w', encoding='utf-8'))
            for file_name in output_file_last4
        ]

        for phrase in tqdm(phrases, desc="Processing phrases"):
            # Tokenize and encode the phrase
            inputs = tokenizer(phrase, return_tensors='pt', truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)

            # Get the hidden states for the last 4 layers
            hidden_states = outputs.hidden_states[-4:]

            # Sum the last 4 layers, drop the leading dimension, then sum over dim 0.
            # NOTE(review): this vector is a sum over dim 0 while the per-layer
            # vectors below use a mean — confirm the asymmetry is intended.
            sum_embedding = torch.sum(torch.stack(hidden_states), dim=0).squeeze(0)
            sum_embedding = torch.sum(sum_embedding, dim=0).tolist()
            embedding_str_sum = ', '.join(map(str, sum_embedding))
            text_file_sum.write(f"Phrase:{phrase}, Embedding: [{embedding_str_sum}]\n")

            # Save each of the last 4 layers individually
            for text_file, hidden_state in zip(text_files_last4, hidden_states):
                layer_embedding = torch.mean(hidden_state.squeeze(0), dim=0).tolist()
                embedding_str_layer = ', '.join(map(str, layer_embedding))
                text_file.write(f"Phrase:{phrase}, Embedding: [{embedding_str_layer}]\n")

    print(f"Summation embeddings saved in {output_file_sum}")
    print(f"Individual layer embeddings saved in {output_directory}")

# Input and output locations for the gene embedding run
json_dir = "split_pubtator"
csv_dir = "csvfiles"
output_directory = "3gen_wordembeddings/PubMedBERT/Gene"
os.makedirs(output_directory, exist_ok=True)

# Walk the JSON directory: every "*.json" entry is paired with its gene
# annotations CSV, and the pair is processed only when that CSV exists.
for file_name in tqdm(os.listdir(json_dir), desc="Processing files"):
    if not file_name.endswith(".json"):
        continue

    # Identifier shared by the JSON file and its CSV counterpart
    stem = file_name.replace('proper_pubtator_', '').replace('.json', '')
    csv_file_name = f"gene_annotations_{stem}.csv"
    json_file_path = os.path.join(json_dir, file_name)
    csv_file_path = os.path.join(csv_dir, csv_file_name)

    if not os.path.exists(csv_file_path):
        continue

    process_files(json_file_path, csv_file_path, output_directory)
