### Disease

In [None]:
import pandas as pd
import nltk
import os
import numpy as np

# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' resource rather than the older 'punkt' package,
# so both are requested here to cover old and new installations.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load GloVe embeddings from file
def load_glove_embeddings(file_path):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line is expected to look like ``token v1 v2 ... vN`` with
    single-space separators. The vector width N is inferred from the first
    non-blank line (assumed to hold a token without spaces), and each line is
    then split from the right. This keeps tokens that themselves contain
    spaces intact instead of failing on ``float()`` conversion, which is what
    a plain ``line.split()`` does for such lines.

    Args:
        file_path: Path to a UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict mapping each token (str) to a 1-D ``numpy.ndarray`` of floats.

    Raises:
        ValueError: If a line has too few fields for the inferred vector width
            or contains a vector component that is not a valid float.
    """
    embeddings = {}
    dim = None  # vector width, fixed by the first non-blank line
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # Split on the literal space only; str.split() without arguments
            # would also break on other whitespace characters inside a token.
            parts = line.rstrip(' \r\n').split(' ')
            if parts == ['']:
                continue  # blank line: nothing to parse
            if dim is None:
                dim = len(parts) - 1
            if dim < 1 or len(parts) <= dim:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected a token followed by "
                    f"{dim} vector components, found {len(parts)} field(s)"
                )
            # Everything left of the last `dim` fields belongs to the token.
            word = ' '.join(parts[:-dim])
            embeddings[word] = np.array([float(val) for val in parts[-dim:]])
    return embeddings

# Function to get GloVe embeddings for a given text
def get_glove_embeddings(text, glove_embeddings):
    """Return the mean GloVe vector of the tokens in ``text``.

    The text is tokenized with ``nltk.word_tokenize`` and each token is
    lower-cased before lookup; tokens missing from ``glove_embeddings`` are
    ignored.

    Args:
        text: Phrase to embed.
        glove_embeddings: Mapping of token -> numpy vector.

    Returns:
        The flattened element-wise mean of the vectors that were found, or an
        all-zero array shaped like the first vector in ``glove_embeddings``
        when no token is known.
    """
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    vectors = [glove_embeddings[token] for token in lowered_tokens if token in glove_embeddings]
    if vectors:
        return np.mean(vectors, axis=0).flatten()
    # Nothing matched: borrow shape and dtype from any stored vector
    template = next(iter(glove_embeddings.values()))
    return np.zeros_like(template)

# Input folders (JSON splits and annotation CSVs) and the folder that receives
# the per-split disease embedding files
json_dir, csv_dir = "split_pubtator", "csvfiles"
output_dir = "gen_wordembeddings/GloVe/Disease"
os.makedirs(output_dir, exist_ok=True)  # no error when the folder already exists

# Parse the pretrained GloVe vectors into an in-memory dictionary
glove_file_path = "glove/glove.42B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)

def process_files(json_file_path, csv_file_path, output_directory, glove_embeddings):
    """Embed the disease annotations of one CSV and write them to a text file.

    One line is written per annotation row that has an ``annotation_text``
    value, in the form ``Phrase:<text>, MN:<MN>, Embedding: [v1, v2, ...]``.
    Lines are grouped by phrase in order of first appearance in the CSV, so
    the output is the same on every run.

    Args:
        json_file_path: Path of the JSON split paired with the CSV. Kept for
            signature compatibility; the disease output depends only on the
            CSV, so the file is not opened. (It was previously parsed with
            ``json.load`` although ``json`` is not imported in this cell and
            the parsed data was never used.)
        csv_file_path: CSV with at least the columns ``annotation_text`` and
            ``MN``.
        output_directory: Existing directory that receives the output file,
            named ``GloVe_Disease<suffix>.txt`` where ``<suffix>`` is the CSV
            base name without ``disease_annotations`` and ``.csv``.
        glove_embeddings: Mapping of token -> vector, passed on to
            ``get_glove_embeddings``.
    """
    annotations_df = pd.read_csv(csv_file_path)

    # Rows without annotation text cannot be embedded; they never matched any
    # phrase in the per-phrase lookup either, so dropping them loses nothing.
    annotated_rows = annotations_df.dropna(subset=['annotation_text'])

    # Output file name and path
    output_file_name = f"GloVe_Disease{os.path.basename(csv_file_path).replace('disease_annotations', '').replace('.csv', '.txt')}"
    output_file_path = os.path.join(output_directory, output_file_name)

    with open(output_file_path, 'w') as text_file:
        # A single grouped pass replaces one full DataFrame scan per phrase.
        for phrase, rows in annotated_rows.groupby('annotation_text', sort=False):
            embedding = get_glove_embeddings(phrase, glove_embeddings)
            # The vector text is identical for every row of the phrase
            embedding_str = ', '.join(map(str, embedding.tolist()))
            for mn_value in rows['MN']:
                text_file.write(f"Phrase:{phrase}, MN:{mn_value}, Embedding: [{embedding_str}]\n")

    print(f"Embeddings saved in {output_file_path}")

# Walk the JSON splits; a split is embedded only when its matching
# disease-annotation CSV is present in csv_dir
for file_name in os.listdir(json_dir):
    if not file_name.endswith(".json"):
        continue
    json_file_path = os.path.join(json_dir, file_name)
    split_suffix = file_name.replace('proper_pubtator_', '').replace('.json', '')
    csv_file_name = f"disease_annotations_{split_suffix}.csv"
    csv_file_path = os.path.join(csv_dir, csv_file_name)
    if not os.path.exists(csv_file_path):
        continue
    process_files(json_file_path, csv_file_path, output_dir, glove_embeddings)


In [None]:
import glob
import os

# Directory containing the per-split disease embedding files. It must be the
# folder the disease embedding cell writes to ("gen_wordembeddings/GloVe/Disease");
# the former ".../GloVe/42B/Disease" path is not created anywhere in this notebook.
input_directory = 'gen_wordembeddings/GloVe/Disease'
output_file_path = 'gen_wordembeddings/GloVe/Disease/GloVe_disease_embeddings_combined.txt'  # Path for the output file

# Match all '.txt' files in the directory, but leave out the combined file itself:
# it lives in the same folder, so a re-run would otherwise read its own
# (just truncated) output. Sorting makes the concatenation order reproducible.
combined_abspath = os.path.abspath(output_file_path)
text_files = sorted(
    path for path in glob.glob(os.path.join(input_directory, '*.txt'))
    if os.path.abspath(path) != combined_abspath
)

# Open the output file in write mode
with open(output_file_path, 'w') as outfile:
    for text_file_path in text_files:
        with open(text_file_path, 'r') as infile:
            # Copy line by line so a large embedding file is never held in memory whole
            outfile.writelines(infile)
        # Keep a blank separator line after the contents of each file
        outfile.write('\n')

print(f'All text files from {input_directory} have been combined into {output_file_path}')


### Chemical

In [None]:
import json
import pandas as pd
import nltk
import os
import numpy as np

# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' resource rather than the older 'punkt' package,
# so both are requested here to cover old and new installations.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to load GloVe embeddings from file
def load_glove_embeddings(file_path):
    """Parse a GloVe text file and return a token -> vector dictionary.

    Lines have the form ``token v1 ... vN`` separated by single spaces. The
    width N is taken from the first non-blank line (assumed to carry a token
    without spaces); later lines are split from the right, so a token that
    contains spaces is kept whole instead of breaking the float conversion the
    way a plain ``line.split()`` does.

    Args:
        file_path: Path to a UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict of str -> 1-D ``numpy.ndarray`` of floats.

    Raises:
        ValueError: When a line has fewer fields than the inferred width
            requires, or a component cannot be converted to float.
    """
    embeddings = {}
    vector_width = None  # set from the first non-blank line
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # Explicit ' ' separator: the argument-less split() also splits on
            # other whitespace characters that may be part of a token.
            fields = line.rstrip(' \r\n').split(' ')
            if fields == ['']:
                continue  # skip blank lines
            if vector_width is None:
                vector_width = len(fields) - 1
            if vector_width < 1 or len(fields) <= vector_width:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected a token followed by "
                    f"{vector_width} vector components, found {len(fields)} field(s)"
                )
            # The trailing `vector_width` fields are the vector; the rest is the token
            word = ' '.join(fields[:-vector_width])
            embeddings[word] = np.array([float(val) for val in fields[-vector_width:]])
    return embeddings

# Function to get GloVe embeddings for a given text
def get_glove_embeddings(text, glove_embeddings):
    """Average the GloVe vectors of the known tokens in ``text``.

    Tokens come from ``nltk.word_tokenize`` and are lower-cased before being
    looked up; tokens without a vector are skipped.

    Args:
        text: Phrase to embed.
        glove_embeddings: Mapping of token -> numpy vector.

    Returns:
        Flattened mean vector of the matched tokens. If no token matches, an
        array of zeros shaped like the first vector in ``glove_embeddings``.
    """
    known_vectors = []
    for raw_token in nltk.word_tokenize(text):
        key = raw_token.lower()
        if key in glove_embeddings:
            known_vectors.append(glove_embeddings[key])

    if len(known_vectors) == 0:
        # No usable token: fall back to zeros with the shape of a stored vector
        first_vector = next(iter(glove_embeddings.values()))
        return np.zeros_like(first_vector)

    mean_vector = np.mean(known_vectors, axis=0)
    return mean_vector.flatten()

def process_files(json_file_path, csv_file_path, output_directory, glove_embeddings):
    """Embed the chemical annotations of one CSV that occur in the paired JSON articles.

    An annotation is kept when at least one of its tokens also appears among
    the tokens of the article texts in the JSON file. For every kept
    annotation row one line ``Phrase:<text>, Embedding: [v1, v2, ...]`` is
    written, in CSV row order, so repeated annotations produce repeated lines
    and the output is the same on every run.

    Args:
        json_file_path: JSON file holding a list of entries; the text is read
            from ``entry["ARTICLE"]["TEXT"]`` when present.
        csv_file_path: CSV with an ``annotation_text`` column.
        output_directory: Existing directory that receives the output file,
            named ``GloVe_Chemical<suffix>.txt`` where ``<suffix>`` is the CSV
            base name without ``chemical_annotations`` and ``.csv``.
        glove_embeddings: Mapping of token -> vector, passed on to
            ``get_glove_embeddings``.
    """
    # Load JSON data (decoded as UTF-8 rather than the platform default)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # Load chemical annotations from CSV
    annotations_df = pd.read_csv(csv_file_path)

    # Drop missing annotations BEFORE the string cast: astype(str) would turn
    # NaN into the literal phrase "nan", which could then be embedded.
    annotation_texts = annotations_df['annotation_text'].dropna().astype(str)

    # Collect the distinct tokens of all article texts
    all_words = set()
    for entry in json_data:
        text = entry.get("ARTICLE", {}).get("TEXT", "")
        all_words.update(nltk.word_tokenize(text))

    # Decide and embed once per distinct phrase; store the formatted vector so
    # it is not rebuilt for every row carrying the same phrase.
    embedding_text_by_phrase = {}
    for phrase in annotation_texts.unique():
        if any(word in all_words for word in nltk.word_tokenize(phrase)):
            embedding = get_glove_embeddings(phrase, glove_embeddings)
            embedding_text_by_phrase[phrase] = ', '.join(map(str, embedding.tolist()))

    # Output file name and path
    output_file_name = f"GloVe_Chemical{os.path.basename(csv_file_path).replace('chemical_annotations', '').replace('.csv', '.txt')}"
    output_file_path = os.path.join(output_directory, output_file_name)

    with open(output_file_path, 'w') as text_file:
        # A single pass over the rows replaces one DataFrame scan per phrase
        for phrase in annotation_texts:
            embedding_str = embedding_text_by_phrase.get(phrase)
            if embedding_str is not None:
                text_file.write(f"Phrase:{phrase}, Embedding: [{embedding_str}]\n")

    print(f"Embeddings saved in {output_file_path}")

# Input folders (JSON splits and annotation CSVs) and the folder that receives
# the per-split chemical embedding files
json_dir, csv_dir = "split_pubtator", "csvfiles"
output_directory = "gen_wordembeddings/GloVe/Chemical"
os.makedirs(output_directory, exist_ok=True)  # no error when the folder already exists

# Parse the pretrained GloVe vectors into an in-memory dictionary
glove_file_path = "glove/glove.42B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)

# Process each JSON file and its corresponding chemical annotations CSV file.
# The filter is ".json", matching the disease and gene cells; the previous
# "10_1.json" suffix restricted the run to a single split.
for file_name in os.listdir(json_dir):
    if file_name.endswith(".json"):
        json_file_path = os.path.join(json_dir, file_name)
        csv_file_name = f"chemical_annotations_{file_name.replace('proper_pubtator_', '').replace('.json', '')}.csv"
        csv_file_path = os.path.join(csv_dir, csv_file_name)
        # Splits without a matching annotation CSV are skipped
        if os.path.exists(csv_file_path):
            process_files(json_file_path, csv_file_path, output_directory, glove_embeddings)


In [None]:
import glob
import os

# Path to the directory containing the text files
input_directory = 'gen_wordembeddings/GloVe/Chemical/'
output_file_path = 'gen_wordembeddings/GloVe/Chemical/GloVe_chemical_embeddings_combined.txt'  # Path for the output file

# Match all '.txt' files in the directory, but leave out the combined file itself:
# it lives in the same folder, so a re-run would otherwise read its own
# (just truncated) output. Sorting makes the concatenation order reproducible.
combined_abspath = os.path.abspath(output_file_path)
text_files = sorted(
    path for path in glob.glob(os.path.join(input_directory, '*.txt'))
    if os.path.abspath(path) != combined_abspath
)

# Open the output file in write mode
with open(output_file_path, 'w') as outfile:
    for text_file_path in text_files:
        with open(text_file_path, 'r') as infile:
            # Copy line by line so a large embedding file is never held in memory whole
            outfile.writelines(infile)
        # Keep a blank separator line after the contents of each file
        outfile.write('\n')

print(f'All text files from {input_directory} have been combined into {output_file_path}')


### Gene

In [None]:
import json
import pandas as pd
import nltk
import os
import numpy as np

# Ensure that the Punkt tokenizer data is downloaded. Recent NLTK releases load
# it from the 'punkt_tab' resource rather than the older 'punkt' package, so
# both are requested here to cover old and new installations.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_glove_embeddings(glove_file_path):
    """Load a GloVe text file as a dictionary of float32 vectors.

    Each non-blank line is read as ``token v1 ... vN`` with single-space
    separators. N is inferred from the first non-blank line (assumed to hold a
    token without spaces) and every line is split from the right, so tokens
    containing spaces no longer break the numeric conversion as they do with
    a plain ``line.split()``.

    Args:
        glove_file_path: Path to a UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict mapping token (str) -> 1-D ``numpy.ndarray`` with dtype float32.

    Raises:
        ValueError: If a line has too few fields for the inferred vector size
            or a component is not a valid number.
    """
    embeddings_index = {}
    vector_size = None  # fixed by the first non-blank line
    with open(glove_file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # Split on ' ' only; argument-less split() also breaks on other
            # whitespace characters that may be part of a token.
            values = line.rstrip(' \r\n').split(' ')
            if values == ['']:
                continue  # blank line
            if vector_size is None:
                vector_size = len(values) - 1
            if vector_size < 1 or len(values) <= vector_size:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected a token followed by "
                    f"{vector_size} vector components, found {len(values)} field(s)"
                )
            # The last `vector_size` fields are the vector; the rest is the token
            word = ' '.join(values[:-vector_size])
            vector = np.asarray(values[-vector_size:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

def process_files(json_file_path, csv_file_path, output_directory, glove_embeddings):
    """Embed the gene annotations of one CSV that occur in the paired JSON articles.

    An annotation is kept when at least one of its tokens also appears among
    the tokens of the article texts. Each distinct kept phrase is written once
    as ``Gene: <text>, Embedding: [v1, v2, ...]``, in order of first
    appearance in the CSV. The embedding is the mean over ALL tokens of the
    phrase; a token without a vector contributes a zero vector.

    Token lookup tries the exact spelling first and then the lower-cased
    form, so capitalised gene symbols are still found in a vocabulary that
    only stores lower-case tokens, while an exact-case entry wins when the
    vocabulary has one.

    Args:
        json_file_path: JSON file holding a list of entries; the text is read
            from ``entry["ARTICLE"]["TEXT"]`` when both keys exist.
        csv_file_path: CSV with an ``annotation_text`` column.
        output_directory: Existing directory that receives the output file,
            named ``GloVe_Gene_<suffix>.txt`` where ``<suffix>`` is the CSV
            base name without ``gene_annotations_`` and ``.csv``.
        glove_embeddings: Mapping of token -> numpy vector.
    """
    # Load JSON data (decoded as UTF-8 rather than the platform default)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # Load gene annotations; drop missing values BEFORE the string cast so NaN
    # does not become the literal phrase "nan".
    annotations_df = pd.read_csv(csv_file_path)
    annotation_texts = annotations_df['annotation_text'].dropna().astype(str)

    # Collect the distinct tokens of all article texts
    all_words = set()
    for entry in json_data:
        if "ARTICLE" in entry and "TEXT" in entry["ARTICLE"]:
            all_words.update(nltk.word_tokenize(entry["ARTICLE"]["TEXT"]))

    # Zero vector for unknown tokens, matching the width and dtype of the
    # loaded vectors instead of assuming 300 float64 components. The 300-wide
    # fallback only applies to an empty embedding table.
    sample_vector = next(iter(glove_embeddings.values()), None)
    zero_vector = np.zeros(300) if sample_vector is None else np.zeros_like(sample_vector)

    # Process and save embeddings
    output_file_name = f"GloVe_Gene_{os.path.basename(csv_file_path).replace('gene_annotations_', '').replace('.csv', '.txt')}"
    output_file_path = os.path.join(output_directory, output_file_name)

    with open(output_file_path, 'w') as text_file:
        for phrase in annotation_texts.unique():
            tokens = nltk.word_tokenize(phrase)
            # Keep only phrases that share at least one token with the articles
            if not any(token in all_words for token in tokens):
                continue

            vectors = []
            for token in tokens:
                vector = glove_embeddings.get(token)
                if vector is None:
                    # Fall back to the lower-cased spelling, then to zeros
                    vector = glove_embeddings.get(token.lower(), zero_vector)
                vectors.append(vector)
            embeddings = np.mean(vectors, axis=0)

            # Convert the embedding array to a string representation
            embedding_str = ', '.join(map(str, embeddings))

            # Write the phrase and its embedding to the output file
            text_file.write(f"Gene: {phrase}, Embedding: [{embedding_str}]\n")

    print(f"Embeddings saved in {output_file_path}")

# Input folders (JSON splits and annotation CSVs) and the folder that receives
# the per-split gene embedding files
json_dir, csv_dir = "split_pubtator", "csvfiles"
output_dir = "gen_wordembeddings/GloVe/Gene"
os.makedirs(output_dir, exist_ok=True)  # no error when the folder already exists

# Parse the pretrained GloVe vectors into an in-memory dictionary
glove_file_path = "glove/glove.42B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)

# Embed every JSON split for which a matching gene-annotation CSV exists;
# other directory entries and unmatched splits are skipped
for file_name in os.listdir(json_dir):
    if not file_name.endswith(".json"):
        continue
    json_file_path = os.path.join(json_dir, file_name)
    split_suffix = file_name.replace('proper_pubtator_', '').replace('.json', '')
    csv_file_name = f"gene_annotations_{split_suffix}.csv"
    csv_file_path = os.path.join(csv_dir, csv_file_name)
    if not os.path.exists(csv_file_path):
        continue
    process_files(json_file_path, csv_file_path, output_dir, glove_embeddings)


In [None]:
import glob
import os

# Path to the directory containing the text files
input_directory = 'gen_wordembeddings/GloVe/Gene'
output_file_path = 'gen_wordembeddings/GloVe/Gene/GloVe_gene_embeddings_combined.txt'  # Path for the output file

# Match all '.txt' files in the directory, but leave out the combined file itself:
# it lives in the same folder, so a re-run would otherwise read its own
# (just truncated) output. Sorting makes the concatenation order reproducible.
combined_abspath = os.path.abspath(output_file_path)
text_files = sorted(
    path for path in glob.glob(os.path.join(input_directory, '*.txt'))
    if os.path.abspath(path) != combined_abspath
)

# Open the output file in write mode
with open(output_file_path, 'w') as outfile:
    for text_file_path in text_files:
        with open(text_file_path, 'r') as infile:
            # Copy line by line so a large embedding file is never held in memory whole
            outfile.writelines(infile)
        # Keep a blank separator line after the contents of each file
        outfile.write('\n')

print(f'All text files from {input_directory} have been combined into {output_file_path}')


Note: The same code works with either pretrained GloVe model, the one trained on 42B tokens or the one trained on 840B tokens. To switch models, replace glove_file_path = "glove/glove.42B.300d.txt" with glove_file_path = "glove/glove.840B.300d.txt". The pretrained vectors can be downloaded from https://nlp.stanford.edu/projects/glove/ . After downloading, unzip the archive into the glove/ folder; the cell below does this for the 840B archive.

In [None]:
import zipfile
import os

# Archive to unpack and the folder that receives its contents
zip_file_path = 'glove/glove.840B.300d.zip'
extract_to_directory = 'glove/'

# Create the target folder when it is missing; an existing folder is left as is
os.makedirs(extract_to_directory, exist_ok=True)

# Unpack every member of the archive into the target folder
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_to_directory)

print(f"Files extracted to {extract_to_directory}")
