# Disease

In [None]:
import json
import pandas as pd
from gensim.models import Word2Vec
import nltk
import os

# nltk.word_tokenize needs the Punkt sentence models. Older NLTK releases read
# them from 'punkt'; recent releases look for 'punkt_tab' instead and raise a
# LookupError if only 'punkt' is installed, so fetch both.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

def process_files(json_file_path, csv_file_path, output_directory, vector_size=250, window=6):
    """Train a CBOW Word2Vec model on one JSON article file and save disease phrase embeddings.

    The article texts (``entry["ARTICLE"]["TEXT"]``) are split into tokenized
    sentences that form the training corpus. Every annotation whose text shares
    at least one token with the articles is embedded by average-pooling the
    vectors of its known tokens and written as one line of the form
    ``Phrase:<text>, MN:<mn>, Embedding: [<v1>, <v2>, ...]``.

    Args:
        json_file_path: Path to a JSON file containing a list of article entries.
        csv_file_path: Path to the annotation CSV; the columns
            ``annotation_text`` and ``MN`` are read.
        output_directory: Directory receiving the output file (created if missing).
        vector_size: Dimensionality of the trained word vectors.
        window: Context window size passed to Word2Vec.

    Returns:
        None. The embeddings are written to
        ``CBOW_Disease<suffix>.txt`` inside ``output_directory``, where
        ``<suffix>`` is the CSV file name without ``disease_annotations`` / ``.csv``.
    """
    # Load JSON data
    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)

    # Load disease annotations from CSV
    annotations_df = pd.read_csv(csv_file_path)

    # Build the training corpus as one token list per sentence. Training on a
    # single "sentence" made of the unique-word set gives Word2Vec no real
    # context, depends on set ordering, and is truncated by gensim beyond
    # 10,000 tokens.
    sentences = []
    all_words = set()
    for entry in json_data:
        # Tolerate a missing or null ARTICLE / TEXT field
        text = (entry.get("ARTICLE") or {}).get("TEXT") or ""
        for sentence in nltk.sent_tokenize(text):
            # preserve_line=True: the text is already split into sentences
            tokens = nltk.word_tokenize(sentence, preserve_line=True)
            if tokens:
                sentences.append(tokens)
                all_words.update(tokens)

    if not sentences:
        # Word2Vec cannot build a vocabulary from an empty corpus
        print(f"No article text found in {json_file_path}; skipping {csv_file_path}")
        return

    # Tokenize each entry in the 'annotation_text' column, ensuring that each element is a string
    annotations_df['annotation_words'] = annotations_df['annotation_text'].apply(
        lambda x: nltk.word_tokenize(str(x)) if pd.notnull(x) else []
    )

    # Keep annotations that share at least one token with the article text
    in_corpus = annotations_df['annotation_words'].apply(
        lambda words: any(word in all_words for word in words)
    ).astype(bool)
    filtered_annotations = annotations_df[in_corpus]

    # Train Word2Vec model (CBOW)
    model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window, min_count=1, sg=0)

    output_file_name = f"CBOW_Disease{os.path.basename(csv_file_path).replace('disease_annotations', '').replace('.csv', '.txt')}"
    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, output_file_name)

    with open(output_file_path, 'w') as text_file:
        for _, row in filtered_annotations.iterrows():
            phrase = row['annotation_text']
            mn_value = row['MN']
            # Reuse the tokens computed above: re-tokenizing the raw value
            # fails when pandas parsed the annotation text as a number.
            phrase_embeddings = [
                model.wv[word].tolist() for word in row['annotation_words'] if word in model.wv
            ]

            if phrase_embeddings:
                # Perform average pooling over the phrase's word vectors
                avg_embedding = [round(sum(vec) / len(vec), 6) for vec in zip(*phrase_embeddings)]
                # Convert embedding list to string representation
                embedding_str = ', '.join(map(str, avg_embedding))

                # Write to file
                text_file.write(f"Phrase:{phrase}, MN:{mn_value}, Embedding: [{embedding_str}]\n")

    print(f"Embeddings saved in {output_file_path}")

# Directories containing the JSON article files and the annotation CSV files
json_dir = "split_pubtator"
csv_dir = "csvfiles"

# Make sure the output directory exists
output_dir = "gen_wordembeddings/CBOW/Disease"
os.makedirs(output_dir, exist_ok=True)

# Process each JSON file together with its corresponding disease annotations CSV file.
# JSON files that have no matching CSV are skipped; no other files are excluded.
# sorted() keeps the processing order reproducible (os.listdir order is arbitrary).
for file_name in sorted(os.listdir(json_dir)):
    if file_name.endswith(".json"):
        json_file_path = os.path.join(json_dir, file_name)
        csv_file_name = f"disease_annotations_{file_name.replace('proper_pubtator_', '').replace('.json', '')}.csv"
        csv_file_path = os.path.join(csv_dir, csv_file_name)
        if os.path.exists(csv_file_path):
            process_files(json_file_path, csv_file_path, output_dir)


Note: The `Word2Vec` hyperparameters `vector_size` (embedding dimensionality, 250 here) and `window` (context window size, 6 here) can be adjusted as required.

In [None]:
import glob
import os

# Path to the directory containing the text files
input_directory = 'gen_wordembeddings/CBOW/Disease/'
output_file_path = 'gen_wordembeddings/CBOW/Disease/CBOW_Disease_embeddings_combined.txt'  # Path for the output file

# Use glob to match all '.txt' files in the directory.
# The combined file lives in the same directory, so it must be excluded:
# otherwise a re-run would read the previous combined output while it is
# being overwritten. Sorting keeps the concatenation order reproducible.
text_files = sorted(
    path
    for path in glob.glob(os.path.join(input_directory, '*.txt'))
    if os.path.abspath(path) != os.path.abspath(output_file_path)
)

# Open the output file in write mode
with open(output_file_path, 'w') as outfile:
    # Iterate over each file path in the list
    for text_file_path in text_files:
        # Open each file for reading
        with open(text_file_path, 'r') as infile:
            # Write its contents to the output file
            outfile.write(infile.read())
            # Write a newline between the contents of each file
            outfile.write('\n')

print(f'All text files from {input_directory} have been combined into {output_file_path}')


Note: The cell above combines all the per-file word-embedding text files produced by a particular model into a single file.

# Gene

In [None]:
import json
import pandas as pd
from gensim.models import Word2Vec
import nltk
import os

# nltk.word_tokenize needs the Punkt sentence models. Older NLTK releases read
# them from 'punkt'; recent releases look for 'punkt_tab' instead and raise a
# LookupError if only 'punkt' is installed, so fetch both.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

def process_files(json_file_path, csv_file_path, output_directory, vector_size=250, window=6):
    """Train a CBOW Word2Vec model on one JSON article file and save gene phrase embeddings.

    The article texts (``entry["ARTICLE"]["TEXT"]``) are split into tokenized
    sentences that form the training corpus. Each distinct annotation text that
    shares at least one token with the articles is embedded by average-pooling
    the vectors of its known tokens. Phrases whose pooled embedding duplicates
    one already written are skipped. Output lines have the form
    ``Gene: <text>, Embedding: [<v1>, <v2>, ...]``.

    Args:
        json_file_path: Path to a JSON file containing a list of article entries.
        csv_file_path: Path to the annotation CSV; the column
            ``annotation_text`` is read.
        output_directory: Directory receiving the output file (created if missing).
        vector_size: Dimensionality of the trained word vectors.
        window: Context window size passed to Word2Vec.

    Returns:
        None. The embeddings are written to ``CBOW_Gene_<suffix>.txt`` inside
        ``output_directory``, where ``<suffix>`` is the CSV file name without
        ``gene_annotations_`` / ``.csv``.
    """
    # Load JSON data
    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)

    # Load gene annotations from CSV. Missing texts are dropped before the
    # string cast so they do not turn into the literal phrase "nan".
    annotations_df = pd.read_csv(csv_file_path)
    annotations_df = annotations_df.dropna(subset=['annotation_text']).copy()
    annotations_df['annotation_text'] = annotations_df['annotation_text'].astype(str)

    # Build the training corpus as one token list per article sentence. The
    # flattened annotation tokens used previously formed a single artificial
    # "sentence" with no real context, which gensim also truncates beyond
    # 10,000 tokens.
    sentences = []
    all_words = set()
    for entry in json_data:
        # Tolerate a missing or null ARTICLE / TEXT field
        text = (entry.get("ARTICLE") or {}).get("TEXT") or ""
        for sentence in nltk.sent_tokenize(text):
            # preserve_line=True: the text is already split into sentences
            tokens = nltk.word_tokenize(sentence, preserve_line=True)
            if tokens:
                sentences.append(tokens)
                all_words.update(tokens)

    if not sentences:
        # Word2Vec cannot build a vocabulary from an empty corpus
        print(f"No article text found in {json_file_path}; skipping {csv_file_path}")
        return

    # Tokenize each entry in the 'annotation_text' column
    annotations_df['annotation_words'] = annotations_df['annotation_text'].apply(nltk.word_tokenize)

    # Keep annotations that share at least one token with the article text
    in_corpus = annotations_df['annotation_words'].apply(
        lambda words: any(word in all_words for word in words)
    ).astype(bool)
    filtered_annotations = annotations_df[in_corpus]

    # Train Word2Vec model (CBOW)
    model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window, min_count=1, sg=0)

    # Embeddings already written, used to skip duplicates
    unique_embeddings = set()
    output_file_name = f"CBOW_Gene_{os.path.basename(csv_file_path).replace('gene_annotations_', '').replace('.csv', '.txt')}"
    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, output_file_name)

    with open(output_file_path, 'w') as text_file:
        # .unique() keeps first-appearance order, so the phrase that survives
        # embedding de-duplication is the same on every run (set order is not).
        for phrase in filtered_annotations['annotation_text'].unique():
            phrase_words = nltk.word_tokenize(phrase)
            phrase_embeddings = [model.wv[word].tolist() for word in phrase_words if word in model.wv]

            if phrase_embeddings:
                # Average pooling over the phrase's word vectors
                avg_embedding = [round(sum(vec) / len(vec), 6) for vec in zip(*phrase_embeddings)]
                if tuple(avg_embedding) not in unique_embeddings:
                    unique_embeddings.add(tuple(avg_embedding))
                    embedding_str = ', '.join(map(str, avg_embedding))
                    text_file.write(f"Gene: {phrase}, Embedding: [{embedding_str}]\n")

# Input and output locations for the gene embeddings
json_dir = "split_pubtator"
csv_dir = "csvfiles"
output_dir = "gen_wordembeddings/CBOW/Gene"
os.makedirs(output_dir, exist_ok=True)

# Pair every JSON article file with its gene annotations CSV and process the
# pair; entries that are not JSON files or have no matching CSV are skipped.
for file_name in os.listdir(json_dir):
    if not file_name.endswith(".json"):
        continue

    json_file_path = os.path.join(json_dir, file_name)
    csv_file_name = f"gene_annotations_{file_name.replace('proper_pubtator_', '').replace('.json', '')}.csv"
    csv_file_path = os.path.join(csv_dir, csv_file_name)

    if not os.path.exists(csv_file_path):
        continue

    process_files(json_file_path, csv_file_path, output_dir)


In [None]:
import glob
import os

# Path to the directory containing the text files
input_directory = 'gen_wordembeddings/CBOW/Gene'
output_file_path = 'gen_wordembeddings/CBOW/Gene/CBOW_gene_embeddings_combined.txt'  # Path for the output file

# Use glob to match all '.txt' files in the directory.
# The combined file lives in the same directory, so it must be excluded:
# otherwise a re-run would read the previous combined output while it is
# being overwritten. Sorting keeps the concatenation order reproducible.
text_files = sorted(
    path
    for path in glob.glob(os.path.join(input_directory, '*.txt'))
    if os.path.abspath(path) != os.path.abspath(output_file_path)
)

# Open the output file in write mode
with open(output_file_path, 'w') as outfile:
    # Iterate over each file path in the list
    for text_file_path in text_files:
        # Open each file for reading
        with open(text_file_path, 'r') as infile:
            # Write its contents to the output file
            outfile.write(infile.read())
            # Write a newline between the contents of each file
            outfile.write('\n')

print(f'All text files from {input_directory} have been combined into {output_file_path}')


# Chemical

In [None]:
import json
import pandas as pd
from gensim.models import Word2Vec
import nltk
import os

# nltk.word_tokenize needs the Punkt sentence models. Older NLTK releases read
# them from 'punkt'; recent releases look for 'punkt_tab' instead and raise a
# LookupError if only 'punkt' is installed, so fetch both.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

def process_files(json_file_path, csv_file_path, output_directory, vector_size=250, window=6):
    """Train a CBOW Word2Vec model on one JSON article file and save chemical phrase embeddings.

    The article texts (``entry["ARTICLE"]["TEXT"]``) are split into tokenized
    sentences that form the training corpus. Each distinct annotation text that
    shares at least one token with the articles is embedded by average-pooling
    the vectors of its known tokens and written as one line of the form
    ``Phrase:<text>, Embedding: [<v1>, <v2>, ...]``.

    Args:
        json_file_path: Path to a JSON file containing a list of article entries.
        csv_file_path: Path to the annotation CSV; the column
            ``annotation_text`` is read.
        output_directory: Directory receiving the output file (created if missing).
        vector_size: Dimensionality of the trained word vectors.
        window: Context window size passed to Word2Vec.

    Returns:
        None. The embeddings are written to ``CBOW_Chemical<suffix>.txt``
        inside ``output_directory``, where ``<suffix>`` is the CSV file name
        without ``chemical_annotations`` / ``.csv``.
    """
    # Load JSON data
    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)

    # Load chemical annotations from CSV
    annotations_df = pd.read_csv(csv_file_path)

    # Drop missing texts first so they do not become the literal phrase "nan",
    # then convert remaining non-string values in 'annotation_text' to strings
    annotations_df = annotations_df.dropna(subset=['annotation_text']).copy()
    annotations_df['annotation_text'] = annotations_df['annotation_text'].astype(str)

    # Build the training corpus as one token list per sentence. Training on a
    # single "sentence" made of the unique-word set gives Word2Vec no real
    # context, depends on set ordering, and is truncated by gensim beyond
    # 10,000 tokens.
    sentences = []
    all_words = set()
    for entry in json_data:
        # Tolerate a missing or null ARTICLE / TEXT field
        text = (entry.get("ARTICLE") or {}).get("TEXT") or ""
        for sentence in nltk.sent_tokenize(text):
            # preserve_line=True: the text is already split into sentences
            tokens = nltk.word_tokenize(sentence, preserve_line=True)
            if tokens:
                sentences.append(tokens)
                all_words.update(tokens)

    if not sentences:
        # Word2Vec cannot build a vocabulary from an empty corpus
        print(f"No article text found in {json_file_path}; skipping {csv_file_path}")
        return

    # Tokenize each entry in the 'annotation_text' column
    annotations_df['annotation_words'] = annotations_df['annotation_text'].apply(nltk.word_tokenize)

    # Keep annotations that share at least one token with the article text
    in_corpus = annotations_df['annotation_words'].apply(
        lambda words: any(word in all_words for word in words)
    ).astype(bool)
    filtered_annotations = annotations_df[in_corpus]

    # Train Word2Vec model (CBOW)
    model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window, min_count=1, sg=0)

    output_file_name = f"CBOW_Chemical{os.path.basename(csv_file_path).replace('chemical_annotations', '').replace('.csv', '.txt')}"
    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, output_file_name)

    with open(output_file_path, 'w') as text_file:
        # .unique() keeps first-appearance order, so the output line order is
        # the same on every run (iterating a set is not).
        for phrase in filtered_annotations['annotation_text'].unique():
            phrase_words = nltk.word_tokenize(phrase)
            phrase_embeddings = [model.wv[word].tolist() for word in phrase_words if word in model.wv]

            if phrase_embeddings:
                # Perform average pooling over the phrase's word vectors
                avg_embedding = [round(sum(vec) / len(vec), 6) for vec in zip(*phrase_embeddings)]

                # Convert embedding list to string representation
                embedding_str = ', '.join(map(str, avg_embedding))

                # Write
                text_file.write(f"Phrase:{phrase}, Embedding: [{embedding_str}]\n")

    print(f"Embeddings saved in {output_file_path}")

# Directories containing the JSON article files and the annotation CSV files
json_dir = "split_pubtator"
csv_dir = "csvfiles"
output_directory = "gen_wordembeddings/CBOW/Chemical"
os.makedirs(output_directory, exist_ok=True)

# Process each JSON file together with its corresponding chemical annotations CSV file.
# JSON files that have no matching CSV are skipped; no other files are excluded.
# sorted() keeps the processing order reproducible (os.listdir order is arbitrary).
for file_name in sorted(os.listdir(json_dir)):
    if file_name.endswith(".json"):
        json_file_path = os.path.join(json_dir, file_name)
        csv_file_name = f"chemical_annotations_{file_name.replace('proper_pubtator_', '').replace('.json', '')}.csv"
        csv_file_path = os.path.join(csv_dir, csv_file_name)
        if os.path.exists(csv_file_path):
            process_files(json_file_path, csv_file_path, output_directory)


In [None]:
import glob
import os

# Path to the directory containing the text files
input_directory = 'gen_wordembeddings/CBOW/Chemical/'
output_file_path = 'gen_wordembeddings/CBOW/Chemical/CBOW_chemical_embeddings_combined.txt'  # Path for the output file

# Use glob to match all '.txt' files in the directory.
# The combined file lives in the same directory, so it must be excluded:
# otherwise a re-run would read the previous combined output while it is
# being overwritten. Sorting keeps the concatenation order reproducible.
text_files = sorted(
    path
    for path in glob.glob(os.path.join(input_directory, '*.txt'))
    if os.path.abspath(path) != os.path.abspath(output_file_path)
)

# Open the output file in write mode
with open(output_file_path, 'w') as outfile:
    # Iterate over each file path in the list
    for text_file_path in text_files:
        # Open each file for reading
        with open(text_file_path, 'r') as infile:
            # Write its contents to the output file
            outfile.write(infile.read())
            # Write a newline between the contents of each file
            outfile.write('\n')

print(f'All text files from {input_directory} have been combined into {output_file_path}')


Note: This notebook shows how to create word embeddings with CBOW (Word2Vec) models for Disease, Gene and Chemical annotations. Other window sizes and vector sizes can be used to generate alternative embeddings in the same way.