In [1]:
import sys
sys.path.insert(0, 'src') 
import os
import json
from functools import partial

In [2]:
# Enable IPython's autoreload extension so edits to the project modules under
# src/ are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
import argparse
from tot.methods.bfs import solve
from tot.tasks.bio_name import Bio_Name

In [4]:
# Run configuration handed to `solve`. To try the larger model instead, swap the
# backend for 'gpt-4-1106-preview'; every other option stays the same.
args = argparse.Namespace(
    backend='gpt-3.5-turbo-1106',
    temperature=0.7,
    task='bio_name',
    naive_run=False,
    prompt_sample=None,
    method_generate='sample_bionames',
    method_evaluate='votes_for_bionames',
    method_select='greedy',
    n_generate_sample=3,
    n_evaluate_sample=2,
    n_select_sample=2,
)
# Task object that supplies the labels (and prompts) used throughout the notebook.
task = Bio_Name()

In [136]:
# import pandas as pd
# filename = 'src/tot/data/gene_sets/gene_sets.csv'
# df = pd.read_csv(filename, header=None, encoding='latin1')
# df.dropna(inplace=True)
# df.columns = ['_', '_', 'genes', 'count', 'process']

In [137]:
# x = df['genes'].tolist()
# y = df['process'].tolist()
# with open('src/tot/data/gene_sets/x.txt', 'w') as f:
#     for el in x:
#         f.write(el + '\n')
        
# with open('src/tot/data/gene_sets/y.txt', 'w') as f:
#     for el in y:
#         f.write(el + '\n')

In [5]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch

# The tokenizer and the encoder are loaded from the same pretrained checkpoint id.
_SAPBERT_CHECKPOINT = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'
SapBERT_tokenizer = AutoTokenizer.from_pretrained(_SAPBERT_CHECKPOINT)
SapBERT_model = AutoModel.from_pretrained(_SAPBERT_CHECKPOINT)

In [6]:
def mean_pooling(model_output, attention_mask):
    """Mask-weighted average of the token embeddings along dimension 1.

    Args:
        model_output: indexable model result whose element 0 holds the token
            embeddings (assumed (batch, seq, hidden), matching the
            unsqueeze(-1)/sum over dim 1 used here).
        attention_mask: tensor broadcast over the last dimension of the
            embeddings; positions with value 0 do not contribute.

    Returns:
        Tensor with dimension 1 reduced away: the sum of the masked embeddings
        divided by the mask total (floored at 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    # A fully masked row has a zero total; the floor keeps the division finite.
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / weight_total

In [7]:
def getSentenceEmbedding(sentence, tokenizer, model):
    """Embed `sentence` by mean-pooling the model's token embeddings.

    Args:
        sentence: text (or whatever batch form the tokenizer accepts) to embed.
        tokenizer: callable invoked with padding/truncation enabled and
            return_tensors='pt'; its result must expose 'attention_mask'.
        model: callable that accepts the tokenizer output as keyword arguments.

    Returns:
        The pooled embedding tensor produced by `mean_pooling`.
    """
    # NOTE(review): the notebook's recorded output shows a tokenizer warning that
    # no maximum length is defined, so truncation=True may be a no-op with this
    # tokenizer -- consider passing max_length explicitly.
    batch = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch['attention_mask'])

In [8]:
def getSentenceSimilarity(sentence1, sentence2, tokenizer, model, simMetric):
    """Compute the similarity between two sentences from their embeddings.

    Args:
        sentence1: first text to compare.
        sentence2: second text to compare.
        tokenizer: tokenizer forwarded to `getSentenceEmbedding`.
        model: encoder forwarded to `getSentenceEmbedding`.
        simMetric: name of the similarity measure; only "cosine_similarity"
            is currently supported.

    Returns:
        Tuple `(similarity, sentence1_embedding, sentence2_embedding)`, where
        `similarity` is entry [0][0] of the pairwise cosine-similarity matrix.

    Raises:
        ValueError: if `simMetric` is not a supported metric name.
    """
    # Validate before embedding: previously an unknown metric ran both forward
    # passes and then failed with an UnboundLocalError on `sentenceSim`.
    supported_metrics = ("cosine_similarity",)
    if simMetric not in supported_metrics:
        raise ValueError(
            "Unsupported simMetric %r; expected one of %s"
            % (simMetric, ", ".join(supported_metrics))
        )

    sentence1_embedding = getSentenceEmbedding(sentence1, tokenizer, model)
    sentence2_embedding = getSentenceEmbedding(sentence2, tokenizer, model)

    # ToDo: add other simMetrics (e.g. a dot-product / norm based cosine using
    # primitive operations) and dispatch on simMetric here.
    sentenceSim = cosine_similarity(sentence1_embedding, sentence2_embedding)[0][0]

    return sentenceSim, sentence1_embedding, sentence2_embedding

In [9]:
def similarity_score(x, y):
    """Return the SapBERT cosine similarity between `x` and `y`.

    Only the score is returned; the two embeddings that
    `getSentenceSimilarity` also produces are discarded.
    """
    score, _, _ = getSentenceSimilarity(
        x, y, SapBERT_tokenizer, SapBERT_model, "cosine_similarity"
    )
    return score

In [10]:
def get_all_labels(n_examples=12174, label_source=None):
    """Collect the distinct labels of the first `n_examples` task examples.

    Args:
        n_examples: number of example indices (0 .. n_examples - 1) to read.
            Defaults to 12174, the value this notebook has always used.
        label_source: object exposing `get_label(idx)`. Defaults to the
            notebook-level `task`.

    Returns:
        List of unique labels in first-seen order. Keeping a stable order
        (instead of going through `set`, whose iteration order for strings can
        differ between interpreter runs) makes later seeded sampling from this
        list reproducible.
    """
    if label_source is None:
        label_source = task
    all_process_names = [label_source.get_label(idx) for idx in range(n_examples)]
    # First count: every label read, duplicates included.
    print('Length of all process names: ', len(all_process_names))
    # dict.fromkeys removes duplicates while preserving insertion order.
    all_process_names = list(dict.fromkeys(all_process_names))
    # Second count: distinct labels only.
    print('Length of all process names: ', len(all_process_names))
    return all_process_names

# Build the label pool once; similarity_quantile draws its comparison labels
# from this module-level list.
all_labels = get_all_labels()

import random
# TODO: implement differently
def similarity_quantile(candidate, y, n_samples=100, labels=None, score_fn=None):
    """Estimate how `candidate` ranks against random labels in similarity to `y`.

    A random sample of labels is scored against `y`; the result is the fraction
    of those scores that are strictly smaller than the candidate's score.

    Args:
        candidate: proposed name to evaluate.
        y: reference label.
        n_samples: number of labels to draw (default 100). Clamped to the size
            of the pool so a small pool no longer makes `random.sample` raise.
        labels: sequence to sample from. Defaults to the notebook-level
            `all_labels`.
        score_fn: callable `(text, reference) -> score`. Defaults to the
            notebook-level `similarity_score`.

    Returns:
        Fraction in [0, 1], or NaN when no comparison label remains (empty
        pool, or every sampled label equals `y`).
    """
    if labels is None:
        labels = all_labels
    if score_fn is None:
        score_fn = similarity_score

    sample_size = min(n_samples, len(labels))
    test_labels = random.sample(labels, sample_size)
    # The reference label itself is excluded from the comparison set.
    scores = np.array([score_fn(label, y) for label in test_labels if label != y])
    if scores.size == 0:
        # Same result as the mean of an empty array, without the RuntimeWarning.
        return float("nan")
    candidate_score = score_fn(candidate, y)
    # return the fraction of scores that are smaller than the candidate
    return (scores < candidate_score).mean()

Length of all process names:  12174
Length of all process names:  12174


In [11]:
def test_example(args, task, idx):
    """Solve example `idx` and return the outcome together with its label.

    Returns:
        `(final_answer, ys, steps, label)`: the three values produced by
        `solve(args, task, idx)` followed by `task.get_label(idx)`.
    """
    # The label is fetched before solving, as in the original call order.
    label = task.get_label(idx)
    solved = solve(args, task, idx)
    final_answer, ys, steps = solved
    return final_answer, ys, steps, label


In [12]:
def get_all_candidate_bio_processes(steps):
    """Collect the distinct 'Biological Process' values proposed during a run.

    Args:
        steps: mapping with a 'steps' list; each entry has a 'new_ys' list of
            JSON strings, each decoding to an object with a
            'Biological Process' key.

    Returns:
        List of unique process names in first-seen order. A stable order
        (rather than `set` iteration order, which can vary between interpreter
        runs) keeps downstream tie-breaking reproducible.
    """
    candidate_processes = []
    # NOTE(review): the final entry of steps['steps'] is deliberately skipped by
    # the slice; presumably it does not hold candidate proposals -- confirm.
    for step in steps['steps'][:-1]:
        for raw_y in step['new_ys']:
            candidate_processes.append(json.loads(raw_y)['Biological Process'])
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(candidate_processes))
    

In [13]:
def get_best_candidate_bio_process(candidate_processes, label):
    """Pick the candidate whose similarity to `label` is highest.

    Args:
        candidate_processes: indexable sequence of candidate names.
        label: reference label each candidate is scored against.

    Returns:
        `(best_candidate, best_score)`.

    Raises:
        ValueError: from NumPy when `candidate_processes` is empty.
    """
    scores = []
    for candidate_process in candidate_processes:
        scores.append(similarity_score(candidate_process, label))
    top_index = np.argmax(scores)
    return candidate_processes[top_index], np.max(scores)

In [14]:
from tot.models import *
# Pre-bind the backend model and sampling temperature from `args`, so callers
# only pass the messages. Note this rebinds the name `gpt` (presumably supplied
# by the wildcard import above) to the partial.
gpt = partial(gpt, model=args.backend, temperature=args.temperature)
def get_gpt_similarity_score(process1, process2):
    """Ask the GPT backend to rate the similarity of two process names.

    The task builds the system/user prompt pair, the pre-configured `gpt`
    callable produces the response, and the task parses the score out of it.

    Returns:
        Whatever `task.unwrap_similarity` extracts from the model response.
    """
    system_message, user_message = task.similarity_prompt_wrap(process1, process2)
    gpt_response = gpt(system_message, user_message)
    return task.unwrap_similarity(gpt_response)

In [15]:
import os
# Set before the evaluation runs below. Presumably this disables the Hugging Face
# tokenizers' internal parallelism to avoid its fork warning -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [16]:
def test_example_wrap(idx):
    """Run one example end to end, print a report and return all metrics.

    For example `idx` this solves the task, gathers every candidate process
    proposed along the way, and scores both the final answer and the best
    candidate against the true label with the embedding similarity, the GPT
    similarity and the similarity quantile.

    Args:
        idx: index of the example to evaluate.

    Returns:
        Dict with the index, final answer, `ys`, `steps`, label and every
        computed metric. Compared with earlier versions, the extra key
        'GPT best candidate similarity score' stores the value that used to be
        printed only.
    """
    final_answer, ys, steps, label = test_example(args, task, idx)
    candidate_processes = get_all_candidate_bio_processes(steps)
    best_candidate_process, best_candidate_similarity_score = get_best_candidate_bio_process(candidate_processes, label)
    final_answer_similarity_score = similarity_score(final_answer, label)
    final_answer_gpt_similarity_score = get_gpt_similarity_score(final_answer, label)
    final_answer_similarity_quantile = similarity_quantile(final_answer, label)
    best_candidate_similarity_quantile = similarity_quantile(best_candidate_process, label)
    # Computed once and kept, instead of being evaluated inside a print call.
    best_candidate_gpt_similarity_score = get_gpt_similarity_score(best_candidate_process, label)
    print('Index:', idx)
    print('Final answer:', final_answer)
    print('True answer:', label.strip())
    # Reuse the score computed above rather than running the encoder again.
    print('Final answer similarity score:', final_answer_similarity_score)
    print('Best candidate process:', best_candidate_process)
    print('Best candidate similarity score:', best_candidate_similarity_score)
    print('GPT final answer similarity score:', final_answer_gpt_similarity_score)
    print('GPT best candidate similarity score:', best_candidate_gpt_similarity_score)
    print('Final Answer Similarity Quantile:', final_answer_similarity_quantile)
    print('Best Candidate Similarity Quantile:', best_candidate_similarity_quantile)
    print()
    return {
        'index': idx,
        'final answer': final_answer,
        'ys': ys,
        'steps': steps,
        'label': label,
        'final answer similarity score': final_answer_similarity_score,
        'best candidate process': best_candidate_process,
        'best similarity score': best_candidate_similarity_score,
        'GPT similarity score': final_answer_gpt_similarity_score,
        'GPT best candidate similarity score': best_candidate_gpt_similarity_score,
        'final answer similarity quantile': final_answer_similarity_quantile,
        'best candidate similarity quantile': best_candidate_similarity_quantile,
    }

In [17]:
# Quick sanity check of similarity_quantile on a hand-picked candidate/label pair.
# The printed value varies between runs because the comparison labels are drawn
# with random.sample and no seed is set.
print(similarity_quantile('Intrinsic Apoptotic Signaling Pathway', 'positive regulation of calcium ion transport into cytosol'))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0.72


In [19]:
# Evaluate examples 100-109. Appending inside an explicit loop (rather than
# building the list in a single expression) keeps the already finished entries
# in `results` if the run is interrupted part-way.
results = []
for idx in range(100, 110):
    results.append(test_example_wrap(idx))

KeyboardInterrupt: 

In [None]:
# Continue with examples 110-119, appending to the `results` list created by the
# previous cell (which therefore has to be run first).
for idx in range(110, 120):
    results.append(test_example_wrap(idx))

Index: 110
Final answer: HOXB13 modulation of canonical Wnt signaling pathway
True answer: epithelial cell differentiation involved in prostate gland development
Final answer similarity score: 0.35918865
Best candidate process: Prostate Gland Development
Best candidate similarity score: 0.84307796
GPT final answer similarity score: 7
GPT best candidate similarity score: 9
Final Answer Similarity Quantile: 0.8
Best Candidate Similarity Quantile: 1.0

Index: 111
Final answer: Cholesterol biosynthesis in the endoplasmic reticulum
True answer: regulation of glycoprotein metabolic process
Final answer similarity score: 0.21200942
Best candidate process: Protein metabolism
Best candidate similarity score: 0.511616
GPT final answer similarity score: 3
GPT best candidate similarity score: 7
Final Answer Similarity Quantile: 0.32
Best Candidate Similarity Quantile: 0.99

Index: 112
Final answer: Glucose Phosphorylation
True answer: GDP-mannose biosynthetic process
Final answer similarity score:

In [None]:
# final_answer, ys, infos = solve(args, task, 0)
# print(ys[0])

In [None]:
# ys, infos = solve(args, task, 0)
# print(ys[0])