In [1]:
import transformers
from transformers import XLNetTokenizer, XLNetModel, AutoModel, BertTokenizerFast, AdamW

import torch
import torch.nn as nn
from torch.nn import MSELoss
from torch.utils.data import DataLoader, Dataset, TensorDataset
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler

from torch.nn import TripletMarginLoss

import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
from scipy.stats import spearmanr
import mplcursors
import time
import random
import pandas as pd
import numpy as np
import warnings
import re
import json
import networkx as nx
import obonet
from collections import Counter
import collections
import pickle

from pytorch_metric_learning.losses import NTXentLoss

# Solubility

In [2]:
import torch
import random
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import accuracy_score

from src.litgene import FineTunedBERT, getEmbeddings
from src.utils import process_data
from src.train import trainer

# Start from a clean slate: ask PyTorch to release its cached GPU memory,
# then run Python's garbage collector.
# gc.collect() is the last expression of the cell, so its return value
# is what the notebook shows as the cell output.
torch.cuda.empty_cache()
gc.collect()

0

### Load Pre-trained Model

In [3]:
import pickle

# Absolute path to the fine-tuned solubility checkpoint.
dPathSolubility = '/home/tailab/LitGene/models/solubility/best_model.pth'

pool = "mean"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Labelled solubility data; only its `Label` column is used here (to size the output layer).
genes = pd.read_csv("data/combined_solubility.csv")
model_name= "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

# create instance of model
model = FineTunedBERT(pool=pool,
                      model_name=model_name,
                      gene2vec_flag=False,
                      gene2vec_hidden=200,
                      task_type="unsupervised",
                      n_labels=len(set(genes.Label)),
                      device=device).to(device)

# The checkpoint holds a whole pickled model object (later cells read
# `sol_model.module`), not just a state dict.
# map_location remaps the stored tensors onto `device`, so the load also works
# on a CPU-only machine instead of failing while restoring CUDA tensors.
# NOTE: torch.load unpickles arbitrary Python objects -- only load checkpoints
# that come from a trusted source.
sol_model = torch.load(dPathSolubility, map_location=device)



In [4]:
# Pull the trained weights out of the checkpoint's wrapped module and
# copy them into the freshly constructed `model`.
sol_state_dict = sol_model.module.state_dict()
model.load_state_dict(sol_state_dict)

<All keys matched successfully>

### Get Embeddings

In [5]:
# Location of the gene table (CSV); later cells read its
# "Summary" and "Gene name" columns.
geneDataPath = "/home/tailab/LitGene/data/clean_genes.csv"
all_genes = pd.read_csv(filepath_or_buffer=geneDataPath)

In [6]:
# Run every gene summary through the fine-tuned solubility model and collect,
# per gene, its embedding and its solubility prediction scores.
sol_embeddings, sol_preds = getEmbeddings(
    all_genes["Summary"].tolist(),
    batch_size=20,
    model=sol_model.module,
    return_preds=True,
)

Loading a pretrained model ...




tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Tokenization ...
Tokenization Done.
Get Embeddings ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 723/723 [01:39<00:00,  7.25it/s]


(14450, 768)
(14450, 2)


In [7]:
# Turn the per-gene prediction scores into readable labels:
# the highest-scoring column decides the class (1 -> 'Soluble', anything else -> 'Insoluble').
# Rows are indexed by gene name, with the index itself named 'Gene'.
df_preds = pd.DataFrame(
    {'preds': ['Soluble' if cls == 1 else 'Insoluble'
               for cls in np.argmax(sol_preds, axis=1)]},
    index=pd.Index(all_genes["Gene name"].to_list(), name='Gene'),
)

In [8]:
# Show the prediction table; a bare last expression gets the notebook's rich DataFrame rendering.
df_preds

Unnamed: 0_level_0,preds
Gene,Unnamed: 1_level_1
FES,Soluble
HADHA,Insoluble
SLC7A7,Insoluble
LCK,Insoluble
HSPA2,Soluble
...,...
BPY2C,Soluble
CLPS,Insoluble
DNER,Insoluble
SOX7,Soluble
