In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# Prefer the GPU when one is visible to torch; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer and encoder are loaded from the same BioBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = model.to(device)

In [2]:
def replace_longest_text_with_halves(texts):
    """Return a new list in which the longest string is split into two halves.

    The longest element (the first one on ties) is cut at its character
    midpoint (``len // 2``) and replaced, in position, by the two pieces, so
    the result has exactly one more element than ``texts``.

    The input sequence is NOT modified; callers must use the return value.

    Args:
        texts: sequence of strings.

    Returns:
        A new list with the longest string replaced by its two halves. An
        empty (falsy) input is returned unchanged.
    """
    if not texts:
        return texts

    longest_idx = max(range(len(texts)), key=lambda i: len(texts[i]))
    longest = texts[longest_idx]
    mid = len(longest) // 2

    # Build a fresh list rather than slice-assigning into ``texts`` so the
    # caller's list is never altered behind its back.
    return (
        list(texts[:longest_idx])
        + [longest[:mid], longest[mid:]]
        + list(texts[longest_idx + 1:])
    )

def _tokenize_to_device(sentences, max_tokens=None):
    """Tokenize ``sentences`` as one padded batch and move the tensors to ``device``.

    When ``max_tokens`` is given, every sentence is truncated to at most that
    many tokens; otherwise no truncation is applied.
    """
    extra = {} if max_tokens is None else {"truncation": True, "max_length": max_tokens}
    batch = tokenizer(
        sentences,
        add_special_tokens=True,
        padding=True,
        return_tensors="pt",
        **extra,
    )
    # The tokenizer already returns tensors, so just move them; re-wrapping
    # them with torch.tensor() would copy the data and raise a UserWarning.
    return {key: value.to(device) for key, value in batch.items()}


def _mean_cls_vector(batch):
    """Run the model on a tokenized batch and average the position-0 ([CLS]) states.

    Returns a CPU tensor of shape (1, hidden_size).
    """
    with torch.no_grad():
        output = model(**batch)
    cls_states = output.last_hidden_state[:, 0, :].to('cpu')
    return torch.unsqueeze(torch.mean(cls_states, dim=0), 0)


def process_text_vectorization_with_pad(text_list, max_tokens=512):
    """Embed each text as the mean of its per-sentence [CLS] vectors.

    Every text is split on ``'. '``; the resulting sentences are encoded as a
    single padded batch, and the position-0 hidden states of all sentences are
    averaged into one vector per text.

    If the padded batch is longer than ``max_tokens``, the longest sentence is
    split in half once and the batch is re-tokenized. That retry is tokenized
    with truncation to ``max_tokens`` so that a sentence which is still too
    long after halving cannot overflow the model's input length.

    Args:
        text_list: sequence of strings to embed.
        max_tokens: maximum padded sequence length passed to the model.

    Returns:
        A list with one CPU tensor of shape (1, hidden_size) per input text.
    """
    cls_list = []
    for text in tqdm(text_list):
        sentences = text.split('. ')
        batch = _tokenize_to_device(sentences)

        if batch['input_ids'].size(dim=1) > max_tokens:
            sentences = replace_longest_text_with_halves(sentences)
            batch = _tokenize_to_device(sentences, max_tokens=max_tokens)

        cls_list.append(_mean_cls_vector(batch))
    return cls_list

In [3]:
# Load the two comma-separated text tables from the working directory.
title_abstract_texts_X = pd.read_csv(
    filepath_or_buffer='title_abstract_texts_X.csv',
    sep=',',
)
title_abstract_texts_all = pd.read_csv(
    filepath_or_buffer='title_abstract_texts_all.csv',
    sep=',',
)

In [4]:
# Pull the column named '0' out of each table as a plain Python list.
texts_X = [text for text in title_abstract_texts_X['0'].values]
all_texts = [text for text in title_abstract_texts_all['0'].values]

In [5]:
def add_space_after_period(text):
    """Insert a space after periods that are glued to the following text.

    A space is added only when the period is immediately followed by a
    character that is not whitespace, a digit, or another period. As a result:

    * ``"One.Two"`` becomes ``"One. Two"``;
    * decimals such as ``"0.05"`` are left intact, so a later split on
      ``'. '`` does not cut a number in two;
    * periods already followed by whitespace, runs like ``"..."``, and a
      period at the very end of the string are left unchanged.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    import re  # local import keeps this helper self-contained

    return re.sub(r'\.(?=[^\s\d.])', '. ', text)

# A text whose longest '. '-delimited chunk exceeds 512 characters is rewritten
# in place with add_space_after_period so that it splits into shorter chunks.
for idx, abstract in enumerate(tqdm(texts_X)):
    longest_chunk = max(len(chunk) for chunk in abstract.split('. '))
    if longest_chunk > 512:
        texts_X[idx] = add_space_after_period(abstract)

100%|██████████| 8676/8676 [00:00<00:00, 46754.81it/s]


In [6]:
# Embed every text in texts_X; the result is a list with one (1, hidden_size)
# CPU tensor per text. The recorded run took about 4 minutes for 8676 texts.
text_vect_X = process_text_vectorization_with_pad(texts_X)

100%|██████████| 8676/8676 [04:04<00:00, 35.41it/s] 


In [7]:
# Each entry of text_vect_X has a leading axis of size 1; take that single row
# as a NumPy vector and stack all rows into one 2-D array (one row per text).
cls_X_array = np.array([vec[0].numpy() for vec in text_vect_X])
cls_X_matrix = pd.DataFrame(cls_X_array)
cls_X_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.142937,-0.010388,-0.300239,0.045603,-0.224271,0.088998,0.058248,-0.025525,0.244006,-0.008804,...,0.076133,0.173372,-0.241930,0.161019,0.088807,0.111223,0.104462,-0.260264,0.255358,-0.133687
1,0.240684,-0.072100,-0.284138,0.157778,-0.463088,0.019786,0.067615,-0.060429,0.253169,-0.220929,...,0.042961,0.313909,-0.149251,-0.050656,0.213824,0.329547,0.132560,-0.298476,0.017470,-0.192766
2,0.345509,-0.062046,-0.108999,0.108394,-0.623241,-0.025925,0.004161,-0.091543,0.199448,-0.100404,...,0.114594,0.371746,-0.270757,-0.089838,0.109995,0.336510,0.136897,-0.386282,0.062447,-0.077958
3,0.169847,0.044081,-0.186888,0.110598,-0.346632,-0.030911,0.120177,0.159316,0.360916,-0.222897,...,0.113273,0.240852,-0.253946,0.023377,-0.064863,0.363694,0.146908,-0.102513,-0.007998,-0.346613
4,-0.059993,0.206300,-0.111001,0.072121,-0.514626,0.122562,-0.300802,0.178734,0.326506,-0.181128,...,0.153605,0.418970,-0.337501,0.021896,0.195015,0.189413,0.160346,-0.120846,0.139140,-0.064849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8671,0.514404,-0.341991,-0.277758,-0.281315,-0.900135,0.088724,-0.115461,-0.150830,0.266347,-0.638464,...,0.043787,0.051494,-0.736108,0.004996,0.224178,0.559863,0.073966,-0.292186,0.474752,-0.067334
8672,0.438990,-0.280852,-0.160663,-0.032951,-1.005837,0.275318,-0.108462,-0.137402,0.193869,-0.638665,...,-0.070142,0.079129,-0.458560,-0.096629,0.211857,0.641127,0.053482,-0.344218,0.601001,-0.018824
8673,0.382158,-0.150904,-0.287280,-0.181921,-0.814225,0.091256,-0.172458,-0.042342,0.266287,-0.636699,...,0.019253,0.023700,-0.564028,-0.023283,0.184329,0.701509,0.046647,-0.392832,0.548401,-0.008058
8674,0.544432,-0.274354,-0.198821,0.058009,-1.008883,0.044100,-0.268683,-0.269985,0.220555,-0.667259,...,0.086059,0.054656,-0.790563,0.070375,0.164737,0.594265,0.113003,-0.400263,0.381107,-0.027618


In [8]:
# Save the texts_X embedding matrix as CSV; index=False leaves out the row index.
cls_X_matrix.to_csv('cls_X_matrix_biobert.csv', index=False)

In [9]:
# Same normalisation as for texts_X: a text whose longest '. '-delimited chunk
# exceeds 512 characters is rewritten in place with add_space_after_period.
for idx, abstract in enumerate(tqdm(all_texts)):
    longest_chunk = max(len(chunk) for chunk in abstract.split('. '))
    if longest_chunk > 512:
        all_texts[idx] = add_space_after_period(abstract)

100%|██████████| 73154/73154 [00:00<00:00, 122860.90it/s]


In [10]:
# Embed every text in all_texts; the result is a list with one (1, hidden_size)
# CPU tensor per text. The recorded run took about 38 minutes for 73154 texts.
vect_abs_all = process_text_vectorization_with_pad(all_texts)

100%|██████████| 73154/73154 [37:39<00:00, 32.38it/s]


In [11]:
# Each entry of vect_abs_all has a leading axis of size 1; take that single row
# as a NumPy vector and stack all rows into one 2-D array (one row per text).
cls_all_array = np.array([vec[0].numpy() for vec in vect_abs_all])
cls_all_matrix = pd.DataFrame(cls_all_array)
cls_all_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.152789,-0.057335,-0.201124,0.137978,-0.451061,0.098379,-0.097089,0.142553,0.413031,-0.116714,...,0.188383,0.186859,-0.063542,-0.130815,-0.220174,0.168822,-0.000159,-0.194338,0.269422,-0.150680
1,0.157053,-0.113428,0.063330,0.175639,-0.637485,-0.015452,-0.499921,-0.151592,0.181730,-0.171021,...,0.357190,0.232132,-0.058162,-0.028964,0.000345,0.213508,0.149837,-0.350822,0.080811,-0.046752
2,0.190852,-0.074956,-0.151756,0.151558,-0.571585,0.057746,-0.283601,-0.194791,0.231604,-0.259788,...,0.271830,0.293320,-0.014891,-0.089742,0.002669,0.196212,0.051039,-0.354713,0.035299,0.021342
3,0.223215,-0.200790,-0.054527,0.212024,-0.492094,0.001122,-0.127933,-0.093847,0.415823,-0.084688,...,0.178366,0.178773,-0.006907,-0.225145,-0.164677,0.156718,-0.131933,-0.377415,0.178100,-0.116207
4,0.238538,-0.250770,-0.110454,0.099299,-0.645052,-0.031172,-0.093688,-0.132495,0.298149,-0.155859,...,0.074458,0.217701,-0.007661,-0.177414,-0.132766,0.176126,0.037410,-0.383047,0.182842,-0.093565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73149,0.328874,-0.041704,-0.251929,0.008888,-0.403652,0.069325,-0.171900,0.115498,0.279273,-0.239867,...,0.087497,0.303288,-0.137332,0.240859,-0.036940,0.029911,-0.020448,-0.437462,-0.059018,-0.067692
73150,0.518474,-0.180379,-0.216814,-0.165273,-0.844066,-0.090597,-0.053323,-0.115919,0.268101,-0.615088,...,-0.111789,0.095170,-0.633553,0.014044,0.242705,0.604467,0.070909,-0.398274,0.492807,-0.165588
73151,0.532831,-0.201155,-0.339364,-0.046115,-0.834949,0.152010,-0.056959,-0.085662,0.369053,-0.622308,...,-0.137428,0.019984,-0.753576,0.134231,0.341834,0.599596,0.063056,-0.344714,0.284223,-0.136012
73152,0.333343,-0.088470,-0.216634,0.053493,-0.562578,0.073285,-0.157093,0.074940,0.325587,-0.224736,...,0.190844,0.358786,-0.069722,0.005553,-0.066968,0.198251,0.093936,-0.345373,0.005680,-0.327663


In [12]:
# Save the all_texts embedding matrix as CSV; index=False leaves out the row index.
cls_all_matrix.to_csv('cls_all_matrix_biobert.csv', index=False)