In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# Checkpoint identifier shared by the tokenizer and the encoder so the two cannot drift apart.
MODEL_NAME = "distilbert/distilbert-base-uncased"

# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

In [2]:
def replace_longest_text_with_halves(texts):
    """Split the longest entry of ``texts`` into two halves, in place.

    The first entry with the maximum ``len`` is cut at ``len // 2`` and the
    two pieces take its position, so the list grows by one element. A falsy
    ``texts`` (e.g. an empty list) is returned untouched.

    Returns the same ``texts`` object that was passed in.
    """
    if texts:
        lengths = [len(entry) for entry in texts]
        # list.index returns the first occurrence, so ties go to the earliest entry.
        target_pos = lengths.index(max(lengths))
        target = texts[target_pos]
        cut = len(target) // 2
        # Slice assignment replaces the single entry with its two halves.
        texts[target_pos:target_pos + 1] = [target[:cut], target[cut:]]
    return texts

def _tokenize_within_limit(sentences, max_length):
    """Tokenize ``sentences`` as one padded batch at most ``max_length`` tokens wide.

    While the padded batch is wider than ``max_length``, the longest sentence
    (by character count) is halved via ``replace_longest_text_with_halves`` and
    the batch is re-tokenized. ``sentences`` is modified in place by that helper.
    """
    def encode(batch, **extra):
        return tokenizer(batch, add_special_tokens=True, padding=True,
                         return_tensors="pt", **extra)

    encoded = encode(sentences)
    # Keep halving until the batch fits. The length guard stops the loop once
    # there is nothing left to halve, so it cannot spin forever.
    while encoded['input_ids'].size(dim=1) > max_length and max(map(len, sentences)) > 1:
        sentences = replace_longest_text_with_halves(sentences)
        encoded = encode(sentences)

    if encoded['input_ids'].size(dim=1) > max_length:
        # Last resort (only reachable for a very small max_length): truncate
        # rather than hand the model a sequence it cannot index.
        encoded = encode(sentences, truncation=True, max_length=max_length)
    return encoded


def process_text_vectorization_with_pad(text_list, max_length=512):
    """Embed each text as the mean of its sentences' first-token hidden states.

    Every text is split on ``'. '``, the pieces are tokenized as one padded
    batch and run through ``model`` without gradient tracking. The hidden state
    at position 0 of each piece is averaged over the pieces.

    Args:
        text_list: iterable of strings to embed.
        max_length: maximum padded sequence width passed to the model. Batches
            that exceed it have their longest sentence halved repeatedly until
            they fit. Defaults to 512.

    Returns:
        A list with one CPU tensor per input text; each tensor carries a
        leading dimension of size 1 in front of the averaged vector.
    """
    cls_list = []
    for text in tqdm(text_list):
        sentences = text.split('. ')
        encoded = _tokenize_within_limit(sentences, max_length)
        # The tokenizer already returns tensors, so move them instead of
        # re-wrapping them with torch.tensor (which copies and warns).
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        cls_state = outputs.last_hidden_state[:, 0, :].to('cpu')
        cls_list.append(torch.unsqueeze(torch.mean(cls_state, dim=0), 0))
    return cls_list

In [3]:
# Load the two comma-separated input tables from the working directory.
title_abstract_texts_X = pd.read_csv('title_abstract_texts_X.csv', sep=',')
title_abstract_texts_all = pd.read_csv('title_abstract_texts_all.csv', sep=',')

In [4]:
# Pull the column labelled '0' out of each table as a plain Python list.
# NOTE(review): later cells call .split on every entry, so the column is assumed to hold strings with no missing values — confirm.
texts_X = list(title_abstract_texts_X['0'].values)
all_texts = list(title_abstract_texts_all['0'].values)

In [5]:
def add_space_after_period(text):
    """Insert a space after each period that is glued to the following text.

    A period is left untouched when it is already followed by whitespace,
    when it is the last character of ``text``, or when it sits between two
    digits (a decimal point such as ``0.05``). This keeps a later
    ``split('. ')`` from producing empty or bogus sentence fragments.

    Args:
        text: the string to normalise.

    Returns:
        A new string with the missing spaces inserted.
    """
    import re  # local import keeps the function self-contained within the notebook

    # First alternative: period not preceded by a digit, followed by non-space.
    # Second alternative: period preceded by a digit, followed by a non-space
    # that is not a digit (so "0.05" is preserved but "2020.Next" is fixed).
    glued_period = r'(?<!\d)\.(?=\S)|(?<=\d)\.(?=[^\s\d])'
    return re.sub(glued_period, '. ', text)

# Any text whose longest '. '-delimited piece is over 512 characters is
# rewritten in place through add_space_after_period.
for pos in tqdm(range(len(texts_X))):
    longest_piece = max(len(piece) for piece in texts_X[pos].split('. '))
    if longest_piece > 512:
        texts_X[pos] = add_space_after_period(texts_X[pos])

100%|██████████| 8676/8676 [00:00<00:00, 220984.76it/s]


In [6]:
# Embed every entry of texts_X; the result is a list with one tensor per text.
text_vect_X = process_text_vectorization_with_pad(texts_X)

 62%|██████▏   | 5420/8676 [01:18<01:11, 45.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 8676/8676 [02:11<00:00, 66.21it/s] 


In [7]:
# Take element [0] of each embedding tensor, convert it to NumPy and stack
# the rows into one array, then wrap the array in a DataFrame for display.
cls_X_array = np.array([vec[0].numpy() for vec in text_vect_X])
cls_X_matrix = pd.DataFrame(cls_X_array)
cls_X_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.342722,-0.004484,-0.346638,-0.168926,0.089236,-0.177676,0.106963,-0.032243,-0.079951,-0.288498,...,0.026986,-0.114727,-0.088227,-0.287282,-0.044252,-0.170710,-0.306280,-0.261389,0.096745,0.505607
1,-0.271171,0.006172,-0.288506,-0.173792,0.021662,-0.350131,0.218746,-0.076112,-0.071149,-0.162519,...,0.016456,-0.122094,-0.238867,-0.075716,0.057287,-0.196784,-0.090823,-0.317880,0.237331,0.293788
2,-0.393772,-0.131484,-0.242243,-0.240539,-0.110215,-0.213364,0.203270,-0.003664,-0.069708,-0.128780,...,0.099234,-0.268260,-0.091081,-0.032991,0.214528,-0.159604,-0.200938,-0.300705,0.152454,0.518812
3,-0.238427,0.034744,-0.139065,-0.176169,-0.021488,-0.231425,0.270621,0.008618,-0.052706,-0.271989,...,-0.051795,-0.227093,-0.151738,-0.056780,0.083351,-0.111407,-0.117005,-0.330875,0.181610,0.390639
4,-0.265137,-0.003155,-0.210607,-0.102286,0.035321,-0.245864,0.102843,-0.061909,-0.095439,-0.341508,...,-0.007006,-0.290218,-0.144364,-0.172650,0.004381,-0.099000,-0.082755,-0.363948,0.211828,0.174176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8671,-0.439830,-0.038333,-0.127146,-0.127971,0.054535,-0.032939,0.105858,-0.126868,-0.101397,-0.207630,...,0.173427,-0.051145,-0.143715,-0.003201,0.132133,-0.267748,-0.163375,-0.247358,0.012413,0.285558
8672,-0.414775,-0.000314,-0.160012,-0.088603,0.124238,-0.067471,0.093089,-0.054681,-0.194643,-0.208247,...,0.158904,-0.144230,-0.096640,0.024968,0.056904,-0.348447,-0.085162,-0.117123,0.030773,0.274094
8673,-0.458199,0.023063,-0.132266,-0.131504,0.075839,-0.114167,0.054859,-0.082713,-0.258355,-0.142009,...,0.225330,-0.117032,-0.143580,0.002110,0.033681,-0.303065,-0.110005,-0.203172,0.123372,0.230422
8674,-0.463911,0.063319,-0.073809,-0.170398,-0.107142,-0.143235,0.151161,-0.038538,-0.199302,-0.070077,...,0.216844,-0.099376,-0.203702,0.081775,0.125852,-0.276327,-0.073297,-0.252165,0.088480,0.226375


In [8]:
# Write the embedding matrix to CSV; index=False omits the row-index column.
cls_X_matrix.to_csv('cls_X_matrix_distilbert.csv', index=False)

In [9]:
# Any text whose longest '. '-delimited piece is over 512 characters is
# rewritten in place through add_space_after_period.
for pos in tqdm(range(len(all_texts))):
    longest_piece = max(len(piece) for piece in all_texts[pos].split('. '))
    if longest_piece > 512:
        all_texts[pos] = add_space_after_period(all_texts[pos])

100%|██████████| 73154/73154 [00:00<00:00, 185133.19it/s]


In [10]:
# Embed every entry of all_texts; the result is a list with one tensor per text.
vect_abs_all = process_text_vectorization_with_pad(all_texts)

100%|██████████| 73154/73154 [20:17<00:00, 60.11it/s]


In [11]:
# Take element [0] of each embedding tensor, convert it to NumPy and stack
# the rows into one array, then wrap the array in a DataFrame for display.
cls_all_array = np.array([vec[0].numpy() for vec in vect_abs_all])
cls_all_matrix = pd.DataFrame(cls_all_array)
cls_all_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.453605,0.012366,-0.178650,-0.258654,-0.286272,-0.417724,0.261367,-0.196465,0.077612,-0.075825,...,0.119186,-0.305560,-0.086806,0.041364,-0.054232,-0.192086,-0.026442,-0.286045,0.253470,0.517443
1,-0.495989,-0.140710,-0.367238,-0.256807,-0.236114,-0.409673,0.347734,-0.337472,0.016361,0.167308,...,0.184742,-0.222091,-0.317151,0.013867,0.175589,-0.136564,-0.084568,-0.449286,0.192626,0.410023
2,-0.330818,-0.032672,-0.199075,-0.097819,-0.165409,-0.341445,0.252178,-0.238427,-0.008398,0.008714,...,0.122697,-0.224668,-0.151590,0.084835,0.057924,-0.242489,-0.121389,-0.374278,0.318736,0.363864
3,-0.441715,-0.187203,-0.290107,-0.123336,0.012996,-0.208763,0.234556,-0.343167,-0.043320,-0.048483,...,0.186551,-0.119274,-0.166094,0.036903,-0.062789,-0.125323,-0.107254,-0.378749,0.293355,0.367476
4,-0.432027,-0.043028,-0.212612,-0.091781,-0.132811,-0.320810,0.284429,-0.337681,-0.035206,0.054850,...,0.158630,-0.279671,-0.200459,0.051225,0.076997,-0.141877,-0.039395,-0.420144,0.329266,0.499865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73149,-0.331676,-0.138535,-0.219567,-0.180977,-0.086277,-0.310843,0.288212,-0.134300,0.009344,-0.062460,...,0.217527,-0.127800,-0.108598,-0.012845,0.114136,-0.167917,-0.141377,-0.328477,0.232506,0.424495
73150,-0.471300,-0.064664,-0.171229,-0.192913,-0.080736,-0.072503,0.284685,0.051779,-0.077687,-0.198709,...,0.059499,-0.118452,-0.131988,-0.024166,0.243674,-0.320150,-0.162332,-0.307133,0.005646,0.156975
73151,-0.449420,-0.049944,-0.119818,-0.202774,-0.006033,-0.146735,0.224019,-0.013027,-0.125910,-0.162897,...,0.194342,-0.106351,-0.135323,0.097838,0.180262,-0.156133,-0.122205,-0.185664,0.061446,0.207899
73152,-0.364177,-0.202038,-0.162660,-0.252921,-0.001226,-0.321560,0.210685,-0.103481,0.072021,-0.151687,...,0.141423,-0.067063,-0.091468,-0.045794,0.118179,-0.058068,-0.177614,-0.306548,0.154541,0.429382


In [12]:
# Write the embedding matrix to CSV; index=False omits the row-index column.
cls_all_matrix.to_csv('cls_all_matrix_distilbert.csv', index=False)