In [None]:
import torch
from transformers import BertTokenizer,BertModel,BertForPreTraining,BertForQuestionAnswering
from keras.layers import Embedding, Dense, Dropout, Reshape, concatenate
import numpy as np
import glob
import os
import pickle

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
import nltk
# Download the 'punkt' tokenizer data; sent_tokenize is used later in this
# notebook to split articles into sentences. The call's return value is the
# cell output shown below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Single place to change the checkpoint used for both tokenizer and encoder.
PRETRAINED_CHECKPOINT = 'bert-base-cased'

# Use the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# output_hidden_states=True is required further down: the embedding helpers
# read the per-layer hidden states from the model output.
bert_model = BertModel.from_pretrained(
    PRETRAINED_CHECKPOINT,
    output_hidden_states=True,
)

def get_individual_token_ids(tokenizer, sentence, T=120):
    """Tokenize one sentence and build fixed-length BERT input tensors.

    The sentence is split into word pieces, wrapped in ``[CLS]`` / ``[SEP]``
    and right-padded with ``[PAD]`` up to ``T`` positions. Sentences that
    would not fit are truncated so that the returned tensors never exceed
    ``T`` positions (previously an over-long sentence produced tensors longer
    than ``T``).

    Args:
        tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        sentence: Raw sentence text.
        T: Total sequence length including ``[CLS]`` and ``[SEP]``; expected
            to be at least 2.

    Returns:
        A 4-tuple ``(tokens, token_ids, attn_mask, seg_ids)`` where ``tokens``
        is the unpadded token list (with ``[CLS]``/``[SEP]``) and the other
        three are tensors with a leading batch axis of size 1.
    """
    # Two of the T positions are reserved for the [CLS] and [SEP] markers.
    word_pieces = tokenizer.tokenize(sentence)[:max(T - 2, 0)]
    tokens = ['[CLS]'] + word_pieces + ['[SEP]']

    padded_tokens = tokens + ['[PAD]'] * max(T - len(tokens), 0)
    # 1 marks a real token, 0 marks padding.
    attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]

    # NOTE(review): single-sentence BERT inputs conventionally use segment id
    # 0; the all-ones value is kept so existing embeddings stay reproducible.
    seg_ids = [1] * len(padded_tokens)
    sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

    # Add the batch axis expected by the model.
    token_ids = torch.tensor(sent_ids).unsqueeze(0)
    attn_mask = torch.tensor(attn_mask).unsqueeze(0)
    seg_ids = torch.tensor(seg_ids).unsqueeze(0)

    return tokens, token_ids, attn_mask, seg_ids


def get_embedding(last_1_layer, last_2_layer, last_3_layer, last_4_layer, T=120):
    """Average four hidden-state layers position by position.

    Each layer argument is indexed by token position for positions
    ``0 .. T-1``; the four rows at a position are summed in argument order
    and divided by 4.0.

    Returns:
        A list of ``T`` tensors, one averaged vector per position.
    """
    remaining_layers = (last_2_layer, last_3_layer, last_4_layer)

    def _mean_at(position):
        # Accumulate in the same left-to-right order as the argument list.
        running_sum = last_1_layer[position]
        for layer in remaining_layers:
            running_sum = torch.add(running_sum, layer[position])
        return torch.div(running_sum, 4.0)

    return [_mean_at(position) for position in range(T)]


def get_embedding_from_bert(bert_model, token_ids, attn_mask, seg_ids, num_layers=4, T=120, device=device):
    """Run BERT once and return per-token embeddings averaged over the last layers.

    Args:
        bert_model: Model whose output's last element is the tuple of
            per-layer hidden states (the notebook builds it with
            ``output_hidden_states=True``).
        token_ids, attn_mask, seg_ids: Input tensors with a leading batch
            axis; they are copied to ``device`` for the forward pass.
            Assumes batch size 1, so ``squeeze(dim=0)`` drops the batch axis.
        num_layers: How many of the final hidden-state layers to average.
            Any positive value is honoured (previously only 4 worked).
        T: Maximum number of positions to return.
        device: Device the forward pass runs on.

    Returns:
        A list with one averaged tensor per non-padding position, i.e. the
        first ``min(T, number of non-zero attention-mask entries)`` positions.
        Assumes padding sits on the right, as the tokenization helper in this
        notebook produces.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        # A slice of [-0:] would silently select every layer.
        raise ValueError('num_layers must be >= 1, got {}'.format(num_layers))

    bert_model.eval()

    with torch.no_grad():
        model_outputs = bert_model(
            token_ids.to(device),
            attention_mask=attn_mask.to(device),
            token_type_ids=seg_ids.to(device),
        )

    selected_layers = [
        torch.squeeze(hidden_state, dim=0)
        for hidden_state in model_outputs[-1][-num_layers:]
    ]
    layer_count = float(len(selected_layers))

    # Counted with torch so the mask may live on any device.
    real_token_count = int((attn_mask != 0).sum().item())

    token_list_embedding = []
    # Padding positions are dropped from the result, so skip computing them.
    for index in range(min(T, real_token_count)):
        token = selected_layers[0][index]
        for layer in selected_layers[1:]:
            token = torch.add(token, layer[index])
        # torch.div allocates a new tensor, so each entry owns its storage
        # rather than being a view into the full hidden-state matrix.
        token_list_embedding.append(torch.div(token, layer_count))

    return token_list_embedding




def bert_embedding_individuals(output_path, output_file, sentences, tokenizer, bert_model, T=120, device=device):
    """Compute one BERT [CLS] embedding per sentence.

    Args:
        output_path: Unused; kept so existing call sites keep working.
        output_file: Unused; kept so existing call sites keep working.
        sentences: Iterable of sentence strings.
        tokenizer: Tokenizer passed through to ``get_individual_token_ids``.
        bert_model: Model passed through to ``get_embedding_from_bert``.
        T: Padded sequence length used for every sentence.
        device: Device the model forward pass runs on.

    Returns:
        A list aligned with ``sentences``: the embedding tensor of the first
        token (``[CLS]``) for each sentence, or an empty list for an empty
        sentence.

    Raises:
        RuntimeError: If tokenizing or embedding a sentence fails; the
            original exception is chained as the cause.
    """
    sentence_embeddings = []

    for sent_id, sentence in enumerate(sentences):
        if len(sentence) == 0:
            print(sent_id, 'empty sentence')
            # Placeholder keeps the output aligned with the input order.
            sentence_embeddings.append([])
            continue

        if sent_id > 0 and sent_id % 50 == 0:
            print('processed {} sentences'.format(sent_id))

        try:
            tkns, token_ids, attn_mask, seg_ids = get_individual_token_ids(tokenizer, sentence, T)
            token_list_embedding = get_embedding_from_bert(bert_model, token_ids, attn_mask, seg_ids, T=T, device=device)
        except Exception as e:
            # Fail loudly with the sentence index: printing and calling
            # exit() loses the traceback and, inside a notebook kernel, does
            # not stop the loop, which would leave the result misaligned.
            raise RuntimeError('failed to embed sentence {}: {}'.format(sent_id, e)) from e

        assert tkns[0] == '[CLS]'
        # Position 0 is the [CLS] token; its vector is the sentence embedding.
        sentence_embeddings.append(torch.squeeze(token_list_embedding[0]))

    return sentence_embeddings
# Move the model to the selected device. Being the cell's last expression,
# the returned module is also what gets displayed as the cell output.
bert_model.to(device)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Smoke test: embed a single sentence and display the length of its vector.
example = ["This is an example"]
example_output = bert_embedding_individuals(
    output_path="f",   # placeholder, not used by the function
    output_file="f",   # placeholder, not used by the function
    sentences=example,
    tokenizer=tokenizer,
    bert_model=bert_model,
    T=120,
    device=device,
)
len(example_output[0])

768

In [None]:
import pandas as pd

# Tab-separated file without a header row, so columns are addressed by
# integer position (0 and 1 are read further down in the notebook).
nytedu = pd.read_csv(
    '/content/nyt-pol.txt',
    sep='\t',
    header=None,
)
# One dict per row, keyed by column position.
nytedu_d = nytedu.to_dict(orient='records')
len(nytedu_d)

4133

In [None]:
import os
import pickle
import pandas as pd
import nltk
import torch
from nltk.tokenize import sent_tokenize

# Directory receiving one pickle per article.
OUTPUT_DIR = "/content/Bert-Nyt-Pol/"
# open(..., 'wb') does not create missing directories, so make sure it exists
# before the loop; a fresh runtime would otherwise fail on the first article.
os.makedirs(OUTPUT_DIR, exist_ok=True)

nytedu = pd.read_csv('/content/nyt-pol.txt', sep='\t', header=None)
nytedu_d = nytedu.to_dict('records')

# Maps str(article index) -> {"label", "article", "bert_emb"}.
dataset_embeddings_2 = {}

count_id = 0
for s in nytedu_d:
    l = s[0]      # column 0: label
    sent = s[1]   # column 1: article text
    sent_tok = sent_tokenize(sent)
    sentence_bert = bert_embedding_individuals("f", "f", sent_tok, tokenizer, bert_model, T=120, device=device)

    # Store CPU copies: a pickled CUDA tensor cannot be unpickled on a machine
    # without a GPU. Empty sentences are represented by [] and pass through.
    sentence_bert = [emb.cpu() if torch.is_tensor(emb) else emb for emb in sentence_bert]

    dataset_embeddings_2[str(count_id)] = {
        "label": l,
        "article": sent,
        "bert_emb": sentence_bert,
    }

    # One file per article so a partial run still leaves usable output.
    with open(os.path.join(OUTPUT_DIR, str(count_id) + '.pickle'), 'wb') as handle:
        pickle.dump(dataset_embeddings_2[str(count_id)], handle, protocol=pickle.HIGHEST_PROTOCOL)

    count_id += 1
    print(count_id)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2528
2529
2530
processed 50 sentences
2531
2532
2533
2534
processed 50 sentences
2535
2536
2537
2538
2539
processed 50 sentences
2540
2541
2542
2543
2544
2545
2546
2547
2548
processed 50 sentences
2549
2550
2551
processed 50 sentences
processed 100 sentences
processed 150 sentences
processed 200 sentences
processed 250 sentences
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
processed 50 sentences
2575
2576
2577
2578
processed 50 sentences
2579
processed 50 sentences
2580
processed 50 sentences
2581
2582
processed 50 sentences
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
processed 50 sentences
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
processed 50 sentences
processed 100 sentences
proc

In [None]:
# Shell escape: recursively pack the per-article pickle directory into one zip archive.
!zip -r /content/Bert_pol.zip /content/Bert-Nyt-Pol/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/Bert-Nyt-Pol/3174.pickle (deflated 15%)
  adding: content/Bert-Nyt-Pol/591.pickle (deflated 13%)
  adding: content/Bert-Nyt-Pol/1611.pickle (deflated 13%)
  adding: content/Bert-Nyt-Pol/2120.pickle (deflated 13%)
  adding: content/Bert-Nyt-Pol/5230.pickle (deflated 15%)
  adding: content/Bert-Nyt-Pol/1864.pickle (deflated 13%)
  adding: content/Bert-Nyt-Pol/5522.pickle (deflated 15%)
  adding: content/Bert-Nyt-Pol/3569.pickle (deflated 14%)
  adding: content/Bert-Nyt-Pol/1503.pickle (deflated 15%)
  adding: content/Bert-Nyt-Pol/5347.pickle (deflated 15%)
  adding: content/Bert-Nyt-Pol/5486.pickle (deflated 14%)
  adding: content/Bert-Nyt-Pol/3136.pickle (deflated 15%)
  adding: content/Bert-Nyt-Pol/4494.pickle (deflated 15%)
  adding: content/Bert-Nyt-Pol/1973.pickle (deflated 14%)
  adding: content/Bert-Nyt-Pol/6206.pickle (deflated 13%)
  adding: content/Bert-Nyt-Pol/49.pickle (deflated 15%)
  adding: 

In [None]:
from google.colab import files
# Hand the zip archive created above to Colab's download helper.
# This module is only importable inside a Google Colab runtime.
files.download("/content/Bert_pol.zip")