# Implementing BERTweet
    이 노트북은 Huggingface의 BERTweet 모델을 이용해 기존 raw data를 BERTweet 벡터 결과물로 변환하는 코드이다.

## Import Libraries and Pretrained model 

In [4]:
import pandas as pd
import torch

from transformers import AutoModel, AutoTokenizer 
import logging

In [None]:
# Load the raw tweet table from disk.
# Per the original author's note, the file carries a `text` column (e.g. "BREAKING: Armed man
# takes hostage in ...") and a `text_token` column (e.g. ['breaking', 'armed', ...]).
raw_text = pd.read_csv(filepath_or_buffer='./data/raw_text_tokens.csv')

### 밑의 코드는 아래와 같은 결과를 출력한다.

    'BREAKING: Armed man takes hostage in kosher grocery east of Paris http://t.co/PBs3sMwhLt'
    
=> *BaseModelOutputWithPoolingAndCrossAttentions*
- 이 객체는 last_hidden_state와 pooler_output 텐서를 보유한다.

In [8]:
# Peek at the first tweet; it is the running example for the cells that follow.
raw_text["text"][0]

'BREAKING: Armed man takes hostage in kosher grocery east of Paris http://t.co/PBs3sMwhLt'

In [10]:
# Load the pretrained BERTweet encoder (PyTorch weights).
# TensorFlow 2.0+ alternative:
#   from transformers import TFAutoModel
#   bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# `use_fast=False` is the form used for transformers v4.x+.
# For transformers v3.x the call is:
#   tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

# Encode the first tweet into vocabulary ids and add a leading batch axis.
input_ids = torch.tensor(tokenizer.encode(raw_text.text[0])).unsqueeze(0)

# Forward pass with gradient tracking switched off.
# The result is a model-output object exposing `last_hidden_state` and `pooler_output`.
with torch.no_grad():
    features = bertweet(input_ids)

# print(features)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0480,  0.3487,  0.1595,  ..., -0.0423, -0.3147, -0.0868],
         [ 0.1590, -0.0998,  0.3287,  ..., -0.3114, -0.2305, -0.0202],
         [ 0.2081,  0.0818,  0.4836,  ..., -0.0545, -0.0031, -0.1902],
         ...,
         [-0.3463,  0.2122,  0.0758,  ...,  0.2262, -0.4408, -0.0811],
         [-0.0756, -0.1232, -0.0746,  ...,  0.0938,  0.0649, -0.0071],
         [-0.0688,  0.3809,  0.1822,  ..., -0.0761, -0.2949, -0.0837]]]), pooler_output=tensor([[ 2.8395e-01, -1.5147e-01,  2.4038e-02, -2.1575e-01,  2.1765e-02,
         -1.5563e-01,  1.0030e-01, -9.7862e-02,  2.0039e-01, -1.8787e-01,
          1.9666e-02, -4.1347e-02, -1.3147e-01,  3.1222e-02,  2.9899e-01,
         -1.3644e-01, -2.3771e-01, -4.6295e-02,  6.6188e-02,  2.8947e-03,
         -1.7158e-01, -9.1481e-02,  2.9454e-01,  7.3388e-02,  2.198

## Processing the entire raw_text corpus

In [141]:
# Split every tweet in the corpus into BERTweet sub-word tokens (one token list per tweet).
tokens = list(map(tokenizer.tokenize, raw_text.text))

In [100]:
# Re-encode the first tweet as a batch of one and run it through the model
# without building an autograd graph.
first_tweet = raw_text.text[0]
input_ids = torch.tensor(tokenizer.encode(first_tweet)).unsqueeze(0)

with torch.no_grad():
    features = bertweet(input_ids)

torch.Size([1, 25, 768])
torch.Size([25, 768])
torch.Size([25, 768])
torch.Size([768])
Our final sentence embedding vector of shape: torch.Size([768])


In [140]:
# Inspect the tensor of vocabulary ids that was fed to the model.
input_ids  # input sentence's word indices

tensor([[    0, 19481, 31440,    22, 19959,   171,   956, 20778,    16,  3322,
         10794,  8923,  3420,    15,  3177, 45565, 11412, 46442,   423,   698,
           423,   455,  2938, 31429,     2]])

In [139]:
# Hidden states of the last layer; the recorded run shows shape [1, 25, 768].
last_hidden = features.last_hidden_state
print(last_hidden.shape)

# Averaging over axis 0 only collapses the leading (batch) axis, so this value is
# still one vector per token; it is overwritten a few lines below.
sentence_embedding = last_hidden.mean(dim=0)
print(sentence_embedding.shape)

# Pick the last entry along the leading axis to get the per-token vectors.
token_vecs = last_hidden[-1]
print(token_vecs.shape)

# Averaging the per-token vectors yields the single sentence vector.
sentence_embedding = token_vecs.mean(dim=0)
print(sentence_embedding.shape)

print ("Our final sentence embedding vector of shape:", sentence_embedding.shape)

In [143]:
# Display the current value of `sentence_embedding`.
sentence_embedding

tensor([-5.0837e-02,  9.2732e-02,  1.1474e-01,  7.5394e-02,  1.0709e-01,
         1.3905e-01,  7.3160e-02,  1.3031e-01,  2.2848e-01, -1.0013e-01,
        -1.4994e-02,  1.1385e-01, -1.5093e-01,  1.2282e-01,  1.0534e-01,
         2.8266e-01, -9.2135e-03,  2.6026e-02,  3.8447e-01,  5.2748e-03,
        -2.9484e-02,  8.4773e-03,  1.3608e-01,  8.6456e-02,  1.8791e-01,
        -9.6116e-03,  6.7772e-02,  1.6709e-02,  2.5986e-01,  3.6181e-02,
        -3.3826e-02, -3.3800e-02,  1.2717e-01, -3.4795e-02,  1.8405e-01,
        -2.0551e-01,  3.6112e-02,  3.1310e-01,  2.8765e-02,  6.6649e-02,
        -2.6885e-02, -3.0104e-01,  2.5878e-02,  2.2134e-01,  4.1341e-01,
        -4.6079e-02, -8.6279e-03, -8.9164e-02,  1.8050e-01, -3.8341e-02,
         6.2557e-02,  1.9168e-02,  2.9405e-01, -1.2726e-01, -7.2294e-02,
        -1.6538e-01,  1.6099e-01,  3.1980e-01,  2.2167e-01, -4.7767e-02,
         4.3942e-02, -9.4910e-02,  1.6816e-01, -9.8562e-02,  2.3590e-01,
         8.3192e-03,  1.7944e-01, -4.2238e-02,  2.3

In [111]:
# List the sub-word tokens of the first tweet next to their positions.
first_tweet_tokens = tokens[0]
for i, token_str in enumerate(first_tweet_tokens):
    print(i, token_str)

0 BREA@@
1 KING@@
2 :
3 Armed
4 man
5 takes
6 hostage
7 in
8 ko@@
9 sher
10 grocery
11 east
12 of
13 Paris
14 http://@@
15 t.co/@@
16 PB@@
17 s@@
18 3@@
19 s@@
20 M@@
21 wh@@
22 Lt


In [None]:
from scipy.spatial.distance import cosine

# NOTE(review): `token_vecs_sum` is not defined in any cell visible here, so this cell
# raises NameError as written (it has no recorded execution count). The comments below
# talk about the word "bank", which does not occur in the example tweet; presumably the
# cell was carried over from a different example. TODO confirm the intended vectors and
# token positions before relying on it.
#
# `cosine` returns a distance, hence `1 - cosine(...)` to obtain a similarity.

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "river bank" (different meanings).
diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "bank vault" (same meaning).
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

In [36]:
# Show the first tweet again as a reference for the token/id listing.
raw_text["text"][0]

'BREAKING: Armed man takes hostage in kosher grocery east of Paris http://t.co/PBs3sMwhLt'

In [68]:
# Tokenize the whole corpus: one list of sub-word tokens per tweet.
tokens = [tokenizer.tokenize(t) for t in raw_text["text"]]

In [67]:
# Vocabulary ids for the second tweet only.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens[1])
# Vocabulary ids for every tweet, in the same order as `tokens`.
texts_inverse = list(map(tokenizer.convert_tokens_to_ids, tokens))

In [74]:
# Print each token of the first tweet alongside its vocabulary index.
pairs = zip(tokens[0], texts_inverse[0])
for tup in pairs:
    print('{} {}'.format(*tup))

BREA@@ 19481
KING@@ 31440
: 22
Armed 19959
man 171
takes 956
hostage 20778
in 16
ko@@ 3322
sher 10794
grocery 8923
east 3420
of 15
Paris 3177
http://@@ 45565
t.co/@@ 11412
PB@@ 46442
s@@ 423
3@@ 698
s@@ 423
M@@ 455
wh@@ 2938
Lt 31429


## Bulk Embedding

In [294]:
from torch.utils.data import Dataset, DataLoader

In [295]:
# Keep only tweets that actually have text.
# `dropna()` returns a new Series; the previous `dropna(inplace=True)` mutated the Series
# object taken from `raw_text`, which made the cell non-idempotent and left `raw_text`
# in a state that depended on how often this cell had been run.
text = raw_text.text.dropna()
# text = text.sample(frac=0.4, random_state=999)

# Load the slow tokenizer and the pretrained encoder explicitly so that this section does
# not depend on what the names `tokenizer` / `bertweet` were bound to by earlier cells.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# One list of sub-word tokens per tweet, in the order of `text`.
sents = [tokenizer.tokenize(tweet) for tweet in text]
len(sents)

In [302]:
# Inference only: put the encoder in evaluation mode.
bertweet.eval()

# One mean-pooled vector per tweet. Pooling happens inside the loop so that only the
# pooled vector is retained; keeping every full model output (last_hidden_state plus
# pooler_output) until the end grows memory with the size of the corpus for no benefit.
embeddings = []

for sent in sents:
    # `sent` is an already tokenized tweet; encode maps it to ids, wrapped as a batch of one.
    input_ids = torch.tensor([tokenizer.encode(sent)])
    with torch.no_grad():
        # The model returns an output object exposing `last_hidden_state`.
        features = bertweet(input_ids)
    # Average the per-token vectors of the (single) sequence into one sentence vector.
    embeddings.append(torch.mean(features.last_hidden_state[-1], dim=0))

# Plain Python lists, one row per tweet, ready to be turned into a DataFrame.
result = [embedding.tolist() for embedding in embeddings]

In [308]:
# Tabulate the pooled vectors: one row per tweet, one column per embedding dimension.
df_bertweet = pd.DataFrame(data=result)
print(df_bertweet.shape)

(5802, 768)

In [3]:
# Persist the embedding table without writing the row index as a column.
df_bertweet.to_csv(path_or_buf='./bertweet.csv', index=False)

NameError: name 'df_bertweet' is not defined

### Save the file

In [46]:
# Read back the saved embedding table and the feature table that has no embeddings yet.
# The embeddings get their own name so that `bertweet` keeps referring to the model and
# the embedding cells remain re-runnable after this cell.
bertweet_features = pd.read_csv('./bertweet.csv')
df_data = pd.read_csv('./data_notembeded.csv')
print(df_data.shape)
print(bertweet_features.shape)

# Column-wise concat aligns on the row index and pads any mismatch with NaN instead of
# failing, so verify up front that both tables have the same number of rows.
assert len(df_data) == len(bertweet_features), (
    "row count mismatch between data_notembeded.csv and bertweet.csv: "
    f"{len(df_data)} != {len(bertweet_features)}"
)
df_bertweet = pd.concat([df_data, bertweet_features], axis=1)
print(df_bertweet.shape)

(5802, 35)
(5802, 768)
(5802, 803)


In [47]:
# Write the combined feature + embedding table to disk, omitting the row index.
df_bertweet.to_csv(path_or_buf='./df_bertweet.csv', index=False)

## Bulk Embedding (Validation File)

In [32]:
# Validation feature table that does not contain embeddings yet.
data = pd.read_csv('data_valid_notembeded.csv')

# Keep only tweets that actually have text.
# `dropna()` returns a new Series and leaves `data` untouched; the previous
# `dropna(inplace=True)` mutated the Series object taken from `data`.
# The surviving rows keep their original index labels.
text = data.text.dropna()
# text = text.sample(frac=0.4, random_state=999)

# Load the slow tokenizer and the pretrained encoder explicitly so that this section does
# not depend on what the names `tokenizer` / `bertweet` were bound to by earlier cells.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# One list of sub-word tokens per tweet, in the order of `text`.
sents = [tokenizer.tokenize(tweet) for tweet in text]
len(sents)

In [38]:
# Inference only: put the encoder in evaluation mode.
bertweet.eval()

# One mean-pooled vector per tweet. Pooling happens inside the loop so that only the
# pooled vector is retained rather than every full model output.
embeddings = []

for sent in sents:
    # `sent` is an already tokenized tweet; encode maps it to ids, wrapped as a batch of one.
    input_ids = torch.tensor([tokenizer.encode(sent)])
    with torch.no_grad():
        # The model returns an output object exposing `last_hidden_state`.
        features = bertweet(input_ids)
    # Average the per-token vectors of the (single) sequence into one sentence vector.
    embeddings.append(torch.mean(features.last_hidden_state[-1], dim=0))

# Plain Python lists, one row per tweet, ready to be turned into a DataFrame.
result = [embedding.tolist() for embedding in embeddings]

In [40]:
# Build the embedding table on the index labels of the tweets that were embedded.
# `result` has one row per entry of `text`; rows with missing text were dropped from `text`,
# so a fresh 0..n-1 index would shift every embedding after the first dropped row onto
# the wrong row of `data` in the column-wise concat below.
# NOTE: from this point on `bertweet` names the embedding table, not the model; re-run the
# cell that loads the model before embedding again.
bertweet = pd.DataFrame(result, index=text.index)
print(bertweet.shape)
print(data.shape)

# Column-wise concat aligns on the index; rows of `data` without text get NaN embeddings.
df_valid_bertweet = pd.concat([data, bertweet], axis=1)
print(df_valid_bertweet.shape)

(390, 768)


In [6]:
# Display whatever `bertweet` is currently bound to.
# NOTE(review): the recorded run of this cell raised NameError, i.e. it was executed in a
# kernel session where `bertweet` had not been defined yet.
bertweet

NameError: name 'bertweet' is not defined

In [45]:
# Write the validation features + embeddings to disk, omitting the row index.
df_valid_bertweet.to_csv(path_or_buf='./df_valid_bertweet.csv', index=False)

## Bulk Embedding (Train and Test File)

In [None]:
# Training table to be extended with BERTweet sentence embeddings.
data = pd.read_csv('train.csv')

# Keep only tweets that actually have text.
# `dropna()` returns a new Series and leaves `data` untouched; the previous
# `dropna(inplace=True)` mutated the Series object taken from `data`.
# The surviving rows keep their original index labels, which are reused further down.
text = data.text.dropna()
# text = text.sample(frac=0.4, random_state=999)

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# One list of sub-word tokens per tweet, in the order of `text`.
sents = [tokenizer.tokenize(tweet) for tweet in text]
print(len(sents))

# Inference only: put the encoder in evaluation mode.
bertweet.eval()

# One mean-pooled vector per tweet. Pooling happens inside the loop so that only the
# pooled vector is retained rather than every full model output.
embeddings = []

for sent in sents:
    # `sent` is an already tokenized tweet; encode maps it to ids, wrapped as a batch of one.
    input_ids = torch.tensor([tokenizer.encode(sent)])
    with torch.no_grad():
        # The model returns an output object exposing `last_hidden_state`.
        features = bertweet(input_ids)
    # Average the per-token vectors of the (single) sequence into one sentence vector.
    embeddings.append(torch.mean(features.last_hidden_state[-1], dim=0))

# Plain Python lists, one row per tweet.
result = [embedding.tolist() for embedding in embeddings]

# Build the embedding table on the index labels of the tweets that were embedded, so the
# column-wise concat attaches each vector to its own row even when rows with missing text
# were dropped above (a fresh 0..n-1 index would shift every later embedding).
# NOTE: `bertweet` is rebound from the model to the embedding table here; the name is kept
# because later cells may read it.
bertweet = pd.DataFrame(result, index=text.index)
print(bertweet.shape)
print(data.shape)

# NOTE(review): the result name says "valid" although the input is train.csv; it is kept
# unchanged in case later cells refer to it. Consider renaming.
df_valid_bertweet = pd.concat([data, bertweet], axis=1)
print(df_valid_bertweet.shape)