In [None]:
# Implementing BERTweet
이 노트북은 Huggingface의 BERTweet 모델을 이용해 기존 raw data를 BERTweet 벡터 결과물로 변환하는 코드이다.

1. 라이브러리를 가져온다.
2. Pretrained Model / Tokenizer를 불러온다.
3. 전체 코퍼스를 토큰화한 후에 BERTweet 임베딩 벡터로 변환한다.

In [None]:
## Import Libraries and Pretrained model 

In [1]:
import pandas as pd
import torch

from transformers import AutoModel, AutoTokenizer 
import logging

In [2]:
# Sample row -- text: "BREAKING: Armed man takes hostage in ...",
#               text_token: ['breaking', 'armed', ...]
raw_text = pd.read_csv(filepath_or_buffer='./data/raw_text_tokens.csv')

In [None]:
### 밑의 코드는 아래와 같은 결과를 출력한다.

    'BREAKING: Armed man takes hostage in kosher grocery east of Paris http://t.co/PBs3sMwhLt'
    
=> *BaseModelOutputWithPoolingAndCrossAttentions*
- 이 객체는 last_hidden_state와 pooler_output 텐서를 보유한다.

In [3]:
# ------------------------------------ Testing ----------------------------------- #
# raw_text.text[0]

In [4]:
# Version notes:
#   transformers v3.x : tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
#   TensorFlow 2.0+   : from transformers import TFAutoModel
#                       bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")

# Pretrained encoder.
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
# transformers v4.x+: request the slow (non-fast) tokenizer explicitly.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

# ------------------------------------ Testing ----------------------------------- #
# input_ids = torch.tensor([tokenizer.encode(raw_text.text[0])])
# with torch.no_grad():
#     features = bertweet(input_ids)
# print(features)


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [None]:
## Processing the entire raw_text corpus

In [12]:
# Sub-word tokens for every tweet in the corpus (inspected in later cells).
tokens = [tokenizer.tokenize(tweet) for tweet in raw_text.text]

# Encode only the first tweet; wrapping the id list in another list gives the
# tensor a leading batch axis of size 1.
input_ids = torch.tensor([tokenizer.encode(raw_text.text[0])])

# Run the forward pass in this cell so that `features` is defined by the
# notebook itself: the following cells read `features.last_hidden_state`, and
# with this call commented out they only worked on leftover kernel state.
bertweet.eval()  # inference only
with torch.no_grad():
    features = bertweet(input_ids)

In [6]:
# Vocabulary ids of the encoded first tweet. The leading 0 and trailing 2 have no
# counterpart in the tokenize() listing of this tweet, i.e. encode() adds them.
input_ids # input sentence's word indices

tensor([[    0, 19481, 31440,    22, 19959,   171,   956, 20778,    16,  3322,
         10794,  8923,  3420,    15,  3177, 45565, 11412, 46442,   423,   698,
           423,   455,  2938, 31429,     2]])

In [13]:
# Hidden states returned by the model for the encoded first tweet.
last_hidden = features.last_hidden_state
print(last_hidden.shape)

# Mean over dim 0. This intermediate value is only printed for its shape and
# is overwritten further down.
sentence_embedding = last_hidden.mean(dim=0)
print(sentence_embedding.shape)

# Last entry along dim 0: one vector per token position.
token_vecs = last_hidden[-1]
print(token_vecs.shape)

# Average the token vectors into a single vector for the tweet.
sentence_embedding = token_vecs.mean(dim=0)
print(sentence_embedding.shape)

print("Our final sentence embedding vector of shape:", sentence_embedding.size())

torch.Size([1, 25, 768])
torch.Size([25, 768])
torch.Size([25, 768])
torch.Size([768])
Our final sentence embedding vector of shape: torch.Size([768])


In [None]:
# sentence_embedding

In [None]:
# List the first tweet's sub-word tokens next to their positions.
for position, piece in enumerate(tokens[0]):
    print(position, piece)

In [None]:
from scipy.spatial.distance import cosine

# `token_vecs` holds one contextual vector per position of the encoded first
# tweet (23 sub-word tokens plus the two ids that encode() adds around them).
# In that tweet the sub-word "s@@" (id 423) occurs twice inside the URL, at
# positions 18 and 20 of `input_ids`, and "man" (id 171) sits at position 5.
# The previous version of this cell read an undefined `token_vecs_sum` and
# described a "bank" example sentence that is not part of this notebook.
FIRST_S, SECOND_S, MAN = 18, 20, 5

# Same sub-word at two different positions.
same_token = 1 - cosine(token_vecs[FIRST_S].detach().numpy(),
                        token_vecs[SECOND_S].detach().numpy())

# Two unrelated tokens.
diff_token = 1 - cosine(token_vecs[FIRST_S].detach().numpy(),
                        token_vecs[MAN].detach().numpy())

print('Vector similarity for  *similar*  meanings:  %.2f' % same_token)
print('Vector similarity for *different* meanings:  %.2f' % diff_token)

In [36]:
# Raw text of the first tweet (row label 0).
raw_text["text"][0]

'BREAKING: Armed man takes hostage in kosher grocery east of Paris http://t.co/PBs3sMwhLt'

In [68]:
# Tokenize the whole corpus: one list of sub-word strings per tweet.
tokens = list(map(tokenizer.tokenize, raw_text.text))
# tokens[1]

In [67]:
# Vocabulary ids for the second tweet's tokens.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens[1])
# Vocabulary ids for every tweet, in the same order as `tokens`.
texts_inverse = list(map(tokenizer.convert_tokens_to_ids, tokens))
# texts_inverse[0]

In [74]:
# Show each sub-word token of the first tweet beside its vocabulary id.
for piece, piece_id in zip(tokens[0], texts_inverse[0]):
    print('{} {}'.format(piece, piece_id))

BREA@@ 19481
KING@@ 31440
: 22
Armed 19959
man 171
takes 956
hostage 20778
in 16
ko@@ 3322
sher 10794
grocery 8923
east 3420
of 15
Paris 3177
http://@@ 45565
t.co/@@ 11412
PB@@ 46442
s@@ 423
3@@ 698
s@@ 423
M@@ 455
wh@@ 2938
Lt 31429


In [None]:
## Bulk Embedding

In [294]:
from torch.utils.data import Dataset, DataLoader

In [295]:
# Tweets with missing values removed. dropna() is called without inplace=True
# so it returns a new Series and leaves `raw_text` untouched; the cell can be
# re-run and always starts from the same input.
text = raw_text.text.dropna()
# text = text.sample(frac=0.4, random_state=999)

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# One list of sub-word tokens per remaining tweet.
sents = [tokenizer.tokenize(tweet) for tweet in text]
len(sents)

In [302]:
# Inference only: put the model in evaluation mode.
bertweet.eval()

# One mean-pooled vector per tweet. Pooling inside the loop keeps a single 1-D
# tensor per tweet instead of retaining every full model output until the loop
# has finished.
embeddings = []

for sent in sents:
    # `sent` is a list of sub-word tokens; the outer list adds a batch axis of 1.
    input_ids = torch.tensor([tokenizer.encode(sent)])
    with torch.no_grad():
        features = bertweet(input_ids)
    # Index 0 selects the only sequence in the batch; the mean runs over its tokens.
    embeddings.append(features.last_hidden_state[0].mean(dim=0))

# Plain Python lists so the vectors can be loaded into a DataFrame.
result = [vector.tolist() for vector in embeddings]

In [308]:
# One row per embedded tweet, one column per embedding dimension.
df_bertweet = pd.DataFrame(data=result)
print(df_bertweet.shape)

(5802, 768)

In [3]:
# Persist the embedding table without the index column.
df_bertweet.to_csv(path_or_buf='./bertweet.csv', index=False)

NameError: name 'df_bertweet' is not defined

In [None]:
### Save the file

In [46]:
# Load the saved embeddings under their own name so that the `bertweet` model
# object is not replaced by a DataFrame.
bertweet_vectors = pd.read_csv('./bertweet.csv')
df_data = pd.read_csv('./data_notembeded.csv')
print(df_data.shape)
print(bertweet_vectors.shape)

# concat(axis=1) aligns rows on the index. Both frames come straight from
# read_csv with the default index, so equal lengths mean a row-for-row join;
# unequal lengths would silently produce NaN-padded rows, hence the check.
assert len(df_data) == len(bertweet_vectors), (
    "feature table and embedding table have different row counts"
)
df_bertweet = pd.concat([df_data, bertweet_vectors], axis=1)
print(df_bertweet.shape)

(5802, 35)
(5802, 768)
(5802, 803)


In [47]:
# Persist the combined feature + embedding table without the index column.
df_bertweet.to_csv(path_or_buf='./df_bertweet.csv', index=False)

In [None]:
## Bulk Embedding (Validation File)

In [32]:
data = pd.read_csv('data_valid_notembeded.csv')

# Tweets with missing values removed. dropna() is called without inplace=True
# so it returns a new Series and leaves `data` untouched; the surviving index
# labels in `text` still identify the rows of `data` they came from.
text = data.text.dropna()
# text = text.sample(frac=0.4, random_state=999)

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# One list of sub-word tokens per remaining tweet.
sents = [tokenizer.tokenize(tweet) for tweet in text]
len(sents)

In [38]:
# Inference only: put the model in evaluation mode.
bertweet.eval()

# One mean-pooled vector per tweet. Pooling inside the loop keeps a single 1-D
# tensor per tweet instead of retaining every full model output until the loop
# has finished.
embeddings = []

for sent in sents:
    # `sent` is a list of sub-word tokens; the outer list adds a batch axis of 1.
    input_ids = torch.tensor([tokenizer.encode(sent)])
    with torch.no_grad():
        features = bertweet(input_ids)
    # Index 0 selects the only sequence in the batch; the mean runs over its tokens.
    embeddings.append(features.last_hidden_state[0].mean(dim=0))

# Plain Python lists so the vectors can be loaded into a DataFrame.
result = [vector.tolist() for vector in embeddings]

In [40]:
# Give the embedding rows the index labels of the tweets they were computed
# from. `result` has one entry per element of `text`, and `text` may have lost
# rows to dropna(); with a fresh 0..n-1 index, concat(axis=1) would attach the
# vectors to the wrong rows of `data` whenever a row was dropped.
# NOTE: `bertweet` is rebound from the model to this table; the name is kept
# because the next cell displays it.
bertweet = pd.DataFrame(result, index=text.index)
print(bertweet.shape)
print(data.shape)
df_valid_bertweet = pd.concat([data, bertweet], axis=1)
print(df_valid_bertweet.shape)

(390, 768)


In [6]:
# Display whatever `bertweet` is currently bound to; after the cell above has
# run, that is the validation embedding table.
bertweet

NameError: name 'bertweet' is not defined

In [45]:
# Persist the validation feature + embedding table without the index column.
df_valid_bertweet.to_csv(path_or_buf='./df_valid_bertweet.csv', index=False)

In [None]:
## Bulk Embedding (Train and Test File)

In [None]:
data = pd.read_csv('train.csv')

# Tweets with missing values removed. dropna() is called without inplace=True
# so it returns a new Series and leaves `data` untouched; the surviving index
# labels in `text` still identify the rows of `data` they came from.
text = data.text.dropna()
# text = text.sample(frac=0.4, random_state=999)

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# One list of sub-word tokens per remaining tweet.
sents = [tokenizer.tokenize(tweet) for tweet in text]
print(len(sents))

# Inference only: put the model in evaluation mode.
bertweet.eval()

# One mean-pooled vector per tweet. Pooling inside the loop keeps a single 1-D
# tensor per tweet instead of retaining every full model output.
embeddings = []

for sent in sents:
    # `sent` is a list of sub-word tokens; the outer list adds a batch axis of 1.
    input_ids = torch.tensor([tokenizer.encode(sent)])
    with torch.no_grad():
        features = bertweet(input_ids)
    # Index 0 selects the only sequence in the batch; the mean runs over its tokens.
    embeddings.append(features.last_hidden_state[0].mean(dim=0))

result = [vector.tolist() for vector in embeddings]

# Index the vectors by the tweets they came from so that concat(axis=1) lines
# them up with the right rows of `data` even if dropna() removed some rows.
# The table gets its own name, so the `bertweet` model stays usable.
train_vectors = pd.DataFrame(result, index=text.index)
print(train_vectors.shape)
print(data.shape)

# Stored under a train-specific name: this cell previously overwrote
# `df_valid_bertweet` (the validation result) with the training data.
df_train_bertweet = pd.concat([data, train_vectors], axis=1)
print(df_train_bertweet.shape)