# 電影評論
* data = https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data  
submit: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/submit  
* step
### 1. [準備原始文本數據](#preprocessing)
### 2. [BERT格式](#bertmode)
### 3. [下游任務模型](#finetune)
### 4. [訓練模型](#model)
### 5. [新樣本預測](#predict)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import os
import torch
from IPython.display import clear_output
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification

<a id = preprocessing></a>
### 準備原始文本數據

In [None]:
# Load the Kaggle movie-review train/test splits (tab-separated files).
train = pd.read_csv('/home/bettyliao/sentiment/data/kaggle_movie_reviews/train.tsv', sep='\t')
test_ = pd.read_csv('/home/bettyliao/sentiment/data/kaggle_movie_reviews/test.tsv', sep='\t')
print(f"""train: {train.columns.values}\ntest: {test_.columns.values}\n""")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; interpolating it into an f-string would only print "None".
print('train info:')
train.info()
print('test info:')
test_.info()

In [None]:
# Keep only the columns the model needs. .copy() makes each result an
# independent frame, so assigning to its columns later neither triggers
# SettingWithCopyWarning nor depends on whether pandas returned a view.
train = train[['Phrase', 'Sentiment']].copy()
test = test_[['Phrase']].copy()
display(train.head(), test.head())

In [None]:
# Replace every '.' and ',' with the [SEP] marker in both splits.
# regex=False requests a literal replacement explicitly: the previous pattern
# '\.' only worked when str.replace treated it as a regex, and the default for
# that flag differs between pandas versions.
for frame in (train, test):
    frame['Phrase'] = (
        frame['Phrase']
        .str.replace('.', '[SEP]', regex=False)
        .str.replace(',', '[SEP]', regex=False)
    )

In [None]:
# Count the phrases in each sentiment class and draw them as a bar chart.
sent_count = (
    train.groupby(['Sentiment'])
    .size()
    .to_frame('count')
    .reset_index()
)
plt.figure(facecolor='grey')
plt.bar(sent_count['Sentiment'], sent_count['count'])
plt.title('Sentiment distribute')

# Share of each class relative to the whole training frame.
class_ratio = train['Sentiment'].value_counts() / len(train)
print('each catrgory ratio: \n', class_ratio)

In [None]:
# Take a reproducible 1% sample of the training data, then write the sample
# and the full test phrases back to disk as tab-separated files.
train_ = train.sample(frac=0.01, random_state=123)
train_.to_csv(
    '/home/bettyliao/sentiment/data/kaggle_movie_reviews/train_.tsv',
    sep='\t',
    index=False,
)
test.to_csv(
    '/home/bettyliao/sentiment/data/kaggle_movie_reviews/test_.tsv',
    sep='\t',
    index=False,
)

<a id = bertmode></a>
### ● BERT格式

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
PRETRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# The tokenizer's vocabulary; the author noted its size as 28996.
vocab = tokenizer.vocab # 28996

In [None]:
# Show the Python type of one sampled phrase (second row of the sample).
type(train_['Phrase'].iloc[1])

In [None]:
class SentiDataset(Dataset):
    """Dataset that turns movie-review phrases into BERT input tensors.

    Each item is a ``(tokens_tensor, segments_tensor, label_tensor)`` tuple:
    the ids of ``[CLS] + word pieces + [SEP]``, an all-zero segment id tensor
    of the same length, and the sentiment label (``None`` in ``'test_'`` mode).

    Args:
        mode: ``'train_'`` or ``'test_'``; the file ``<mode>.tsv`` is loaded.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        data_dir: directory containing the TSV files. Defaults to
            ``SentiDataset.DATA_DIR``.
        max_len: maximum sequence length including ``[CLS]`` and ``[SEP]``;
            longer phrases are truncated. ``None`` disables truncation.
    """

    # Default location of train_.tsv / test_.tsv. The path is resolved
    # explicitly instead of calling os.chdir() in the class body, so defining
    # the class no longer changes the process working directory.
    DATA_DIR = '/home/bettyliao/sentiment/data/kaggle_movie_reviews'

    def __init__(self, mode, tokenizer, data_dir=None, max_len=512):
        assert mode in ['train_', 'test_'], "mode must be 'train_' or 'test_'"
        self.mode = mode
        base_dir = self.DATA_DIR if data_dir is None else data_dir
        # Missing values become empty strings so the tokenizer always gets str.
        self.df = pd.read_csv(os.path.join(base_dir, mode + '.tsv'), sep='\t').fillna('')
        self.len = len(self.df)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``."""
        text_a = self.df.Phrase.iloc[idx]
        if self.mode == 'test_':
            # The test split has no labels.
            label_tensor = None
        else:
            label_tensor = torch.tensor(self.df.Sentiment.iloc[idx])

        tokens_a = self.tokenizer.tokenize(text_a)
        if self.max_len is not None:
            # Reserve two positions for the [CLS] and [SEP] markers.
            tokens_a = tokens_a[:max(self.max_len - 2, 0)]
        word_pieces = ["[CLS]"] + tokens_a + ["[SEP]"]
        len_a = len(word_pieces)

        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        # Single-sentence input: every position belongs to segment 0.
        segments_tensor = torch.tensor([0] * len_a, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Return the number of rows loaded from the TSV file."""
        return self.len

In [None]:
# Build the training dataset from the sampled train_.tsv file.
trainset = SentiDataset(mode='train_', tokenizer=tokenizer)

In [None]:
# Inspect one training sample: the raw row next to the tensors built from it.
sample_idx = 1
# Raw phrase and label as stored in the dataset's DataFrame.
text_a, label = trainset.df.iloc[sample_idx].values
# The same row after conversion by SentiDataset.__getitem__.
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]
# Map the ids back to word pieces to check the round trip visually.
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = ' '.join(tokens)

print(f"""
[origin]
sentence_a = {text_a}
label = {label}
---------------------------
[tensors]
tokens_tensor: {tokens_tensor}
segments_tensor: {segments_tensor}
label_tensor: {label_tensor}
[text]
{combined_text}
""")

In [None]:
def create_mini_batch(samples):
    """Collate dataset items into one zero-padded batch.

    Args:
        samples: sequence of ``(tokens_tensor, segments_tensor, label_tensor)``
            items; ``label_tensor`` is ``None`` for unlabeled data.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)`` where
        the first three are padded to the longest sequence in the batch, the
        mask is 1 wherever the token id is non-zero, and ``label_ids`` is
        ``None`` when the first sample carries no label.
    """
    # Labels are stacked only when present; the first sample decides.
    if samples[0][2] is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[2] for sample in samples])

    # Zero-pad token ids and segment ids to a common length.
    tokens_tensors, segments_tensors = (
        pad_sequence([sample[field] for sample in samples], batch_first=True)
        for field in (0, 1)
    )

    # Attention mask: 1 for non-zero token ids, 0 for padding.
    masks_tensors = (tokens_tensors != 0).long()
    return tokens_tensors, segments_tensors, masks_tensors, label_ids
# Serve the training set in batches of 64, padded per batch by create_mini_batch.
batch_size = 64
trainloader = DataLoader(
    dataset=trainset,
    batch_size=batch_size,
    collate_fn=create_mini_batch,
)

In [None]:
# Pull the first batch from the loader to inspect what the model will receive.
data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data

# tokens_tensors, segments_tensors and masks_tensors are padded because the
# sequences in a batch have different lengths.
print(f"""
tokens_tensors: {tokens_tensors.shape}
{tokens_tensors}
-------------------------------------
segments_tensors: {segments_tensors.shape}
{segments_tensors}
------------------------------------------
masks_tensors: {masks_tensors.shape}
{masks_tensors}
------------------------------------
label_ids.shape = {label_ids.shape}
{label_ids}
""")

<a id = finetune></a>
### ● 下游任務模型

In [None]:
# Load the pretrained checkpoint with a 5-way classification head.
PRETRAINED_MODEL_NAME = 'bert-base-cased'
NUM_LABELS = 5
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=NUM_LABELS,
)
clear_output()

# List the top-level modules; the 'bert' entry is expanded into its children
# by name instead of being printed in full.
print("""
name     module
-------------------""")
for name, module in model.named_children():
    if name != 'bert':
        print('{:15}{}'.format(name, module))
        continue
    for n, _ in module.named_children():
        print(f'{name} : {n}')

<a id = model></a>
### ● 訓練模型

In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and return the predicted class ids.

    The model is switched to evaluation mode for the duration of the call, so
    dropout does not perturb the predictions, and its previous train/eval mode
    is restored afterwards. Batches are moved to whatever device the model's
    parameters live on.

    Args:
        model: module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; the first element of its output is the logits.
        dataloader: iterable of ``(tokens, segments, masks, labels)`` batches;
            ``labels`` may be ``None`` for unlabeled data.
        compute_acc: when True, also compute accuracy against the labels.

    Returns:
        A 1-D tensor of predicted class ids (``None`` if the loader yields no
        batches), or ``(predictions, accuracy)`` when ``compute_acc`` is True.

    Raises:
        ValueError: if ``compute_acc`` is True but a batch has no labels.
    """
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    batch_preds = []
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            # Iterate over the whole dataset.
            for data in dataloader:
                # Move every tensor to the model's device; keep a missing
                # label slot as None.
                data = [t.to(device) if t is not None else None for t in data]

                # The first three tensors are tokens, segments and masks;
                # they are passed by keyword to avoid argument mix-ups.
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]
                _, pred = torch.max(logits, 1)

                # Accumulate statistics for the classification accuracy.
                if compute_acc:
                    labels = data[3] if len(data) > 3 else None
                    if labels is None:
                        raise ValueError(
                            'compute_acc=True requires labels in every batch')
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                # Collect per-batch predictions; concatenated once at the end.
                batch_preds.append(pred)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        # Guard against an empty loader instead of dividing by zero.
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions
    
# Put the model on the GPU when one is available, then measure the
# classification accuracy on the training set before any fine-tuning.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device:", device)

model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("初始 classification acc:", round(acc * 100, 2))

In [None]:
def get_learnable_params(module):
    """Return a list of ``module``'s parameters that have ``requires_grad`` set."""
    learnable = []
    for param in module.parameters():
        if param.requires_grad:
            learnable.append(param)
    return learnable

# Trainable parameters of the whole model versus those of its classifier head.
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

# Print the total element count of each parameter list.
print(f'''
整體模型參數： {sum(p.numel() for p in model_params)}
線性模型參數: {sum(p.numel() for p in clf_params)}
''')

In [None]:
%%time
# Fine-tune every parameter of the model with Adam at a small learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 6
for epoch in range(EPOCHS):
    # Re-enter training mode at the start of every epoch, because the
    # accuracy check at the end of the previous epoch runs in eval mode.
    model.train()

    running_loss = 0.0
    for data_ in trainloader:

        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data_]

        # Reset the parameter gradients.
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss first.
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        loss = outputs[0]
        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the loss of the current batch.
        running_loss += loss.item()

    # Measure accuracy with dropout disabled. After the final epoch the model
    # stays in eval mode, which is what the later prediction step needs.
    model.eval()
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

<a id = predict></a>
### ● 新樣本預測

In [None]:
%%time
# Build the unlabeled test dataset and serve it in batches of 256.
testset = SentiDataset(mode='test_', tokenizer=tokenizer)
testloader = DataLoader(
    dataset=testset,
    batch_size=256,
    collate_fn=create_mini_batch,
)

In [None]:
# Predict a sentiment class for every phrase in the test loader.
predictions = get_predictions(model=model, dataloader=testloader)

In [None]:
# Wrap the predicted class ids in a one-column DataFrame.
df = pd.DataFrame(predictions.tolist(), columns=['Sentiment'])

In [None]:
# Pair each test PhraseId with its predicted sentiment (joined on the index).
final = pd.concat([test_['PhraseId'], df], axis=1)
# Render the label as text and strip a trailing ".0" if the column was
# promoted to float. The pattern is anchored and escaped explicitly: an
# unescaped '.0' is a wildcard under regex matching and could eat real digits.
final['Sentiment'] = (
    final['Sentiment']
    .astype('str')
    .str.replace(r'\.0$', '', regex=True)
)

In [None]:
# Preview the first rows of the submission frame.
final.head()

In [None]:
# Write the submission file without the DataFrame index column.
final.to_csv(
    path_or_buf='/home/bettyliao/sentiment/output/bert_result.csv',
    index=False,
)

In [None]:
# Display the configuration object attached to the model.
model.config

參考資料：  
https://medium.com/programming-with-data/32-transformer-%E9%A0%90%E8%A8%93%E7%B7%B4-%E9%9B%86%E5%A4%A7%E6%88%90%E7%9A%84-bert-%E6%A8%A1%E5%9E%8B-c928530f6db8