<a href="https://colab.research.google.com/github/zzozfe/essay/blob/master/BERT_sentiment_analysis_for_FOMC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install rarfile
!pip install transformers

In [0]:
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import numpy as np
import pandas as pd
import os

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Fetch the Drive file with this id and store it locally as 'BERT_data.rar'.
file_id = '1Nf8_tQJ98foP3ZHeB1E10DojaS5xOxrB'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('BERT_data.rar')

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# (Drive file id, local file name) pairs, fetched in this order.
drive_files = (
    ('1IT6bGrSSbZrzObpq_sTJJSZBEF80U2MR', 'FOMC.rar'),
    ('1XYhUK8vxqpboCG4l1Zqiu60iwF_whNns', 'model_6.pkl'),
)
for file_id, local_name in drive_files:
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(local_name)

In [0]:
import rarfile

# Unpack the downloaded archives into the working directory.
# 'BERT_data.rar' is downloaded earlier in the notebook but was never extracted here;
# the BERT_data_*.csv files read later are presumably inside it -- TODO confirm.
# The `with` block closes each archive even if extraction raises.
for archive_name in ('BERT_data.rar', 'FOMC.rar'):
    with rarfile.RarFile(archive_name) as rf:
        rf.extractall()

In [0]:
# Load the four CSV files 'BERT_data_<split>.csv' into one DataFrame per split.
BERT_data_positive, BERT_data_negative, BERT_data_neutral, BERT_data_unk = (
    pd.read_csv(f'BERT_data_{split}.csv')
    for split in ('positive', 'negative', 'neutral', 'unk')
)

In [0]:
# Stack the three labelled frames into a single training frame.
# ignore_index=True gives every row a unique label instead of repeating each
# source frame's own 0..n-1 index three times.
train_data = pd.concat(
    [BERT_data_positive, BERT_data_negative, BERT_data_neutral],
    ignore_index=True,
)
print(f'shape : {train_data.shape}')
# The original called display() with no argument, which shows nothing.
display(train_data.head())

shape : (20321, 2)


In [0]:
# Work on a copy: predictions are later written into test_data['labels'], and a plain
# alias would silently overwrite the 'labels' column of BERT_data_unk as well.
test_data = BERT_data_unk.copy()
print(f'test shape : {test_data.shape}')

test shape : (2122, 2)


In [0]:
# Number of training rows per label value (class-balance check).
train_data['labels'].value_counts()

2    10579
1     6054
0     3688
Name: labels, dtype: int64

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the shuffle reproducible.
sent_train, sent_val, y_train, y_val = train_test_split(
    train_data['sentent'],
    train_data['labels'],
    test_size = 0.2,
    shuffle = True,
    random_state = 10,
)

# Re-assemble each split into a two-column frame ('sentent', 'labels').
train_set = pd.concat([sent_train, y_train], axis=1)
val_set = pd.concat([sent_val, y_val], axis=1)

# The intermediate Series are no longer needed.
del sent_train, sent_val, y_train, y_val

In [0]:
# Name of the pretrained checkpoint; used here to load the matching tokenizer.
PRETRAINED_MODEL_NAME =  'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# Clear whatever the loading step printed in the cell output.
clear_output()

In [0]:
def _keep_short_rows(frame):
    """Return the rows of `frame` whose 'sentent' text yields fewer than 512 tokens.

    The token count comes from the notebook-level `tokenizer`; the returned frame
    has the same columns as the input.
    """
    token_counts = frame['sentent'].apply(lambda text: len(tokenizer.tokenize(text)))
    return frame[token_counts < 512]


# Apply the same length filter to both splits.
train_set = _keep_short_rows(train_set)
val_set = _keep_short_rows(val_set)

In [0]:
from torch.utils.data import Dataset


class SentimentForFOMC(Dataset):
    """Dataset over a frame with a 'sentent' text column and a 'labels' column.

    Each item is a pair ``(token_ids, label)`` of tensors, where ``token_ids`` are
    the tokenizer ids of the text with the literal string '[CLS]' prepended.
    """

    def __init__(self, data, tokenizer):
        # data: frame indexed positionally via .iloc; tokenizer: object providing
        # tokenize() and convert_tokens_to_ids().
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__ (self, idx):
        """Return ``(token_ids, label)`` for the row at position `idx`."""
        text = self.data['sentent'].iloc[idx]
        target = self.data['labels'].iloc[idx]
        # NOTE(review): only '[CLS]' is added; no '[SEP]' is appended -- confirm intended.
        pieces = self.tokenizer.tokenize('[CLS]' + text)
        token_ids = self.tokenizer.convert_tokens_to_ids(pieces)
        return (torch.tensor(token_ids), torch.tensor(target))

In [0]:
from torch.nn.utils.rnn import pad_sequence


def create_minibatch(samples):
    """Collate ``(token_ids, label)`` pairs into one padded batch.

    Parameters
    ----------
    samples : sequence of (Tensor, Tensor)
        Per-item id tensors (variable length) and scalar label tensors.

    Returns
    -------
    tokens : Tensor
        Id tensors padded with 0 to the longest item, batch dimension first.
    masked : Tensor
        Same shape as `tokens`, in the default float dtype: 1 where the id is
        non-zero, 0 where it is 0 (the padding value).
    labels : Tensor
        The label tensors stacked along a new first dimension.
    """
    token_batch = pad_sequence([sample[0] for sample in samples], batch_first=True)
    label_batch = torch.stack([sample[1] for sample in samples])

    # Every non-zero id counts as a real token; zeros are treated as padding.
    attention = (token_batch != 0).to(torch.get_default_dtype())

    return token_batch, attention, label_batch

In [0]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
import time

use_gpu = torch.cuda.is_available()

# Pretrained encoder with a 3-way classification head.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels = 3)
clear_output()

train_dataset = SentimentForFOMC(train_set, tokenizer)
train_loader = DataLoader(train_dataset, batch_size = 2, shuffle=True, collate_fn = create_minibatch)
val_dataset = SentimentForFOMC(val_set, tokenizer)
val_loader = DataLoader(val_dataset, batch_size = 2, shuffle = False, collate_fn=  create_minibatch)

if use_gpu:
    model.cuda()

start = time.time()

optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
loss_fn = torch.nn.CrossEntropyLoss()

# A checkpoint is written after every epoch whose validation accuracy exceeds this value.
SAVE_ACC_THRESHOLD = 0.9

epochs = 10
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = []
    correct = 0
    total = 0
    for tok, mas, lb in train_loader:
        if use_gpu:
            tok = tok.cuda()
            mas = mas.cuda()
            lb = lb.cuda()
        optimizer.zero_grad()
        outputs = model(input_ids = tok, attention_mask = mas)
        logits = outputs[0]
        _, pred = torch.max(logits, 1)
        loss = loss_fn(logits, lb)
        loss.backward()
        optimizer.step()
        correct += (pred == lb).sum().item()
        total += lb.shape[0]
        train_loss.append(loss.item())
    train_acc = correct/total

    print(f'train [epoch{epoch+1:2d}], acc = {train_acc:.3f}, loss = {np.mean(train_loss)}')

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    with torch.no_grad():
        # Kept in its own list so it is not confused with the training losses above.
        val_loss = []
        correct = 0
        total = 0
        for tok, mas, lb in val_loader:
            if use_gpu:
                tok = tok.cuda()
                mas = mas.cuda()
                lb = lb.cuda()
            outputs = model(input_ids = tok, attention_mask = mas)
            logits = outputs[0]
            _, pred = torch.max(logits, 1)
            loss = loss_fn(logits, lb)
            val_loss.append(loss.item())
            correct += (pred == lb).sum().item()
            total += lb.shape[0]
        val_acc = correct/total
        print(f'valid [epoch{epoch+1:2d}], acc = {val_acc:.3f}, loss = {np.mean(val_loss)}')

    # Persist the weights of any epoch that clears the validation threshold.
    if val_acc > SAVE_ACC_THRESHOLD:
        model_path = f'model_{epoch+1}.pkl'
        torch.save(model.state_dict(), model_path)
        print(f'model_{epoch+1} saved')
    print(f'spend {time.time()-start:3.1f}s')

train [epoch 1], acc = 0.886, loss = 0.2949298414779453
valid [epoch 1], acc = 0.965, loss = 0.10146689347379072
model_1 saved
spend 1138.7s
train [epoch 2], acc = 0.987, loss = 0.04814055623762968
valid [epoch 2], acc = 0.986, loss = 0.07121654013557548
model_2 saved
spend 2275.9s
train [epoch 3], acc = 0.995, loss = 0.023478268367880473
valid [epoch 3], acc = 0.986, loss = 0.07148566617239156
model_3 saved
spend 3413.2s
train [epoch 4], acc = 0.996, loss = 0.017295491465335565
valid [epoch 4], acc = 0.989, loss = 0.07040203537578883
model_4 saved
spend 4549.8s
train [epoch 5], acc = 0.997, loss = 0.013319515791637915
valid [epoch 5], acc = 0.988, loss = 0.07368416211308812
model_5 saved
spend 5687.7s
train [epoch 6], acc = 0.998, loss = 0.009655367532042807
valid [epoch 6], acc = 0.989, loss = 0.06731268763542175
model_6 saved
spend 6823.3s
train [epoch 7], acc = 0.998, loss = 0.012289948830373456
valid [epoch 7], acc = 0.989, loss = 0.06810205704596856
model_7 saved
spend 7960.5s
tr

In [0]:
# NOTE(review): unlike train_set/val_set, test_data is not filtered by token count
# before being wrapped here -- confirm that no row exceeds the model's input limit.
test_dataset = SentimentForFOMC(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size = 2, collate_fn=create_minibatch)

# Use the GPU only when one is available (the original called model.cuda()
# unconditionally, which fails on a CPU-only runtime).
device = torch.device('cuda' if use_gpu else 'cpu')

# map_location lets a checkpoint saved from a GPU session load on a CPU runtime.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('model_6.pkl', map_location=device))
model.to(device)
# Inference mode: disables dropout. Without this the model may still be in
# training mode if the training cell was interrupted mid-epoch.
model.eval()

predict_value = []
with torch.no_grad():
    # The third element of each batch is the placeholder label; it is not needed here.
    for tok, mas, _ in test_loader:
        tok = tok.to(device)
        mas = mas.to(device)
        outputs = model(input_ids = tok, attention_mask = mas)

        _, pred = torch.max(outputs[0], 1)
        predict_value += pred.tolist()

# Store the predicted class index of every row in the 'labels' column.
test_data['labels'] = predict_value

In [0]:
from google.colab import files

# Write the labelled frame to CSV (no index column), then hand the file to the browser.
predictions_csv = 'test_data.csv'
test_data.to_csv(predictions_csv, index = None)
files.download(predictions_csv)

In [0]:
# Hand the 'model_6.pkl' checkpoint file to the browser for download.
files.download('model_6.pkl')

In [0]:
import nltk
import pandas as pd

# nltk.sent_tokenize (used by load_data) needs the 'punkt' tokenizer data, which the
# notebook never downloaded; fetch it so the cell works on a fresh kernel.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' -- confirm.
nltk.download('punkt', quiet=True)

# Sorted file names in ./FOMC with the last two entries dropped (the original did
# this with two consecutive pop() calls).
# NOTE(review): presumably the last two entries are not wanted documents -- confirm.
path = sorted(os.listdir('./FOMC'))[:-2]
def load_data(date, repeat = True, base_dir = './FOMC'):
    """Read one text file and split it into sentences.

    Parameters
    ----------
    date : str
        File name inside `base_dir`.
    repeat : bool, default True
        When true, replace 'Federal Open Market Committee' with 'FOMC' and
        'gross domestic product' with 'gdp' before sentence splitting.
    base_dir : str, default './FOMC'
        Folder that contains the file.

    Returns
    -------
    list of str
        The sentences returned by ``nltk.sent_tokenize`` for the file's text.
    """
    # Read the whole file at once instead of concatenating it line by line.
    with open(os.path.join(base_dir, date), 'r', encoding = 'utf8') as cor:
        text = cor.read()
    if repeat:
        repeat_word = {'Federal Open Market Committee':'FOMC','gross domestic product':'gdp'}
        for phrase, short_form in repeat_word.items():
            text = text.replace(phrase, short_form)
    return nltk.sent_tokenize(text)
# Flat list of sentences from every listed file, skipping each file's first sentence;
# the phrase replacement in load_data is switched off.
documents = [
    sentence
    for file_name in path
    for sentence in load_data(file_name, repeat = False)[1:]
]


In [0]:
# One row per sentence; 'labels' starts as the constant 0 for every row.
test_data = pd.DataFrame({'sentent': documents}).assign(labels=0)


Unnamed: 0,sentent,labels
0,"PRESENT: Mr. Greenspan, Chairman\n Mr....",0
1,Attended Wednesday session only.,0
2,2.,0
3,Attended Tuesday session only.,0
4,3.,0
