In [38]:
pip install torchtext==0.4

In [64]:
import torch
import torchtext
from torchtext import data, datasets
import spacy
import time
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
import torch.optim as optim
import pandas as pd
from sklearn.metrics import recall_score,precision_score


In [40]:
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
def preprocess(text):
    text = text.lower() 
    text=text.strip()
    text=re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\[[0-9]*\]',' ',text)
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text)
    return text
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()

# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [42]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [43]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(init_token, eos_token, pad_token, unk_token)

In [44]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

In [45]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [46]:
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)
LABEL = data.LabelField(dtype = torch.float,sequential=False)

In [47]:
train = data.TabularDataset('../input/proces-data/train_for_bert.csv', format='csv',skip_header=True,
        fields=[('text',TEXT),('id',LABEL)])
test = data.TabularDataset('../input/proces-data/test_for_bert.csv', format='csv',skip_header=True,
        fields=[('text',TEXT),('id',LABEL)])

In [48]:
for i in test:
    print(i.text)
    print(tokenizer.convert_ids_to_tokens(list(i.text)))
    break

In [49]:
# TEXT.build_vocab(train, vectors='glove.6B.100d')
LABEL.build_vocab(train)

In [50]:
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train,test), 
    batch_size = 64,
    shuffle=False,
    sort_within_batch = True,
    sort_key = lambda x: len(x.text),
    device = device)


In [51]:
# test1 = data.BucketIterator(
#     test, 
#     batch_size = 1,
#     shuffle=False,
#     sort_within_batch = True,
#     sort_key = lambda x: len(x.text),
#     device = device)

In [52]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

In [53]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = self.bert(text)[0]
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [55]:
def binary_accuracy(preds, y):

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc
def calc_f1(preds,y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    return rounded_preds.tolist(),y.tolist()
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [56]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.id)
        acc = binary_accuracy(predictions, batch.id)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [65]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    p_arr = []
    r_arr = []
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.id)
            
            acc = binary_accuracy(predictions, batch.id)
            a,b = calc_f1(predictions, batch.id)
            p_arr.extend(a)
            r_arr.extend(b)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    print(classification_report(r_arr,p_arr,digits=4))
    print('binary:', f1_score(r_arr, p_arr, average='binary'))
    print('acc',accuracy_score(r_arr,p_arr))
    print('recall',recall_score(r_arr,p_arr))
    print('pre',precision_score(r_arr,p_arr))
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [58]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False
# LEARNING_RATE = 0.001
optimizer = optim.Adam(params =  model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [59]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [66]:
evaluate(model, valid_iterator, criterion)

In [None]:
Epoch: 09 | Epoch Time: 0m 6s
	Train Loss: 0.180 | Train Acc: 93.81%
	 Val. Loss: 0.269 |  Val. Acc: 89.04%
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.95       355
         1.0       0.93      0.87      0.90       199

    accuracy                           0.93       554
   macro avg       0.93      0.92      0.92       554
weighted avg       0.93      0.93      0.93       554


In [None]:
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.round(torch.sigmoid(model(tensor)))
#     prediction = torch.sigmoid(model(tensor))
    return prediction.item()