In [164]:
import sys
import os
from pathlib import Path
import pickle
import tqdm
import random
sys.path.append('..')

from transformers import AutoModel, AutoTokenizer, AutoConfig

import json
import pandas as pd

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader, random_split

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yoonapark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yoonapark/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
yelp_path = '../data/yelp_academic_dataset_review.json'

data = []
with open(yelp_path, 'r') as f:
  for line in f.readlines():
    data.append(json.loads(line))
    
data_df = pd.DataFrame(data)

In [236]:
pd.set_option('display.max_colwidth', 100)
yelp_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,6,1,0,Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge u...,2013-05-07 04:34:36
1,GJXCdrto3ASJOqKeVWPi6Q,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,0,0,0,I *adore* Travis at the Hard Rock's new Kelly Cardenas Salon! I'm always a fan of a great blowo...,2017-01-14 21:30:33
2,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,3,0,0,"I have to say that this office really has it together, they are so organized and friendly! Dr. ...",2016-11-09 20:09:03
3,yi0R0Ugj_xUx_Nek0-_Qig,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5.0,0,0,0,"Went in for a lunch. Steak sandwich was delicious, and the Caesar salad had an absolutely delici...",2018-01-09 20:56:38
4,11a8sVPMUFtaC7_ABRkmtw,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1.0,7,0,0,"Today was my second out of three sessions I had paid for. Although my first session went well, I...",2018-01-30 23:07:38


In [175]:
class SentiBERT(nn.Module):
    
    def __init__(self, **kargs):
        super(SentiBERT, self).__init__()
        
        hidden_dim = kargs['hidden_dim']
        self.bert = AutoModel.from_pretrained(kargs['model_name_or_path'])
        
        for param in self.bert.parameters():
            param.requires_grad = False
    
        self.layer1 = nn.Sequential(nn.Linear(hidden_dim,hidden_dim),
                                   nn.ReLU())
        self.layer2 = nn.Linear(hidden_dim, 3)
        
    def forward(self,x): 
        """
        x: x is a list of token tensors, already tokenized by the tokenizer
        """
        x = self.bert(x)
        last_hidden_state_cls = x.last_hidden_state[:, 0, :]
        
        x = self.layer1(torch.squeeze(last_hidden_state_cls))
        out = self.layer2(x)
        
        return out

In [184]:
stop_words = set(stopwords.words('english'))

def stars_to_sentiment(x):
    return 2 if x >= 4.0 else (0 if x <2.0 else 1)

def tokenize_text(text):
    return ' '.join([word for word in word_tokenize(text) if not word.lower() in stop_words])

class ReviewDataset(Dataset):
    def __init__(self, df, tokenizer, max_length, output_path):
        self.text = df['text']
        self.labels = df['stars'].apply(lambda x: stars_to_sentiment(x))
        
        # Remove stop words
        filtered_text = df['text'].apply(lambda x: tokenize_text(x))
        
        # tokenization
        self.text = tokenizer.batch_encode_plus(filtered_text.tolist(), truncation=True, 
                                                     add_special_tokens=True, padding='max_length', max_length=max_length)['input_ids']
        
        
        data_path = Path(output_path, 'tokenized_data.pkl')
        if not Path(data_path).exists():
            with open(data_path, 'wb') as f:
                pickle.dump({'text':self.text, 'labels':self.labels}, f)

        print(f'Loaded {len(self.labels)} examples.')
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        return torch.tensor(self.text[index],dtype=torch.long), torch.tensor(self.labels[index], dtype=torch.long)
    
        

In [185]:
model_name='bert-base-uncased'
max_length = 512
batch_size = 10
output_path = '../data/'
num_epoch = 4
learning_rate = 3e-4
batch_size = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SentiBERT(model_name_or_path=model_name, hidden_dim=768).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

dataset = ReviewDataset(data_df, tokenizer, 512, output_path)
train_set_size = int(len(dataset) * 0.8)
test_set_size = len(dataset) - train_set_size
train_dataset, test_dataset = random_split(dataset, [train_set_size, test_set_size])

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

 

Loaded 100 examples.


In [186]:
seed_val = 42

random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch in range(num_epoch):
    loss = 0
    total_batch = 0
    for batch, labels in train_dataloader:
        optimizer.zero_grad()
        
        output = model(batch.to(device))
        batch_loss = criterion(output, labels)
        
        batch_loss.backward()
        optimizer.step()
        
        loss += batch_loss.item() * batch_size
        total_batch += batch_size
    
    print(f'The total loss at epoch {epoch +1} : {loss:.5f}')
    print(f'The average loss at {epoch +1} : {loss:.5f}')
        
    

The total loss at epoch 1 : 80.72475
The average loss at 1 : 80.72475
The total loss at epoch 2 : 72.78884
The average loss at 2 : 72.78884
The total loss at epoch 3 : 71.81705
The average loss at 3 : 71.81705
The total loss at epoch 4 : 72.06914
The average loss at 4 : 72.06914


In [191]:
model.eval()

y_pred_list = []
with torch.no_grad():
    testset_size = 0
    corrects = 0
    for batch, labels in test_dataloader:
        logits = model(batch.to(device))
        probs = torch.softmax(logits, dim=1)
        pred = probs.argmax(dim=1)
        corrects += (pred == labels)
        testset_size += batch_size
    accuracy = corrects.sum().float() / float(testset_size)
    print(f'The accuracy of the testset is {accuracy:.3f}')


RuntimeError: The size of tensor a (8) must match the size of tensor b (4) at non-singleton dimension 0