In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from utils import load_metrics

## Preprocess

In [3]:
# Locations of the raw inputs and of the files written during preprocessing.
des_path = 'laptop/'                              # prefix for train.csv / valid.csv / test.csv
review_file_path = 'laptop/amazon_reviews.csv'    # texts that receive label 0
needs_file_path = 'laptop/needs_byasin.csv'       # wide sheet of need statements (label 1)
needs_preprocessed_path = 'laptop/needs.csv'      # flattened single-column copy of that sheet

In [4]:
# Split proportions and the per-text word cap used during preprocessing.
first_n_words = 200       # texts are cut to at most this many whitespace-separated words
train_valid_ratio = 0.8   # share of the non-test rows that is used for training
train_test_ratio = 0.9    # share of all rows that is kept out of the test set

In [5]:
def trim_string(x, max_words=None):
    """Keep at most the first ``max_words`` whitespace-separated words of ``x``.

    Args:
        x: Text to shorten. ``None`` and NaN become an empty string and any other
            non-string value is converted with ``str`` first, so the function can
            be applied to a pandas column that contains missing or numeric cells.
        max_words: Word limit. When omitted, the notebook-level ``first_n_words``
            is used.

    Returns:
        The kept words joined by single spaces; leading whitespace is dropped and
        runs of whitespace between the kept words are collapsed.
    """
    if max_words is None:
        max_words = first_n_words

    # ``x != x`` is only true for NaN, which is how pandas represents a missing cell.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    if not isinstance(x, str):
        x = str(x)

    # maxsplit lets split() stop scanning once enough words were found; the
    # unsplit remainder (if any) lands in the last element and is sliced away.
    words = x.split(maxsplit=max_words)
    return ' '.join(words[:max_words])

In [6]:
# --- Need statements (label 1) -------------------------------------------------
needs = pd.read_csv(needs_file_path, index_col=0)

# Flatten the wide sheet into one long column, column by column. Cells that are
# missing show up as the string 'nan' after astype(str) and are dropped.
kept_columns = []
for position in range(needs.shape[1]):
    column_text = needs.iloc[:, position].astype(str)
    kept_columns.append(column_text[column_text != 'nan'])

# One concat instead of concatenating inside the loop; the column is named '0'
# explicitly so the header does not depend on how many columns the sheet has.
flat_needs = pd.concat(kept_columns, ignore_index=True).to_frame(name='0')
flat_needs.to_csv(needs_preprocessed_path, index=False)

# Read the flattened file back as plain strings: without dtype=str /
# keep_default_na=False a text such as "NA" or "null" would come back as NaN
# (and a purely numeric text as a number), which trim_string cannot split.
data_needs = pd.read_csv(needs_preprocessed_path, dtype=str, keep_default_na=False)

data_needs["label"] = 1
data_needs['0'] = data_needs['0'].apply(trim_string)
data_needs = data_needs.rename(columns={'0': 'text'})


# --- Reviews (label 0) ---------------------------------------------------------
# The review file is expected to carry its text in a column named '0'.
data_review = pd.read_csv(review_file_path)
data_review["label"] = 0
data_review['0'] = data_review['0'].apply(trim_string)
data_review = data_review.rename(columns={'0': 'text'})


In [7]:
# Train - Test
# Needs and reviews are split separately, each with the same ratio and seed.
df_need_full_train, df_need_test = train_test_split(
    data_needs,
    train_size=train_test_ratio,
    random_state=1,
)
df_review_full_train, df_review_test = train_test_split(
    data_review,
    train_size=train_test_ratio,
    random_state=1,
)

In [8]:
# Sanity check: sizes of the need train+valid pool and of the held-out need test set.
df_need_full_train.shape, df_need_test.shape

((963, 2), (107, 2))

In [9]:
# Train - valid
# The non-test rows of each class are split again into training and validation parts.
df_need_train, df_need_valid = train_test_split(
    df_need_full_train,
    train_size=train_valid_ratio,
    random_state=1,
)
df_review_train, df_review_valid = train_test_split(
    df_review_full_train,
    train_size=train_valid_ratio,
    random_state=1,
)

In [10]:
# Sanity check: sizes of the need training and validation parts.
df_need_train.shape, df_need_valid.shape

((770, 2), (193, 2))

In [11]:
# Merge the per-class parts into one frame per split; needs come first, then
# reviews, and the row index is renumbered from zero.
df_train, df_valid, df_test = (
    pd.concat([need_part, review_part], ignore_index=True, sort=False)
    for need_part, review_part in (
        (df_need_train, df_review_train),
        (df_need_valid, df_review_valid),
        (df_need_test, df_review_test),
    )
)

In [12]:
# Write the three splits next to the raw data, without the row index.
for split_frame, split_file in ((df_train, 'train.csv'),
                                (df_valid, 'valid.csv'),
                                (df_test, 'test.csv')):
    split_frame.to_csv(des_path + split_file, index=False)

## Load data

In [10]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [7]:
# Folder the split CSVs are read from, and folder for checkpoints / metrics / plots.
source_folder, des_folder = 'laptop', 'record'

In [15]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# Available checkpoint names are listed at:
# https://huggingface.co/transformers/pretrained_models.html
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [16]:
# Show the token string the tokenizer uses for padding.
tokenizer.pad_token

'[PAD]'

In [17]:
# Model parameter
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)


def encode_text(text):
    """Convert raw text to token ids, truncated by the tokenizer to MAX_SEQ_LEN.

    Letting the tokenizer truncate keeps the closing special token on long
    texts; relying only on the Field's ``fix_length`` cut would slice it off.
    """
    return tokenizer.encode(text, max_length=MAX_SEQ_LEN, truncation=True)


# Fields
# Text is already numericalized by the tokenizer (use_vocab=False) and is padded
# to exactly MAX_SEQ_LEN ids with PAD_INDEX; labels are read as plain numbers.

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=encode_text, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
# Order matters: fields are matched to CSV columns by position (text, label).
fields = [('text', text_field), ('label', label_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path=source_folder, train='train.csv', validation='valid.csv',
                                           test='test.csv', format='CSV', fields=fields, skip_header=True)

# Iterators
# Training: sort=False so the iterator shuffles every epoch (with sort=True the
# batches would always arrive in the same length-sorted order, and the CSV lists
# all needs before all reviews). Examples of similar length are still bucketed.
train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=False, shuffle=True, sort_within_batch=True)
# Validation: fixed, length-sorted order; flagged as a non-training iterator.
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
                            device=device, train=False, sort=True, sort_within_batch=True)
# Test: file order, no shuffling, so predictions line up with test.csv rows.
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)



## Model

In [18]:
class BERT(nn.Module):
    """Thin wrapper around a pretrained ``BertForSequenceClassification``.

    The wrapped model computes the classification loss itself when labels are
    supplied, so ``forward`` returns that loss together with the logits.
    """

    def __init__(self):
        super().__init__()
        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        """Run the classifier on a batch of padded token ids.

        Args:
            text: Integer tensor of token ids, padded with the model's pad id.
            label: Integer tensor of class indices, one per row of ``text``.

        Returns:
            ``(loss, text_features)``: the first two outputs of the wrapped
            model, i.e. the loss and the per-class scores.
        """
        # Mask out padding so the encoder does not attend to [PAD] positions;
        # without an explicit mask every position is treated as real input.
        pad_id = getattr(self.encoder.config, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        attention_mask = (text != pad_id).long()

        loss, text_features = self.encoder(text, attention_mask=attention_mask, labels=label)[:2]
        return loss, text_features

In [20]:
# Training Function

def _batch_loss(model, text, labels):
    """Move one batch to ``device``, cast it to int64 and return the model's loss."""
    labels = labels.to(device=device, dtype=torch.long)
    text = text.to(device=device, dtype=torch.long)
    loss, _ = model(text, labels)
    return loss


def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = des_folder,
          best_valid_loss = float("Inf")):
    """Fine-tune ``model`` and periodically evaluate it on the validation set.

    Every ``eval_every`` optimizer steps the average training loss since the
    last evaluation and the average validation loss are recorded and printed.
    Whenever the validation loss improves, the model and the metric lists are
    saved under ``file_path``; the metric lists are saved once more at the end.

    Args:
        model: Module called as ``model(text, labels)`` that returns a
            ``(loss, outputs)`` pair.
        optimizer: Optimizer over the model's parameters.
        criterion: Unused. The loss is taken from the model's own output; the
            parameter is kept so existing calls keep working.
        train_loader: Iterator yielding ``((text, labels), _)`` training batches.
        valid_loader: Iterator yielding validation batches in the same form.
        num_epochs: Number of passes over ``train_loader``.
        eval_every: Number of training steps between evaluations (values below
            1 are treated as 1).
        file_path: Folder that receives ``model.pt`` and ``metrics.pt``.
        best_valid_loss: Validation loss a checkpoint has to beat to be saved.
    """
    import os

    # NOTE(review): save_checkpoint / save_metrics are not imported anywhere in
    # the notebook; they are assumed to live in utils next to load_metrics —
    # confirm. Importing here makes a missing helper fail before training starts
    # instead of at the first evaluation step.
    from utils import save_checkpoint, save_metrics

    # A training set with fewer than two batches makes the default
    # len(train_iter) // 2 equal to 0, which would break the modulo below.
    eval_every = max(1, eval_every)

    # Make sure the output folder exists before the first save.
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    model_path = file_path + '/' + 'model.pt'
    metrics_path = file_path + '/' + 'metrics.pt'

    # initialize running values
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        print("start epoch:" + str(epoch))
        for (text, labels), _ in train_loader:
            loss = _batch_loss(model, text, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            if global_step % eval_every != 0:
                continue

            # evaluation step: no gradients, dropout etc. switched off
            valid_running_loss = 0.0
            model.eval()
            with torch.no_grad():
                for (valid_text, valid_labels), _ in valid_loader:
                    valid_running_loss += _batch_loss(model, valid_text, valid_labels).item()
            model.train()

            # averages since the previous evaluation
            average_train_loss = running_loss / eval_every
            average_valid_loss = valid_running_loss / len(valid_loader)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                          average_train_loss, average_valid_loss))

            # checkpoint only when the validation loss improved
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(model_path, model, best_valid_loss)
                save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)

    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [45]:
# Build the classifier wrapper (loads the pretrained weights) and move it to the selected device.
model = BERT().to(device)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [90]:
# Adam over all of the model's parameters with a learning rate of 2e-5.
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [95]:
# Start training with the defaults from train's signature (loaders, epochs, output folder).
train(model=model, optimizer=optimizer)

start epoch:0




KeyboardInterrupt: 

In [29]:
# Show which device was selected.
device

device(type='cpu')

In [5]:
import utils

In [12]:
# Plot the training and validation loss recorded during training.
# Requires that training has already written metrics.pt into des_folder.
train_loss_list, valid_loss_list, global_steps_list = load_metrics(des_folder + '/metrics.pt', device)
plt.plot(global_steps_list, train_loss_list, label='Train')
plt.plot(global_steps_list, valid_loss_list, label='Valid')
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
# Save inside des_folder; the path separator was missing before, which wrote
# 'recordtrain_iter.png' into the working directory instead.
plt.savefig(des_folder + '/train_iter.png')

FileNotFoundError: [Errno 2] No such file or directory: 'record/metrics.pt'