In [1]:
#import libraries
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import sys
import nltk
import ssl
import re
import sklearn
from sklearn import feature_selection, feature_extraction, naive_bayes, pipeline, metrics
import transformers
from transformers import AutoTokenizer
import torch
import gc
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

#command to install pip packages in jupyter
#{sys.executable} -m pip install sentencepiece


In [2]:
#load data
df = pd.read_csv('spectrum.csv')
df = df.dropna()
df = df[['spectrum', 'body']]

#split data 
train, test = sklearn.model_selection.train_test_split(df, test_size=0.2, random_state=1)
train, val = sklearn.model_selection.train_test_split(train, test_size=0.25, random_state=1)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
val = val.reset_index(drop=True)

In [3]:
#tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [4]:
#model

model = transformers.AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
#tokenizer function
def tokenizer_func(data):
    text = [i[0] for i in data]
    labels = torch.tensor([i[1] for i in data])
    tokened = tokenizer(text,
                        padding='max_length',
                        truncation=True,
                        return_tensors='pt')
    return tokened, labels

In [6]:
#dataset for torch
class Dataset(torch.utils.data.Dataset):

    def __init__(self, data):
        
        self.labels = [1 if x == 'right' else 0 for x in data['spectrum'].values]
        self.dataframe = data

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        return self.dataframe.iloc[idx]
    
    def __getitem__(self, idx):

        texts = self.dataframe.iloc[idx]['body']
        labels = self.labels[idx]

        return texts, labels

In [7]:
#create torch dataset for train
train_data = Dataset(train)

train_dataloader = torch.utils.data.DataLoader(train_data, 
                                               batch_size=5,
                                               shuffle=True,
                                               collate_fn=tokenizer_func)

In [8]:
#train model
def trainer(model, train_dataloader, learning_rate, epochs):
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label  in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                print(train_label)
                print(output.logits.argmax(dim=1))
                
                batch_loss = criterion(output.logits, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.logits.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            
            print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_input): .3f} \
            | Train Accuracy: {total_acc_train / len(train_input): .3f}')

                  
EPOCHS = 1
model = model
LR = 1e-6
              
trainer(model, train_dataloader, LR, EPOCHS)

  0%|                                                 | 0/23830 [00:00<?, ?it/s]

tensor([0, 0, 0, 1, 1])
tensor([0, 0, 0, 0, 0])


  0%|                                      | 1/23830 [00:12<85:18:57, 12.89s/it]

tensor([0, 1, 0, 0, 1])
tensor([0, 0, 0, 0, 0])


  0%|                                     | 1/23830 [00:23<157:27:15, 23.79s/it]


KeyboardInterrupt: 

In [None]:
#create torch dataset for test
test_data = Dataset(test)

test_dataloader = torch.utils.data.DataLoader(test_data, 
                                               batch_size=5,
                                               shuffle=True,
                                               collate_fn=tokenizer_func)

In [None]:
def evaluate(model, test_dataloader):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    pred_labels = []
    y_labels = []
    y_prob = []
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            pred_labels += output.logits.argmax(dim=1).tolist()
            y_labels += test_label.tolist()
            y_prob += nn.functional.softmax(outputs.logits, dim=1)
    
    return(pred_labels, y_labels, y_prob)
    
test_pred_labels, test_y_labels, test_y_prob = evaluate(model, test_dataloader)

In [11]:
#print results
classes = [1,0]
y_test_array = pd.get_dummies(test_y_labels, drop_first=False).values

## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(test_y_labels, test_pred_predicted)
auc = metrics.roc_auc_score(test_y_labels, test_y_prob[:,1])
print("Accuracy:",  round(accuracy,2))
print("Auc:", round(auc,2))
print("Detail:")
print(metrics.classification_report(test_y_labels, test_pred_labels))

fig, ax = plt.subplots(nrows=1, ncols=2)
## Plot roc
for i in range(len(classes)):
    fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],  
                           predicted_prob[:,i])
    ax[0].plot(fpr, tpr, lw=3, 
              label='{0} (area={1:0.2f})'.format(classes[i], 
                              metrics.auc(fpr, tpr))
               )
ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05], 
          xlabel='False Positive Rate', 
          ylabel="True Positive Rate (Recall)", 
          title="Receiver operating characteristic")
ax[0].legend(loc="lower right")
ax[0].grid(True)
    
## Plot precision-recall curve
for i in range(len(classes)):
    precision, recall, thresholds = metrics.precision_recall_curve(
                 y_test_array[:,i], predicted_prob[:,i])
    ax[1].plot(recall, precision, lw=3, 
               label='{0} (area={1:0.2f})'.format(classes[i], 
                                  metrics.auc(recall, precision))
              )
ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall', 
          ylabel="Precision", title="Precision-Recall curve")
ax[1].legend(loc="best")
ax[1].grid(True)
plt.show()

[0, 1, 0, 0, 1]