# Parameters

https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

In [1]:
!pip install torch
!pip install torchtext
!pip install seaborn
!pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://truwl-pypicloud-admin:****@pypi.truwl.com/simple/
Looking in indexes: https://pypi.org/simple, https://truwl-pypicloud-admin:****@pypi.truwl.com/simple/
Looking in indexes: https://pypi.org/simple, https://truwl-pypicloud-admin:****@pypi.truwl.com/simple/
Looking in indexes: https://pypi.org/simple, https://truwl-pypicloud-admin:****@pypi.truwl.com/simple/


In [22]:
!curl https://raw.githubusercontent.com/truwl/capanno/biowdl-structs/workflows/biowdl/RNA-seq/0.1.0/rnaseq_inputs-metadata.yaml > rnaseq.yaml
!curl https://raw.githubusercontent.com/truwl/capanno/master/workflows/Broad/WholeGenomeGermlineSingleSample/v2.3.1/wgs_inputs-metadata.yaml > wgs.yaml
!curl https://raw.githubusercontent.com/truwl/capanno/master/workflows/ENCODE-DCC/atac-seq-pipeline/v1.9.0/atac-inputs-metadata.yaml > atac.yaml
!curl https://raw.githubusercontent.com/truwl/capanno/master/workflows/ENCODE-DCC/chip-seq-pipeline2/v1.6.0/chip-inputs-metadata.yaml > chip.yaml
!curl https://raw.githubusercontent.com/truwl/capanno/master/workflows/ENCODE-DCC/mirna-seq-pipeline/v1.2.0/mirna_seq_pipeline-inputs-metadata.yaml > mirna.yaml
!curl https://raw.githubusercontent.com/truwl/capanno/gatk-sv/workflows/Broad/gatk-sv-ss/v0.18.2-beta/inputs-metadata.yaml > sv.yaml
!curl https://raw.githubusercontent.com/truwl/capanno/master/workflows/variant/benchmarking/0.2/inputs-metadata.yaml > vb.yaml

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6390  100  6390    0     0  29346      0 --:--:-- --:--:-- --:--:-- 29447
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  7697  100  7697    0     0  21842      0 --:--:-- --:--:-- --:--:-- 21804
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 38523  100 38523    0     0   126k      0 --:--:-- --:--:-- --:--:--  126k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 43259  100 43259    0     0   138k      0 --:--:-- --:--:-- --:--:--  138k
  % Total    % Received % Xferd  Average Speed   Tim

# Libraries

In [69]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch
import yaml
from tqdm import tqdm
# Preliminaries
from collections import Counter


from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [24]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Preliminaries

In [54]:
def runitodf(runitpath):
  with open(runitpath) as metaf:
    meta=yaml.load(metaf,Loader=yaml.Loader)
    metakeys = meta['parameter_meta']
    df = pd.DataFrame(columns=['inputfield','desc','label'])
    for k in metakeys:
      df = df.append({
      "inputfield": k,
      "desc": metakeys[k].get('description',''),
      "label": metakeys[k]['group']
        }, ignore_index=True)
    return(df)

df=runitodf("rnaseq.yaml")
for f in ['wgs','atac','chip','mirna','sv','vb']:
    df=df.append(runitodf("{0}.yaml".format(f)))

df['inputfield']=df['inputfield'].str.split('.').str[1:].str.join(sep='')
df

Unnamed: 0,inputfield,desc,label
0,sampleConfigFile,The sample configuration file,required_inputs
1,dockerImagesFile,A file listing the used docker images,required_inputs
2,starIndex,A list of star index files,required_inputs
3,referenceFasta,A path to a reference fasta,required_inputs
4,referenceFastaFai,The path to the index associated with the refe...,required_inputs
...,...,...,...
51,mpRegionsPath,,Paths
52,odRegionsPath,,Paths
53,sdRegionsPath,,Paths
54,unRegionsPath,,Paths


In [55]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [77]:
# Model parameter
MAX_SEQ_LEN = 255
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
import numpy as np
# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('label', label_field), ('desc', text_field),('inputfield',text_field)]
# TabularDataset

#https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.4, validate_percent=.4, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

traindf, validatedf, testdf = train_validate_test_split(df)
traindf.to_csv(path_or_buf="train.csv", sep=',', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, date_format=None, doublequote=True, escapechar=None, decimal='.', errors='strict')
validatedf.to_csv(path_or_buf="validate.csv", sep=',', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, date_format=None, doublequote=True, escapechar=None, decimal='.', errors='strict')
testdf.to_csv(path_or_buf="test.csv", sep=',', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, date_format=None, doublequote=True, escapechar=None, decimal='.', errors='strict')

train, valid, test = TabularDataset.splits(path=".", train='train.csv', validation='validate.csv',
                                           test='test.csv', format='CSV', fields=fields, skip_header=True)
# Iterators
train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.inputfield),
                            device=device, train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.inputfield),
                            device=device, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)
assert(set(validatedf['label']).issubset(set(traindf['label'])))

# Models

In [78]:
class BERT(nn.Module):

    def __init__(self):
        super(BERT, self).__init__()

        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]

        return loss, text_fea

# Training

In [79]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    
    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

    if save_path == None:
        return
    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [90]:
# Training Function

def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = ".",
          best_valid_loss = float("Inf")):
    
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, inputfield, desc), _ in train_loader:
            labels = labels.type(torch.LongTensor)           
            labels = labels.to(device)
            desc = desc.type(torch.LongTensor)  
            desc = desc.to(device)
            inputfield = inputfield.type(torch.LongTensor)  
            inputfield = inputfield.to(device)
            output = model(desc, inputfield, labels)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    

                    # validation loop
                    for (labels, inputfield, desc), _ in valid_loader:
                        labels = labels.type(torch.LongTensor)           
                        labels = labels.to(device)
                        desc = desc.type(torch.LongTensor)  
                        desc = desc.to(device)
                        inputfield = inputfield.type(torch.LongTensor)  
                        inputfield = inputfield.to(device)
                        output = model(desc, inputfield, labels)
                        loss, _ = output
                        
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    
    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [91]:
model = BERT().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

TypeError: forward() takes 3 positional arguments but 4 were given

In [None]:
model

In [None]:
BERT()

In [None]:
train_loss_list, valid_loss_list, global_steps_list = load_metrics(destination_folder + '/metrics.pt')
plt.plot(global_steps_list, train_loss_list, label='Train')
plt.plot(global_steps_list, valid_loss_list, label='Valid')
plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show() 

# Evaluation

In [None]:
# Evaluation Function

def evaluate(model, test_loader):
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for (labels, title, text, titletext), _ in test_loader:

                labels = labels.type(torch.LongTensor)           
                labels = labels.to(device)
                titletext = titletext.type(torch.LongTensor)  
                titletext = titletext.to(device)
                output = model(titletext, labels)

                _, output = output
                y_pred.extend(torch.argmax(output, 1).tolist())
                y_true.extend(labels.tolist())
    
    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=[1,0], digits=4))
    
    cm = confusion_matrix(y_true, y_pred, labels=[1,0])
    ax= plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    ax.xaxis.set_ticklabels(['FAKE', 'REAL'])
    ax.yaxis.set_ticklabels(['FAKE', 'REAL'])

In [None]:
best_model = BERT().to(device)

load_checkpoint(destination_folder + '/model.pt', best_model)

evaluate(best_model, test_iter)