In [1]:

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from datasets import DatasetDict
import datasets

In [3]:
# Read the training CSV, take its 'train' split, and rename the "Sequence"
# column to "Seq", the name the preprocessing code reads later on.
dataset_train = (
    load_dataset('csv', data_files='train_all_untailored_seq.csv')['train']
    .rename_column("Sequence", "Seq")
)


Using custom data configuration default-f5bc78e0491ae8ec
Reusing dataset csv (/nfs/home/zlu21/.cache/huggingface/datasets/csv/default-f5bc78e0491ae8ec/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Train / validation split.
# train_test_split() names its held-out part "test", but the Trainer cell further
# down reads tokenized_dataset["validation"]; expose the held-out 1% under that
# key so the notebook runs top to bottom on a fresh kernel.
# A fixed seed keeps the shuffled split identical across kernel restarts.
SPLIT_SEED = 42
_split = dataset_train.train_test_split(test_size=0.01, shuffle=True, seed=SPLIT_SEED)
dataset = DatasetDict({"train": _split["train"], "validation": _split["test"]})

Using custom data configuration zluvolyote--Dream_NLP_Validation-63e26103e7b5e8b8
Reusing dataset csv (/nfs/home/zlu21/.cache/huggingface/datasets/zluvolyote___csv/zluvolyote--Dream_NLP_Validation-63e26103e7b5e8b8/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Show the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['Seq', 'Exp'],
        num_rows: 6739258
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'Index', 'Seq', 'Pred', 'Exp', 'Residual'],
        num_rows: 221215
    })
})

In [9]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding,AutoConfig
from torch.utils.data import DataLoader


# --- Hyper-parameters -------------------------------------------------------
BASE_MODEL = "bert-base-uncased"
LEARNING_RATE = 1e-5
MAX_LENGTH = 128   # every tokenized sequence is padded / truncated to this length
BATCH_SIZE = 128
EPOCHS = 3


# Architecture template for the model configuration (same value as BASE_MODEL).
model_checkpoint = BASE_MODEL

# Start from the checkpoint's configuration, overriding the vocabulary size and
# using a single output unit (num_labels=1) for the regression target.
config = AutoConfig.from_pretrained(model_checkpoint,vocab_size=10,num_labels=1)

# NOTE(review): the tokenizer is loaded from the literal path/name "model_checkpoint",
# not from the `model_checkpoint` variable above. Presumably this is a local directory
# holding a custom tokenizer whose vocabulary matches vocab_size=10 -- confirm.
tokenizer = AutoTokenizer.from_pretrained("model_checkpoint")

# Fail fast if the tokenizer can emit ids outside the embedding table; otherwise the
# mismatch only surfaces as an index error once training starts.
if len(tokenizer) > config.vocab_size:
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens but the model config only allows "
        f"vocab_size={config.vocab_size}; token ids would fall outside the embedding table."
    )

# from_config builds the architecture only; no pretrained weights are loaded.
model = AutoModelForSequenceClassification.from_config(config)

In [10]:
from re import findall
import itertools  
# Define the function that splits a sequence into groups of k nucleotides separated by spaces

def textProcess(s,k,fill=' '):
    """Split ``s`` into consecutive groups of ``k`` items joined by single spaces.

    Args:
        s: Iterable of string items (typically a sequence string).
        k: Group size. Values below 1 produce an empty string.
        fill: String used to pad the last group when ``len(s)`` is not a
            multiple of ``k``.

    Returns:
        The groups concatenated with a single space between them; the last
        group is padded with ``fill`` up to ``k`` items.
    """
    if k < 1:
        return ''
    symbols = list(s)
    pieces = []
    for start in range(0, len(symbols), k):
        window = symbols[start:start + k]
        # Pad a short trailing group so every group has exactly k items.
        padding = [fill] * (k - len(window))
        pieces.append(''.join(window + padding))
    return ' '.join(pieces)

In [None]:
#Preprocess the input and tokenize the sequences, note that only the sequences and their expression values are used

def preprocess_function(examples):
    """Tokenize one example and attach its expression value as a float label.

    Reads the 'Seq' and 'Exp' fields of ``examples``, stores the space-separated
    sequence back on ``examples`` under "text", and returns the tokenizer
    output (padded / truncated to MAX_LENGTH) with an added "label" entry.
    """
    spaced_sequence = textProcess(examples['Seq'], 1)
    examples["text"] = spaced_sequence
    encoded = tokenizer(
        spaced_sequence,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    # Regression target: keep the label as a real number.
    encoded["label"] = float(examples['Exp'])
    return encoded

# Run the preprocessing over every split (48 worker processes per split) and
# collect the tokenized splits under the same split names.
tokenized_dataset = DatasetDict(
    {
        split: dataset[split].map(preprocess_function, num_proc=48)
        for split in dataset
    }
)

In [13]:
# Inspect the columns produced by tokenization.
tokenized_dataset

Dataset({
    features: ['Seq', 'Exp', 'text', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 6739258
})

In [14]:
def selectColumns(dataset,columns=["input_ids","token_type_ids","attention_mask","label"]):
    """Return ``dataset`` with every column not listed in ``columns`` removed.

    Args:
        dataset: Object exposing ``column_names`` and ``remove_columns(list)``.
        columns: Names of the columns to keep. Names that are not present in
            ``dataset`` are ignored (e.g. a tokenizer that does not emit
            ``token_type_ids``) instead of raising ``ValueError``.

    Returns:
        The result of ``dataset.remove_columns`` called with the columns to drop.
    """
    keep = set(columns)
    # Build a fresh list rather than mutating the one returned by column_names.
    cols_to_remove = [name for name in dataset.column_names if name not in keep]
    return dataset.remove_columns(cols_to_remove)


In [15]:
# Reduce every split to the columns kept by selectColumns' defaults,
# updating the existing DatasetDict in place.
tokenized_dataset.update(
    {name: selectColumns(split_ds) for name, split_ds in tokenized_dataset.items()}
)



In [None]:
# Check which columns remain after the column selection.
tokenized_dataset

In [18]:
# List the GPUs visible on this machine.
!nvidia-smi -L


GPU 0: NVIDIA A40 (UUID: GPU-ff27e5c0-dfa7-5051-ab99-8cc865ee9aae)
GPU 1: NVIDIA A40 (UUID: GPU-5688da18-d5ae-9454-2fff-ded0abfb29e7)
GPU 2: NVIDIA A40 (UUID: GPU-0479f7ae-005d-dc81-4d0d-c19f38133fb8)
GPU 3: NVIDIA A40 (UUID: GPU-051d093e-64e2-d9ed-ac6a-4301da29526c)


In [19]:

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    """Compute MSE, MAE and R^2 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``labels`` is reshaped to a
            single column before scoring.

    Returns:
        Dict with the keys "mse", "mae" and "r2".
    """
    predictions, targets = eval_pred
    targets = targets.reshape(-1, 1)
    scorers = {
        "mse": mean_squared_error,
        "mae": mean_absolute_error,
        "r2": r2_score,
    }
    return {name: scorer(targets, predictions) for name, scorer in scorers.items()}


In [20]:
from transformers import TrainingArguments

# EPOCHS and BATCH_SIZE are assigned again here; these are the values the
# training arguments below actually use.
EPOCHS = 3
BATCH_SIZE=36

training_args = TrainingArguments(
    output_dir="zluvolyote/DEREXP",
    # Optimisation
    optim="adamw_torch",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the best r2 at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    metric_for_best_model="r2",
    load_best_model_at_end=True,
    # Data loading
    dataloader_num_workers=2,
    remove_unused_columns=True,
)

Make sure your cuda version supports 'sm_86'

In [21]:
import torch
# Print the CUDA architectures reported by this PyTorch install.
print(torch.cuda.get_arch_list())

['sm_37', 'sm_50', 'sm_60', 'sm_70', 'sm_75', 'sm_80', 'sm_86']


In [22]:
import torch  

from transformers import Trainer
class RegressionTrainer(Trainer):
    """Trainer whose loss is the mean-squared error on the first output column."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE between the model's first output column and the labels.

        Args:
            model: Model called as ``model(**inputs)``.
            inputs: Batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Accepted and ignored. Newer ``Trainer`` versions pass extra
                keywords such as ``num_items_in_batch``; without this the call
                would raise ``TypeError``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # First element of the output, first column: one prediction per example.
        logits = outputs[0][:, 0]

        loss = torch.nn.functional.mse_loss(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

In [23]:
# Wire the model, data splits, metrics and training arguments into the custom trainer.
trainer = RegressionTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics_for_regression,
)



In [24]:
# Show current GPU memory use and utilisation before training starts.
!nvidia-smi

Sat Aug  6 13:47:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          On   | 00000000:17:00.0 Off |                    0 |
|  0%   44C    P0    83W / 300W |   1122MiB / 45634MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A40          On   | 00000000:65:00.0 Off |                    0 |
|  0%   39C    P8    26W / 300W |      3MiB / 45634MiB |      0%      Default |
|       

In [25]:
# Index of the current CUDA device, then the number of CUDA devices PyTorch sees.
print(torch.cuda.current_device(), torch.cuda.device_count(), sep="\n")

0
2


In [None]:
# Start training with the trainer configured above.
trainer.train()