In [39]:
# !pip install transformers --upgrade
# !pip install datasets

In [40]:
import time
import scipy.sparse as sp
from transformers import EsmTokenizer, EsmModel
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import warnings
warnings.filterwarnings('ignore')

In [41]:
path = "data/"

# Read one protein sequence per line. Only the line terminator is removed, so
# a final line that lacks a trailing newline keeps its last residue (slicing
# with [:-1] would silently drop it).
with open(path + "sequences.txt", "r") as f:
    sequences = [line.rstrip("\n") for line in f]

# Split data into training and test sets.
# Each line of graph_labels.txt is "<protein name>,<label>"; an empty label
# marks a protein that belongs to the test set.
sequences_train = list()
sequences_test = list()
proteins_test = list()
train_target = list()
with open(path + "graph_labels.txt", "r") as f:
    for i, line in enumerate(f):
        t = line.rstrip("\n").split(",")
        label = t[1].strip()
        if len(label) == 0:
            proteins_test.append(t[0])
            sequences_test.append(sequences[i])
        else:
            sequences_train.append(sequences[i])
            train_target.append(int(label))

# Convert to arrays so the later cells can slice and copy them.
sequences_train = np.array(sequences_train)
train_target = np.array(train_target)
sequences_test = np.array(sequences_test)

In [42]:
# Training portion: the first 4400 labelled proteins (np.array copies the slice).
sequences_train_g = np.array(sequences_train[:4400])
y_train_g = np.array(train_target[:4400])

# Validation portion: every labelled protein after the first 4400.
sequences_val_g = np.array(sequences_train[4400:])
y_val_g = np.array(train_target[4400:])


In [43]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

**Import the pretrained `EsmForSequenceClassification` model so that it can be fine-tuned on our classification task**

In [23]:
from transformers import EsmForSequenceClassification

# Pretrained ESM-2 encoder with a classification head for the 18 target
# classes. output_hidden_states=True makes the forward pass also return the
# per-layer hidden states, so embeddings can be inspected later.
model = EsmForSequenceClassification.from_pretrained(
    "facebook/esm2_t6_8M_UR50D",
    num_labels=18,
    output_hidden_states=True,
)

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/config.json
Model config EsmConfig {
  "_name_or_path": "/tmp/facebook/esm2_t6_8M_UR50D",
  "architectures": [
    "EsmForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "emb_layer_norm_before": false,
  "esmfold_config": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 320,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1280,
  "is_folding_model": false,
  "labe

Downloading:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/pytorch_model.bin
Some weights of the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing EsmForSequenceClassification: ['lm_head.decoder.weight', 'esm.contact_head.regression.weight', 'lm_head.layer_norm.bias', 'esm.contact_head.regression.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing EsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenc

In [24]:
from transformers import EsmTokenizer

# Tokenizer loaded from the same checkpoint name as the classification model.
tokenizer = EsmTokenizer.from_pretrained(
    "facebook/esm2_t6_8M_UR50D"
)

Downloading:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/tokenizer_config.json


In [25]:
# Example: how to read the embedding produced by the last hidden layer.
# hidden_states is a tuple (embedding output first, then one entry per encoder
# layer), so index -1 is the last layer; index 1 would be the first layer.
inputs = tokenizer("A", return_tensors="pt")

with torch.no_grad():
    print(model(**inputs).hidden_states[-1].shape)


torch.Size([1, 3, 320])


In [26]:
# Column layout for the datasets: integer class in "label", raw sequence in "text".
my_dict_train = dict(label=y_train_g, text=sequences_train_g)
my_dict_val = dict(label=y_val_g, text=sequences_val_g)

In [27]:
from datasets import Dataset

# Wrap the label/text dictionaries as `datasets.Dataset` objects
# (training split first, validation split second).
dataset_train, dataset_val = (
    Dataset.from_dict(columns) for columns in (my_dict_train, my_dict_val)
)

In [28]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Sequences are truncated and padded with ``padding="max_length"``, so every
    encoded example in the result has the same length.

    Args:
        examples: mapping with a "text" entry holding the sequences to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sequences.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Tokenize both splits in batches (training split first, then validation).
tokenized_datasets_train, tokenized_datasets_val = (
    split.map(tokenize_function, batched=True)
    for split in (dataset_train, dataset_val)
)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [29]:
# Shuffle the tokenized training split with a fixed seed and keep 4000 examples.
small_train_dataset = (
    tokenized_datasets_train
    .shuffle(seed=42)
    .select(range(4000))
)
# Keep the first 400 validation examples, in their original order.
small_eval_dataset = tokenized_datasets_val.select(
    range(400)
)

In [30]:
import numpy as np
from datasets import list_metrics, load_metric

# Load the accuracy metric.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed
# version before re-running.
metric = load_metric(
    "accuracy"
)

In [31]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is either
            the logits array of shape (n_samples, n_classes) or, when the
            model returns extra outputs such as hidden states, a tuple whose
            first element is that logits array.

    Returns:
        dict with a single "accuracy" entry: the fraction of samples whose
        arg-max class equals the reference label.
    """
    logits, labels = eval_pred
    # The model is created with output_hidden_states=True, so the predictions
    # arrive as (logits, hidden_states); np.argmax on that tuple would fail.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

In [36]:
from transformers import TrainingArguments, Trainer

# Write run artefacts under "test_trainer", train for 10 epochs and evaluate
# at the end of every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=10,
    evaluation_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [37]:
# Assemble the fine-tuning loop: the classifier, its training configuration,
# the two dataset subsets and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# Run the fine-tuning loop using the settings given to the Trainer.
trainer.train()

We keep the model checkpoint that performed best on the validation set.

In [51]:
PATH="test_trainer/checkpoint-5000/pytorch_model.bin"
# Load the saved weights onto the CPU first. load_state_dict then copies them
# into the model's own parameters, so this also works on a machine without
# CUDA even if the checkpoint was written from the GPU.
model.load_state_dict(torch.load(PATH, map_location="cpu"))
# Switch to inference mode; as the last expression, this also displays the model.
model.eval()

EsmForSequenceClassification(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(33, 320, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1026, 320, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0): EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=320, out_features=320, bias=True)
              (key): Linear(in_features=320, out_features=320, bias=True)
              (value): Linear(in_features=320, out_features=320, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=320, out_features=320, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((320,), eps=1e-05, eleme

Preprocess the test data for submission

In [11]:
# Softmax over dimension 1, used to turn a (batch, classes) logits tensor
# into per-class probabilities.
soft = nn.Softmax(dim=1)


def tok(sample):
    """Encode `sample` with the module-level tokenizer as PyTorch tensors."""
    encoded = tokenizer(sample, return_tensors="pt")
    return encoded

In [None]:
# Predict class probabilities for each test sequence, one sequence at a time.
# The encoded inputs are moved to the model's device first: after training the
# model may sit on the GPU while the tokenizer output is created on the CPU.
# Note: despite its name, `logits` holds softmax probabilities (one row per
# test protein); the name is kept because the submission cell reads it.
with torch.no_grad():
    logits = np.array([
        soft(model(**tok(s).to(model.device)).logits)[0].tolist()
        for s in sequences_test
    ])


Submission

In [56]:
import csv
import os

y_hat_proba = logits

# Create the output folder if needed so the write cannot fail on a missing
# directory after the (slow) inference step has already run.
os.makedirs('Submissions', exist_ok=True)

# newline='' is what the csv module expects for files it writes; without it
# extra blank rows appear on platforms that translate line endings.
with open('Submissions/fakir_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # Header row: "name" followed by one column per class (18 classes,
    # matching num_labels of the classifier).
    writer.writerow(['name'] + ['class' + str(i) for i in range(18)])
    # One row per test protein: its name followed by its class probabilities.
    for i, protein in enumerate(proteins_test):
        writer.writerow([protein] + y_hat_proba[i, :].tolist())