# HuggingFace: fine-tuning a pretrained SMILES model for binary classification of `P1`

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Load the raw table; later cells read its "P1", "mol_id" and "smiles" columns.
# The path is relative to the notebook's working directory.
dataset_csv = "../dataset_single.csv"
df = pd.read_csv(filepath_or_buffer=dataset_csv)

In [3]:
# Recode the target: rows where P1 == 1 become 0, every other value becomes 1.
# NOTE: P1 is overwritten in place, so running this cell a second time recodes the
# already-recoded column (for 0/1 data that flips the labels back). Re-run from the
# CSV load before repeating it.
p1_is_not_one = df["P1"].ne(1)
df["P1"] = p1_is_not_one.astype(int)

In [4]:
# Remove the molecule identifier column before splitting.
# `df` is rebound instead of mutated with inplace=True, and errors="ignore" makes the
# cell idempotent: re-running it no longer raises KeyError once "mol_id" is gone.
df = df.drop(columns=["mol_id"], errors="ignore")

In [5]:
# Hold out 10% of the rows for validation.
# A fixed random_state makes the split repeatable, so validation results from different
# runs of the notebook are computed on the same rows.
# NOTE(review): if P1 is imbalanced, consider also passing stratify=df["P1"].
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [6]:
from datasets import Dataset

# Convert both pandas splits to Hugging Face Datasets.
# After train_test_split the frames carry a shuffled, non-default index; with the default
# settings that index is stored as an extra "__index_level_0__" column. It is not a
# feature, so it is dropped at conversion time.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import os
# Set the CURL_CA_BUNDLE environment variable to an empty string for this process.
# NOTE(review): this is presumably the common workaround for certificate errors when
# downloading from the Hub behind a proxy; in some versions of `requests` an empty value
# turns TLS certificate verification off. Confirm it is needed, and prefer pointing the
# variable at a valid CA bundle instead of blanking it.
os.environ["CURL_CA_BUNDLE"]=""

In [8]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Hub identifier of the pretrained checkpoint; the tokenizer is loaded from it here.
checkpoint = "mrm8488/chEMBL26_smiles_v2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
# Open question: which num_labels to use. The model could be loaded here with, e.g.:
# model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)



In [9]:
# Sanity check: tokenize a single SMILES string and display the resulting encoding.
example_smiles = "CN(C)C1(c2nnnn2-c2ccc(Cl)cc2)CCCCC1"
tokenizer(example_smiles)

{'input_ids': [0, 282, 12, 39, 13, 39, 21, 12, 71, 22, 479, 22, 17, 71, 22, 263, 12, 275, 13, 261, 22, 13, 295, 21, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
def tokenize_function(example):
    """Tokenize the ``smiles`` field of ``example`` with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    Returns whatever the tokenizer returns for that input.
    """
    smiles = example["smiles"]
    return tokenizer(smiles, truncation=True)


# Apply the tokenizer to both splits (train first, then test), passing batches of rows
# to the function rather than single rows.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

100%|██████████| 5/5 [00:00<00:00, 13.92ba/s]
100%|██████████| 1/1 [00:00<00:00, 27.22ba/s]


In [11]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; return_tensors="tf" requests TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

In [12]:
# Batch size handed to to_tf_dataset for the training pipeline.
batch_size = 4

In [13]:
def _as_tf_dataset(tokenized_split, shuffle, batch_size):
    """Convert a tokenized split to a tf.data pipeline with the shared column setup.

    The input columns are "attention_mask" and "input_ids", the label column is "P1",
    and batches are assembled by the notebook-level ``data_collator``.
    """
    return tokenized_split.to_tf_dataset(
        columns=["attention_mask", "input_ids"],
        label_cols=["P1"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=batch_size,
    )


# Training pipeline: shuffled, batched with the notebook-level `batch_size`.
tf_train_dataset = _as_tf_dataset(tokenized_train_dataset, shuffle=True, batch_size=batch_size)

# Validation pipeline: not shuffled, fixed batch size of 8.
tf_validation_dataset = _as_tf_dataset(tokenized_test_dataset, shuffle=False, batch_size=8)

In [None]:
# Pull the first batch out of the training pipeline so it can be inspected.
batch = next(iter(tf_train_dataset))

In [None]:
# Display the batch taken from tf_train_dataset (a tuple of the feature dict and the labels).
batch

({'attention_mask': <tf.Tensor: shape=(4, 46), dtype=int64, numpy=
  array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1]])>, 'input_ids': <tf.Tensor: shape=(4, 46), dtype=int64, numpy=
  array([[  0, 267,  21, 263, 320,  39,  33,  50,  19, 270, 264,  51,  13,
          309,  22,  71,  12,  39,  13, 265,  23, 269, 321,  13,  83,  21,
            2,   1,   1,   1,   1,   1,   1,

In [None]:
# Display the dataset object's repr to check its element_spec (shapes and dtypes).
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(4, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(4, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(4,), dtype=tf.int64, name=None))>

In [16]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Open questions / TODO:
# - should the 0 and 1 labels be swapped?
# - oversampling?
# - add more metrics
# - experiment.test?

# Seed the Python, NumPy and TensorFlow RNGs so that the randomly initialised
# classification head (and anything else drawing from these RNGs from here on) is
# repeatable between runs, as far as the seeded generators allow.
tf.keras.utils.set_random_seed(42)

num_epochs = 15
checkpoint = "mrm8488/chEMBL26_smiles_v2"
# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_steps = len(tf_train_dataset) * num_epochs

# Learning rate goes from 5e-5 to 0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

# A single output unit (num_labels=1) is used for the binary target; from_pt=True loads
# the checkpoint's PyTorch weights into the TensorFlow model.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, from_pt=True)
# The loss is told it receives logits (from_logits=True). Accuracy is therefore
# thresholded at 0 on the logit, which corresponds to a sigmoid probability of 0.5.
loss = tf.keras.losses.BinaryFocalCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy(threshold=0)
model.compile(optimizer=opt, loss=loss, metrics=[metric])

# Keep the History object so the per-epoch loss and metric values stay available
# (history.history) after training instead of being discarded.
history = model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs)

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15