In [1]:
import tensorflow as tf

from datetime import datetime
from keras import losses
from datasets import load_dataset
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification


2024-05-23 13:47:44.462986: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Register the TensorBoard notebook extension so the %tensorboard magic is available later.
%load_ext tensorboard

In [3]:
# Check whether TensorFlow can see any GPU (an empty list means none were found).
physical_devices = tf.config.list_physical_devices(device_type='GPU')
print(f"GPUs available: {physical_devices}")

GPUs available: []


2024-05-23 13:47:49.679473: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2024-05-23 13:47:49.679682: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:134] retrieving CUDA diagnostic information for host: bc7a507b51ff
2024-05-23 13:47:49.679719: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:141] hostname: bc7a507b51ff
2024-05-23 13:47:49.680135: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:165] libcuda reported version is: 545.23.6
2024-05-23 13:47:49.680206: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:169] kernel reported version is: 535.171.4
2024-05-23 13:47:49.680224: E external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:251] kernel version 535.171.4 does not match DSO version 545.23.6 -- cannot find working devices in this configuration


In [4]:
# Load the preprocessed splits from disk; the bare name at the end renders
# the resulting DatasetDict as the cell output.
path = "../data/processed"
dataset = load_dataset(path=path)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 86914
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 15338
    })
})

In [5]:
# Show one raw training record to see which fields an example carries.
first_train_example = dataset["train"][0]
print(first_train_example)

{'text': 'Malec skóroval padáčkem jako Poborský. Zase střílí góly až v play off V\xa0základní části hokejové extraligy nic, tam brněnský obránce Tomáš\xa0Malec\xa0góly nedává. Naposled se během dvaapadesátikolového maratonu trefil v\xa0sezoně 2013/14. V\xa0play off je to u něho o něčem jiném, tam se slovenský poctivec s\xa0číslem 71 prosazuje každoročně. Povedlo se mu to i letos. Gólem vteřinu před koncem prvního finále v\xa0Liberci poslal Kometu do vedení na 3:2, navíc to byl gól hodně kuriózní.', 'label': 13}


In [6]:
# Check that the dataset was split correctly: both splits must contain
# exactly the same set of label ids.
train_label, test_label = (
    dataset[split_name]['label'] for split_name in ('train', 'test')
)

# Distinct label ids per split
unique_train_label = list(set(train_label))
unique_test_label = list(set(test_label))

# Sorted copies, so the comparison does not depend on set iteration order
train_set_sorted = sorted(unique_train_label)
test_set_sorted = sorted(unique_test_label)

# True only when train and test cover identical labels
are_elements_same = (test_set_sorted == train_set_sorted)
print(train_set_sorted, test_set_sorted, are_elements_same, sep='\n')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
True


In [7]:
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
# NOTE(review): the sample record printed above looks Czech, while this
# checkpoint name suggests an English, uncased model - consider whether a
# multilingual checkpoint would fit the data better; TODO confirm.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [8]:
def tokenize_and_get_lengths(examples):
    """Measure the tokenized length of every text in a batch.

    The texts under ``examples['text']`` are run through the module-level
    ``tokenizer`` with truncation and padding disabled.

    Returns:
        A dict with a single key ``'length'`` holding one token count per
        input text, in the same order as the inputs.
    """
    batch_input_ids = tokenizer(examples['text'], truncation=False, padding=False)['input_ids']
    return {'length': list(map(len, batch_input_ids))}

# Apply the length-measuring function to every split of the dataset.
# The 'text' column is dropped, so only 'label' and the new 'length' remain.
measure = dataset.map(tokenize_and_get_lengths, batched=True, remove_columns=['text'])

# Longest tokenized sequence in the train split
max_seq_train_length = max(measure['train']['length'])
print(f'Max lenght of train sequence: {max_seq_train_length}')

# Longest tokenized sequence in the test split.
# Fix: the original printed max_seq_train_length here, so the test maximum
# was computed but never actually reported.
max_seq_test_length = max(measure['test']['length'])
print(f'Max lenght of test sequence: {max_seq_test_length}')



Max lenght of train sequence: 337
Max lenght of test sequence: 337


In [None]:
# The longest training sequence measured above is 337 tokens, so a cap of
# 512 leaves every training text untruncated.
max_seq_length = 512
encodings = tokenizer(
    dataset['train']['text'],
    padding=True,
    truncation=True,
    max_length=max_seq_length,
)

In [None]:
# Turn the tokenizer output and the training labels into TensorFlow tensors.
input_ids, attention_mask = (
    tf.constant(encodings[field]) for field in ('input_ids', 'attention_mask')
)
labels = tf.constant(dataset['train']['label'])

In [None]:
# Sanity-check the tensor shapes.
# Reuse the tensors already built from `encodings` and the training labels
# instead of converting the full dataset three more times just to read .shape.
print(input_ids.shape)
print(attention_mask.shape)
print(labels.shape)

In [None]:
def create_tf_dataset(input_ids, attention_mask, labels):
    """Build a ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        input_ids: token ids, sliced along the first axis.
        attention_mask: attention mask, sliced along the first axis.
        labels: target labels, sliced along the first axis.

    Returns:
        A dataset whose elements are ``({'input_ids': ..., 'attention_mask': ...}, label)``.
    """
    features = {'input_ids': input_ids, 'attention_mask': attention_mask}
    return tf.data.Dataset.from_tensor_slices((features, labels))

In [None]:
# Classic input pipeline: shuffle -> batch -> prefetch.
batch_size = 32
train_dataset = (
    create_tf_dataset(input_ids, attention_mask, labels)
    # A shuffle buffer smaller than the dataset only mixes examples locally;
    # sizing it to the number of training examples gives a full shuffle each
    # epoch (the original fixed buffer of 10000 covered only part of the data).
    .shuffle(buffer_size=labels.shape[0])
    .batch(batch_size)
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Give every run its own timestamped TensorBoard log directory under logs/fit/.
log_dir = f"logs/fit/{datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=log_dir,
)

In [None]:
# Training
EPOCHS = 3
count_of_categories = 24  # label ids run 0..23 in both splits (checked above)
# Fine-tuning a pretrained transformer needs a small step size; the string
# 'adam' would use Keras' default learning rate (1e-3), which is far above
# the range normally used for this kind of fine-tuning.
LEARNING_RATE = 3e-5

model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=count_of_categories,
)
# The loss is given from_logits=True, i.e. it is applied to raw logits.
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
# train_dataset is already batched, so batch_size must not be passed to fit()
# (Keras documents batch_size as unsupported for dataset inputs).
model.fit(train_dataset, epochs=EPOCHS, callbacks=[tensorboard_callback])

In [None]:
# Open TensorBoard inline, pointed at the parent directory that holds the per-run logs.
%tensorboard --logdir logs/fit

In [None]:
# Pick one example from the test split and show its text and its true label.
test_item = dataset['test'][10]
print(test_item['text'], test_item['label'], sep='\n')

In [None]:
# Prediction
# Truncate to the same token cap that was used when encoding the training
# data, so an unexpectedly long text cannot exceed what the model was fed
# during training. Texts within the cap are tokenized exactly as before.
inputs = tokenizer(
    test_item['text'],
    max_length=max_seq_length,
    truncation=True,
    padding=False,
    return_tensors='tf',
)
outputs = model(inputs)
# Take the highest-scoring class from the logits; [0] selects the first
# (and here only) row of the result.
predicted_class_idx = tf.argmax(outputs.logits, axis=-1).numpy()[0]
print(predicted_class_idx)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
# The tokenizer call stays last so its return value remains the cell output.
save_directory = "../my_model"
model.save_pretrained(save_directory=save_directory)
tokenizer.save_pretrained(save_directory=save_directory)