In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import activations, losses
from datasets import load_dataset
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import os

In [None]:
# List the GPU devices visible to TensorFlow and report them.
physical_devices = tf.config.list_physical_devices(device_type='GPU')
print("GPUs available:", physical_devices)

In [None]:
# Location of the processed dataset, relative to this notebook.
path = "../data/processed"

# Load it and leave the result as the cell's last expression so it is displayed.
dataset = load_dataset(path)
dataset

In [None]:
# Peek at the first training record to see which fields it carries.
first_train_record = dataset["train"][0]
print(first_train_record)

In [None]:
# Pull the label column out of both splits.
label_columns = {split: dataset[split]['encoded_text'] for split in ('train', 'test')}
train_encoded_text = label_columns['train']
test_encoded_text = label_columns['test']

# Reduce each column to its distinct values.
unique_train_encoded_text = list(set(train_encoded_text))
unique_test_encoded_text = list(set(test_encoded_text))

# Put the distinct values in ascending order so the two lists are comparable.
train_set_sorted = sorted(unique_train_encoded_text)
test_set_sorted = sorted(unique_test_encoded_text)

# True only when train and test contain exactly the same label values.
are_elements_same = train_set_sorted == test_set_sorted
for report_item in (train_set_sorted, test_set_sorted, are_elements_same):
    print(report_item)

In [None]:
# Checkpoint identifier; the same name is used later when the classifier is loaded.
MODEL_NAME = "distilbert-base-uncased"

# Tokenizer that belongs to that checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    MODEL_NAME,
)

In [None]:
def tokenize_and_get_lengths(examples):
    """Return the tokenized length of every text in a batch.

    The texts in ``examples['text']`` are run through the notebook-level
    ``tokenizer`` with truncation and padding both disabled, so each length
    is that of the full encoded sequence.

    Args:
        examples: Batch mapping with a ``'text'`` entry; this function is
            passed to ``dataset.map(..., batched=True)``.

    Returns:
        dict: ``{'length': [...]}`` with one integer per input text.
    """
    encoded_batch = tokenizer(examples['text'], truncation=False, padding=False)
    lengths = list(map(len, encoded_batch['input_ids']))
    return {'length': lengths}

# Apply the length-measuring function to the whole dataset.
measure = dataset.map(
    tokenize_and_get_lengths,
    batched=True,
    remove_columns=['text'],
)

# Find the maximum tokenized length in the training split.
train_lengths = measure['train']['length']
max_seq_length = max(train_lengths)
print(f'Max lenght of sequence: {max_seq_length}')


In [None]:
# Fixed cap on the encoded length; this replaces the value measured above.
max_seq_length = 512

# Tokenize the training texts, truncating anything longer than the cap
# and padding the rest so that every row has the same length.
train_texts = dataset['train']['text']
encodings = tokenizer(
    train_texts,
    max_length=max_seq_length,
    truncation=True,
    padding=True,
)

In [None]:
# Turn the tokenizer output and the label column into TensorFlow tensors.
input_ids, attention_mask = (
    tf.constant(encodings[field]) for field in ('input_ids', 'attention_mask')
)
train_label_values = dataset['train']['encoded_text']
labels = tf.constant(train_label_values)

In [None]:
# Show the shapes of the feature and label tensors built in the previous cell.
# Reusing those tensors avoids converting the whole training set a second
# time just to read three shapes; the printed output is unchanged.
print(input_ids.shape)
print(attention_mask.shape)
print(labels.shape)

In [None]:
def create_tf_dataset(input_ids, attention_mask, labels):
    """Build a ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        input_ids: Token ids, sliced along the first axis.
        attention_mask: Attention mask, sliced along the first axis.
        labels: Target values, sliced along the first axis.

    Returns:
        A dataset created with ``tf.data.Dataset.from_tensor_slices`` whose
        elements are tuples ``(features, label)``, where ``features`` is a
        dict with the keys ``'input_ids'`` and ``'attention_mask'``.
    """
    features = {'input_ids': input_ids, 'attention_mask': attention_mask}
    return tf.data.Dataset.from_tensor_slices((features, labels))

In [None]:
batch_size = 32

# Training input pipeline: slice the tensors, shuffle with a 10000-element
# buffer, group into batches and prefetch.
train_dataset = (
    create_tf_dataset(input_ids, attention_mask, labels)
    .shuffle(10000)
    .batch(batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
EPOCHS = 3
# Size of the classification head; must be larger than the highest label id
# found in the `encoded_text` column.
count_of_categories = 24
# Step size for fine-tuning. Adam's default (1e-3) is far too large for a
# pretrained transformer and commonly wrecks the pretrained weights, so a
# small rate in the usual 2e-5..5e-5 fine-tuning range is set explicitly.
LEARNING_RATE = 5e-5

# Pretrained DistilBERT with a freshly initialised classification head.
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=count_of_categories)

# Labels are integer class ids and the model returns raw logits.
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# `train_dataset` is already batched by the input pipeline, so `batch_size`
# is deliberately not passed to fit(): Keras documents that it must not be
# specified for dataset inputs.
model.fit(train_dataset, epochs=EPOCHS)