In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 1. SETUP (Φόρτωση πραγματικού μοντέλου)
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# 2. INPUT DATA
text = "Apple was founded by Steve Jobs in California."

In [5]:
# Tokenization
# return_tensors="pt": Φέρτο σε PyTorch Tensor
inputs = tokenizer(text, return_tensors="pt")
print(inputs['attention_mask'])
print(inputs['input_ids'])
print(f"1. Input IDs Shape: {inputs['input_ids'].shape}")

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[  101,  7302,  1108,  1771,  1118,  3036, 18235,  1116,  1107,  1756,
           119,   102]])
1. Input IDs Shape: torch.Size([1, 12])


In [None]:
# 3. INFERENCE
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    

AttributeError: 'TokenClassifierOutput' object has no attribute 'start_logits'

In [None]:
from torch import logit


print(outputs)
print(logits)

TokenClassifierOutput(loss=None, logits=tensor([[[ 7.5362, -0.5904, -0.8138, -1.1949, -1.6195, -0.4572, -1.1535,
          -1.4725, -1.1045],
         [-0.5476,  0.0943, -1.3713,  1.2435, -2.4477,  7.8553, -1.7852,
          -1.0442, -2.6816],
         [ 9.9153, -1.2348, -0.8506, -2.0552, -2.3920, -1.3129,  0.3082,
          -2.3134, -1.0793],
         [ 9.8475, -0.9621, -1.3455, -1.6084, -2.6286, -0.7910,  0.1654,
          -2.1704, -1.1955],
         [10.6818, -1.3400, -1.3847, -1.1385, -2.1305, -1.5486, -0.6992,
          -1.9663, -1.2993],
         [-0.6558, -1.2828, -2.4333,  9.0901, -1.4148, -0.6516, -2.5449,
          -0.6785, -1.6224],
         [-1.2180, -1.3712, -1.2147, -1.7568,  8.8958, -1.8238,  0.7215,
          -1.3092, -0.3991],
         [ 0.6910, -2.1982, -1.4579, -0.9891,  6.6751, -2.5478,  0.6119,
          -0.8784, -1.2875],
         [10.5854, -1.3067, -0.9439, -1.7781, -2.2146, -1.7052, -0.2939,
          -1.8484, -0.8532],
         [-0.6665, -1.0404, -2.3527, -0.40

In [None]:
print(f"2. Logits Shape: {logits.shape}")

2. Logits Shape: torch.Size([1, 12, 9])


In [None]:
# 4. POST-PROCESSING (TENSOR MANIPULATION)

# Βήμα Α: Βρες την κλάση με το μεγαλύτερο σκορ ΓΙΑ ΚΑΘΕ TOKEN.
# dim=2: Σημαίνει "σύγκρινε τους αριθμούς στον 3ο άξονα
predictions = torch.argmax(logits, dim=2)

print(f"3. Predictions Shape: {predictions.shape}")
print(f"Predictions: {predictions}")

3. Predictions Shape: torch.Size([1, 12])
Predictions: tensor([[0, 5, 0, 0, 0, 3, 4, 4, 0, 7, 0, 0]])


In [None]:
prediction_ids = predictions.squeeze().tolist()
input_ids = inputs['input_ids'].squeeze().tolist()
print(f"Input IDs: {input_ids}")
print(f"Prediction IDs: {prediction_ids}")

Input IDs: [101, 7302, 1108, 1771, 1118, 3036, 18235, 1116, 1107, 1756, 119, 102]
Prediction IDs: [0, 5, 0, 0, 0, 3, 4, 4, 0, 7, 0, 0]


In [None]:
id2label = model.config.id2label


print("\n--- ΑΠΟΤΕΛΕΣΜΑΤΑ ---")
for i, token_id in enumerate(input_ids):
    # Μετατροπή ID λέξης σε κείμενο
    token_str = tokenizer.convert_ids_to_tokens(token_id)
    
    # Μετατροπή ID πρόβλεψης σε Label (π.χ. 'B-ORG')
    label_id = prediction_ids[i]
    label_str = id2label[label_id]
    
    # Φιλτράρισμα: Δεν μας νοιάζουν τα special tokens ([CLS], [SEP]) ούτε τα 'O' (Outside/Ασήμαντα)
    if label_str != 'O' and token_str not in ['[CLS]', '[SEP]']:
        print(f"Token: {token_str:10} | Label: {label_str}")


--- ΑΠΟΤΕΛΕΣΜΑΤΑ ---
Token: Apple      | Label: B-ORG
Token: Steve      | Label: B-PER
Token: Job        | Label: I-PER
Token: ##s        | Label: I-PER
Token: California | Label: B-LOC


### word ids

In [None]:
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased")
text = "I love microservices architecture."

In [None]:
words = text.split()
print(words)

['I', 'love', 'microservices', 'architecture.']


In [None]:
inputs_bert = tokenizer_bert(words, is_split_into_words=True, return_tensors="pt")
print(inputs_bert)
print(inputs_bert['input_ids'])
print(inputs_bert['attention_mask'])
tokens_bert = tokenizer_bert.convert_ids_to_tokens(inputs_bert['input_ids'].squeeze().tolist())
tokens_bert_without_squeeze = tokenizer_bert.convert_ids_to_tokens(inputs_bert['input_ids'].tolist()[0])
print(tokens_bert_without_squeeze)
print(tokens_bert)

{'input_ids': tensor([[  101,  1045,  2293, 12702,  8043,  7903,  2229,  4294,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[  101,  1045,  2293, 12702,  8043,  7903,  2229,  4294,  1012,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
['[CLS]', 'i', 'love', 'micro', '##ser', '##vic', '##es', 'architecture', '.', '[SEP]']
['[CLS]', 'i', 'love', 'micro', '##ser', '##vic', '##es', 'architecture', '.', '[SEP]']


In [None]:
word_ids_bert = inputs_bert.word_ids()
print(word_ids_bert)
print(inputs_bert.word_ids(batch_index=0))

[None, 0, 1, 2, 2, 2, 2, 3, 3, None]
[None, 0, 1, 2, 2, 2, 2, 3, 3, None]


In [None]:
with torch.no_grad():
    outputs_bert = model(**inputs_bert)
    logits_bert = outputs_bert.logits

In [None]:
predictions_bert = torch.argmax(logits_bert, dim=2)
print(f"Predictions BERT Shape: {predictions_bert.shape}")
print(f"Predictions BERT: {predictions_bert}")

Predictions BERT Shape: torch.Size([1, 10])
Predictions BERT: tensor([[0, 3, 3, 4, 4, 0, 0, 0, 0, 0]])


In [None]:
aligned_labels = []
for i, word_id in enumerate(word_ids_bert):
    if word_id is None:
        continue
    label_id = predictions_bert[0][i].item()
    label_str = id2label[label_id]
    aligned_labels.append((words[word_id], label_str))

print("\n--- BERT TOKENIZATION ΑΠΟΤΕΛΕΣΜΑΤΑ ---")
for token, label in aligned_labels:
    if label != 'O':
        print(f"Token: {token:15} | Label: {label}")


--- BERT TOKENIZATION ΑΠΟΤΕΛΕΣΜΑΤΑ ---
Token: I               | Label: B-PER
Token: love            | Label: B-PER
Token: microservices   | Label: I-PER
Token: microservices   | Label: I-PER
