In [1]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd


In [2]:
# Link προς το csv αρχείο
url = 'https://raw.githubusercontent.com/vgkourgkoutas/Computational_Intelligence/main/iphi2802.csv'
# Διαβάζουμε το αρχείο CSV με τις επιγραφές και τις χρονολογίες
df = pd.read_csv(url, delimiter='\t')
df.head()

Unnamed: 0,id,text,metadata,region_main_id,region_main,region_sub_id,region_sub,date_str,date_min,date_max,date_circa
0,315181,[φ]ιλεταιρος ευμενου περγαμευς μουσαις. καφισι...,Boiotia — Thespiai — mid-3rd c. BC — BCH 26 (1...,1698,Central Greece (IG VII-IX),1691,"Megaris, Oropia, and Boiotia (IG VII)",mid-3rd c. BC,-275.0,-226.0,0.0
1,201686,μαλκοιδων ηρωνος.,"Crete, W. — Tarrha — 1st-3rd c. AD — IC II xxi...",1699,"Aegean Islands, incl. Crete (IG XI-[XIII])",474,Crete,1st-3rd c. AD,1.0,300.0,0.0
2,153178,βασιλικος.,Makedonia (Bottiaia) — Pella — 3rd/2nd c. BC —...,1692,Northern Greece (IG X),1485,Macedonia,3rd/2nd c. BC,-300.0,-101.0,0.0
3,28582,αισκλαπιει μ [ανεθεκε --].,Epidauria — Epidauros — sinistr. — 6th/5th c. BC,1690,Peloponnesos (IG IV-[VI]),1643,"Epidauria (IG IV²,1)",6th/5th c. BC,-600.0,-401.0,0.0
4,333620,[---]ος αν[εθηκε δαματρι].,Italia — Herakleia (Policoro) — late 4th/early...,1696,"Sicily, Italy, and the West (IG XIV)",1689,"Italy, incl. Magna Graecia",late 4th/early 3rd c. BC,-350.0,-251.0,0.0


In [4]:
# Συνάρτηση RMSE για αξιολόγηση
def rmse(y_true, y_pred):
    return tf.keras.backend.sqrt(tf.keras.backend.mean(tf.keras.backend.square(y_pred - y_true)))

# Εκκαθάριση της συνεδρίας για αποφυγή προβλημάτων μνήμης
tf.keras.backend.clear_session()

# Επιλογή του προεκπαιδευμένου μοντέλου (μπορείτε να επιλέξετε οποιοδήποτε άλλο)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

texts = df["text"].tolist()
labels = df["average_date"] = (df['date_min'] + df['date_max']) / 2

# Κανονικοποίηση των χρονολογιών στο εύρος [0, 1]
scaler = MinMaxScaler()
labels_normalized = scaler.fit_transform(np.array(labels).reshape(-1, 1))

# Διαχωρισμός των δεδομένων σε σύνολα εκπαίδευσης και ελέγχου
X_train, X_test, y_train, y_test = train_test_split(texts, labels_normalized, test_size=0.2, random_state=42)

# Tokenization των κειμένων
train_encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors="tf")
test_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors="tf")

# Compile του μοντέλου
model.compile(loss="mean_squared_error", metrics=[rmse])

# Εκπαίδευση του μοντέλου με Early Stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_rmse',mode='min', verbose=1)
model.fit(
    train_encodings["input_ids"],
    y_train,
    epochs=1,
    batch_size=8,
    validation_data=(test_encodings["input_ids"], y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Αξιολόγηση του μοντέλου στο σύνολο ελέγχου
test_loss, test_rmse = model.evaluate(test_encodings["input_ids"], y_test, verbose=1)
print("Test RMSE:", test_rmse)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test RMSE: 0.22525103390216827
