In [1]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


In [2]:
## 1. Import Libraries
import pandas as pd
import numpy as np
import os
import csv
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.models import Model

In [3]:
# ## 2. Load Datasets
def load_data_from_files(folder_path, columns):
    all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.txt')]
    dataframes = []
    for file in all_files:
        df = pd.read_csv(file, names=columns
                         , delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
        df['line_id'] = file.split('/')[-1].replace('.txt', '')
        dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)
    
train_data = load_data_from_files('C:\\Users\\ThugCom\\project\\superai_ss5\\super-ai-ss-5-named-entity-recognition\\train\\train'
                                  , ['word', 'pos', 'ner', 'clause_boundary'])
eval_data = load_data_from_files('C:\\Users\\ThugCom\\project\\superai_ss5\\super-ai-ss-5-named-entity-recognition\\eval\\eval'
                                 , ['word', 'pos', 'ner', 'clause_boundary'])
test_data = load_data_from_files('C:\\Users\\ThugCom\\project\\superai_ss5\\super-ai-ss-5-named-entity-recognition\\test\\test'
                                 , ['word', 'pos', 'clause_boundary'])

In [4]:
# ## 3. Preprocess Data

# Clean data
train_data['ner'] = train_data['ner'].apply(lambda x: 'O' if x == 'B' else x)
eval_data['ner'] = eval_data['ner'].apply(lambda x: 'O' if x == 'B' else x)

# Encode words, POS tags, and NER labels
word_encoder = LabelEncoder()
pos_encoder = LabelEncoder()
ner_encoder = LabelEncoder()
clause_encoder = LabelEncoder()

train_data['word'] = word_encoder.fit_transform(train_data['word'])
train_data['pos'] = pos_encoder.fit_transform(train_data['pos'])
train_data['ner'] = ner_encoder.fit_transform(train_data['ner'])
train_data['clause_boundary'] = clause_encoder.fit_transform(train_data['clause_boundary'])

# Create sequences for training
X_word = train_data.groupby('line_id')['word'].apply(list).values
X_pos = train_data.groupby('line_id')['pos'].apply(list).values
X_clause = train_data.groupby('line_id')['clause_boundary'].apply(list).values
Y_ner = train_data.groupby('line_id')['ner'].apply(list).values

del train_data
gc.collect()

# Pad sequences
# max_len = max([len(seq) for seq in X_word])
max_len = int(np.percentile([len(seq) for seq in X_word], 95))

X_word = pad_sequences(X_word, maxlen=max_len, padding='post')
X_pos = pad_sequences(X_pos, maxlen=max_len, padding='post')
X_clause = pad_sequences(X_clause, maxlen=max_len, padding='post')
Y_ner = pad_sequences(Y_ner, maxlen=max_len, padding='post')
# Y_ner = [to_categorical(i, num_classes=len(ner_encoder.classes_)) for i in Y_ner]
# Y_ner = np.array([to_categorical(i, num_classes=len(ner_encoder.classes_)) for i in Y_ner])

# Split train/eval dataset
X_train_word, X_val_word, X_train_pos, X_val_pos, X_train_clause, X_val_clause, Y_train, Y_val = train_test_split(
    X_word, X_pos, X_clause, Y_ner, test_size=0.2, random_state=42
)


In [5]:
print(f"Number of classes in ner_encoder: {len(ner_encoder.classes_)}")
print(f"Any NaN in Y_ner: {np.isnan(Y_ner).any()}")


Number of classes in ner_encoder: 40
Any NaN in Y_ner: False


In [6]:
# vocab_size ของ X_word
word_vocab_size = np.max(X_word) + 1
print(f"Vocab size for X_word: {word_vocab_size}")

# vocab_size ของ X_pos
pos_vocab_size = np.max(X_pos) + 1
print(f"Vocab size for X_pos: {pos_vocab_size}")

# vocab_size ของ X_clause
clause_vocab_size = np.max(X_clause) + 1
print(f"Vocab size for X_clause: {clause_vocab_size}")

print(f"Max value in X_word: {np.max(X_word)}, Min value: {np.min(X_word)}")
print(f"Max value in X_pos: {np.max(X_pos)}, Min value: {np.min(X_pos)}")
print(f"Max value in X_clause: {np.max(X_clause)}, Min value: {np.min(X_clause)}")

print(f"Max value in X_word: {np.max(X_word)}")
print(f"Max value in X_pos: {np.max(X_pos)}")
print(f"Max value in X_clause: {np.max(X_clause)}")

Vocab size for X_word: 28036
Vocab size for X_pos: 16
Vocab size for X_clause: 4
Max value in X_word: 28035, Min value: 0
Max value in X_pos: 15, Min value: 0
Max value in X_clause: 3, Min value: 0
Max value in X_word: 28035
Max value in X_pos: 15
Max value in X_clause: 3


In [7]:
# ## 4. Build and Compile the Model

input_word = Input(shape=(max_len,), name='word_input')
input_pos = Input(shape=(max_len,), name='pos_input')
input_clause = Input(shape=(max_len,), name='clause_input')

# embedding_word = Embedding(input_dim=len(word_encoder.classes_), output_dim=64, input_length=max_len)(input_word)
# embedding_pos = Embedding(input_dim=len(pos_encoder.classes_), output_dim=16, input_length=max_len)(input_pos)
# embedding_clause = Embedding(input_dim=2, output_dim=8, input_length=max_len)(input_clause)

embedding_word = Embedding(input_dim=word_vocab_size, output_dim=32, input_length=max_len)(input_word)
embedding_pos = Embedding(input_dim=pos_vocab_size, output_dim=8, input_length=max_len)(input_pos)
embedding_clause = Embedding(input_dim=clause_vocab_size, output_dim=4, input_length=max_len)(input_clause)

merged = tf.keras.layers.Concatenate()([embedding_word, embedding_pos, embedding_clause])
lstm_layer = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(merged)
output = TimeDistributed(Dense(len(ner_encoder.classes_), activation='softmax'))(lstm_layer)

model = Model(inputs=[input_word, input_pos, input_clause], outputs=output)
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# del X_word, X_pos, X_clause, Y_ner  # ลบตัวแปรที่ไม่ได้ใช้
# gc.collect()  # เรียก Garbage Collector



In [8]:
# ## 5. Train the Model

# history = model.fit(
#     [X_train_word, X_train_pos, X_train_clause], np.array(Y_train),
#     validation_data=([X_val_word, X_val_pos, X_val_clause], np.array(Y_val)),
#     batch_size=32,
#     epochs=10,
#     verbose=1
# )
history = model.fit(
    [X_word, X_pos, X_clause],  # อินพุต (เป็นลิสต์สำหรับหลายอินพุต)
    Y_ner,                     # เอาต์พุต (one-hot หรือ integer)
    batch_size=12,
    epochs=10,
    validation_split=0.2
)


# Fit the model
# history = model.fit([X_word, X_pos, X_clause], np.array(Y_ner), batch_size=32, epochs=10)

Epoch 1/10
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m915s[0m 3s/step - accuracy: 0.9041 - loss: 0.7344 - val_accuracy: 0.9546 - val_loss: 0.1632
Epoch 2/10
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m773s[0m 3s/step - accuracy: 0.9587 - loss: 0.1458 - val_accuracy: 0.9772 - val_loss: 0.0834
Epoch 3/10
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m765s[0m 3s/step - accuracy: 0.9776 - loss: 0.0792 - val_accuracy: 0.9839 - val_loss: 0.0549
Epoch 4/10
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m750s[0m 3s/step - accuracy: 0.9839 - loss: 0.0547 - val_accuracy: 0.9870 - val_loss: 0.0424
Epoch 5/10
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m762s[0m 3s/step - accuracy: 0.9871 - loss: 0.0421 - val_accuracy: 0.9886 - val_loss: 0.0357
Epoch 6/10
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m946s[0m 4s/step - accuracy: 0.9888 - loss: 0.0354 - val_accuracy: 0.9897 - val_loss: 0.0316
Epoch 7/10
[1m253/253

In [33]:
# ## 6. Prepare Test Data

# เพิ่ม unknown_token เข้าไปใน word_encoder
import numpy as np

unknown_token = '<UNK>'  # สัญลักษณ์แทนคำที่ไม่รู้จัก
if unknown_token not in word_encoder.classes_:
    word_encoder.classes_ = np.append(word_encoder.classes_, unknown_token)

# แปลงข้อมูลใน test_data ด้วยการแทนคำที่ไม่รู้จักด้วย unknown_token
test_data['word'] = test_data['word'].apply(
    lambda x: x if x in word_encoder.classes_ else unknown_token
)

# แปลงข้อมูลเป็นตัวเลข
test_data['word'] = word_encoder.transform(test_data['word'])

if unknown_token not in pos_encoder.classes_:
    pos_encoder.classes_ = np.append(pos_encoder.classes_, unknown_token)

# แปลงข้อมูลใน test_data ด้วยการแทนคำที่ไม่รู้จักด้วย unknown_token
test_data['pos'] = test_data['pos'].apply(
    lambda x: x if x in pos_encoder.classes_ else unknown_token
)

# แปลงข้อมูลเป็นตัวเลข
test_data['pos'] = pos_encoder.transform(test_data['pos'])

# แปลงข้อมูลเป็นตัวเลข
test_data['clause_boundary'] = clause_encoder.transform(test_data['clause_boundary'])

if unknown_token not in clause_encoder.classes_:
    clause_encoder.classes_ = np.append(clause_encoder.classes_, unknown_token)

# แปลงข้อมูลใน test_data ด้วยการแทนคำที่ไม่รู้จักด้วย unknown_token
test_data['clause_boundary'] = test_data['clause_boundary'].apply(
    lambda x: x if x in clause_encoder.classes_ else unknown_token
)

# แปลงข้อมูลเป็นตัวเลข
test_data['clause_boundary'] = clause_encoder.transform(test_data['clause_boundary'])

test_word = test_data.groupby('line_id')['word'].apply(list).values
test_pos = test_data.groupby('line_id')['pos'].apply(list).values
test_clause = test_data.groupby('line_id')['clause_boundary'].apply(list).values

test_word = pad_sequences(test_word, maxlen=max_len, padding='post')
test_pos = pad_sequences(test_pos, maxlen=max_len, padding='post')
test_clause = pad_sequences(test_clause, maxlen=max_len, padding='post')


ValueError: y contains previously unseen labels: 4

In [29]:
print(test_word.shape)
print(test_pos.shape)
print(test_clause.shape)

(482, 2081)
(482, 2081)
(482, 2081)


In [19]:
# ## 7. Predict and Create Submission

predictions = model.predict([test_word, test_pos, test_clause])

# Convert predictions to class indices
predicted_classes = np.argmax(predictions, axis=-1)

# Generate submission
submission = []
for i, seq in enumerate(predicted_classes):
    for j, pred in enumerate(seq):
        submission.append({'index': f'test_{i}_{j}', 'name entity': pred})

submission_df = pd.DataFrame(submission)
submission_df.to_csv('submission.csv', index=False)

InvalidArgumentError: Graph execution error:

Detected at node functional_1/embedding_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Users\ThugCom\anaconda3\Lib\asyncio\windows_events.py", line 322, in run_forever

  File "C:\Users\ThugCom\anaconda3\Lib\asyncio\base_events.py", line 641, in run_forever

  File "C:\Users\ThugCom\anaconda3\Lib\asyncio\base_events.py", line 1987, in _run_once

  File "C:\Users\ThugCom\anaconda3\Lib\asyncio\events.py", line 88, in _run

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\ThugCom\AppData\Local\Temp\ipykernel_13984\4140098615.py", line 3, in <module>

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 562, in predict

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 259, in one_step_on_data_distributed

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 249, in one_step_on_data

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 104, in predict_step

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\layers\layer.py", line 908, in __call__

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\models\functional.py", line 182, in call

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\ops\function.py", line 171, in _run_through_graph

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\models\functional.py", line 637, in call

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\layers\layer.py", line 908, in __call__

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\layers\core\embedding.py", line 140, in call

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\ops\numpy.py", line 5346, in take

  File "C:\Users\ThugCom\anaconda3\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 2093, in take

indices[28,0] = 28036 is not in [0, 28036)
	 [[{{node functional_1/embedding_1/GatherV2}}]] [Op:__inference_one_step_on_data_distributed_187325]

In [None]:
submission_df