In [22]:
# import mlflow
from lib.constants import PROJECT_DIR, EXPERIMENT_NAME, MLFLOW_URI
from lib.dataset import load_train_data, load_test_data

# Make sure to have the MLFlow server on before running this code.
# mlflow.set_tracking_uri(uri=MLFLOW_URI)
# experiment = mlflow.set_experiment(EXPERIMENT_NAME)
X_train, y_train = load_train_data()
X_test = load_test_data()

In [23]:
import pandas as pd
from lib.sklearn.preprocess import nlp
from sklearn.pipeline import Pipeline

max_words = 750
custom_map = {
    row['asal']: row['tujuan']
    for _, row in pd.read_csv('custom-mapper.csv').iterrows()
}
preprocess_pipeline = Pipeline([
    ('tokenizer', nlp.TextTokenizer()),
    ('formalizer', nlp.WordsFormalizer()),
    ('custom_mapper', nlp.WordsMapper(custom_map)),
    ('lemmatization', nlp.WordsLemmatization()),
    ('special_char_filter', nlp.SpecialCharacterFilter()),
    ('unknown_words_filter', nlp.UnknownWordsFilter()),
    ('text_to_sequence', nlp.TokenSequenceTransformer(max_words=max_words))
])

preprocess_pipeline.fit(X_train + X_test)
X_train_transformed = preprocess_pipeline.transform(X_train)
X_test_transformed = preprocess_pipeline.transform(X_test)
X_train_transformed[:5]

[[4, 189, 3, 259, 245, 343, 329],
 [6, 16, 4, 30, 330, 1, 2, 260, 162, 379],
 [4, 16, 13, 21, 68],
 [5, 14, 1, 38],
 [4, 30, 1, 53, 119, 2, 75, 55]]

In [4]:
min_len = 10
max_len = 20

X_train_seq, y_train_seq = nlp.split_sequences(
    X_train_transformed,
    y=y_train,
    max_len=max_len,
    min_len=min_len
)
X_test_seq = nlp.split_sequences(
    X_test_transformed,
    max_len=max_len,
    min_len=min_len
)

len(X_train_seq)

8679

In [5]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf
print(tf.config.list_physical_devices())

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


2024-03-12 21:50:00.525238: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2024-03-12 21:50:00.527026: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2024-03-12 21:50:02.080091: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-03-12 21:50:02.080155: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (blackbox): /proc/driver/nvidia/version does not exist


In [306]:
import numpy as np
import tensorflow.keras.backend as K


def f1_macro(y_true, y_pred):
    y_true = K.cast(y_true, 'float')
    y_pred = K.cast(y_pred, 'float')

    tp = K.sum(y_true * y_pred, axis=0)
    fp = K.sum(K.max(K.clip(y_pred - y_true, 0, 1), axis=1), axis=0)
    fn = K.sum(K.max(K.clip(y_true - y_pred, 0, 1), axis=1), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2 * p * r / (p + r + K.epsilon())
    return K.mean(f1)

def f1_macro_loss(y_true, y_pred):
    return 1 - f1_macro(y_true, y_pred)

In [317]:
from lib.sklearn.model.classifier import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, LSTM, Maximum

model = Sequential([
    Embedding(input_dim=max_words, output_dim=32, input_length=max_len),
    Bidirectional(LSTM(64)),
    Dense(5, activation='softmax')
])

model_pipeline = KerasClassifier(
    model,
    optimizer='adam',
    loss=f1_macro_loss,
    metrics=[f1_macro],
    batch_size=64,
    epochs=100
)

In [318]:
model_pipeline.fit(X_train_seq, y_train_seq)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [315]:
labels = np.eye(len(np.unique(y_train_seq)))
y_true = np.array([labels[yi - 1] for yi in y_train_seq])

y_pred = model_pipeline.predict_proba(X_train_seq)
# f1_macro(y_true, y_pred)

array([[9.9996817e-01, 2.5228286e-05, 3.3673275e-06, 2.4877096e-07,
        2.9375899e-06],
       [1.4797124e-07, 6.3494889e-09, 1.0494427e-08, 8.7790859e-08,
        9.9999976e-01],
       [1.4175734e-08, 2.8724526e-10, 3.3442370e-11, 7.1463847e-07,
        9.9999928e-01],
       ...,
       [6.7499947e-12, 7.9060734e-14, 2.4154062e-13, 1.6217501e-10,
        1.0000000e+00],
       [3.1674927e-12, 3.2129183e-14, 1.2527712e-13, 4.2963331e-11,
        1.0000000e+00],
       [9.3718411e-10, 5.0294692e-11, 2.8184314e-11, 1.2837758e-07,
        9.9999988e-01]], dtype=float32)

In [319]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_test_seq = pad_sequences(
    X_test_transformed,
    maxlen=max_len
)

predictions = model_pipeline.predict(X_test_seq)
submission = pd.DataFrame({'ID': np.arange(len(predictions)), 'LABEL': predictions})
display(submission)
submission.to_csv('test_submission.csv', index=False)

Unnamed: 0,ID,LABEL
0,0,5
1,1,4
2,2,5
3,3,1
4,4,2
...,...,...
495,495,5
496,496,5
497,497,1
498,498,1


In [320]:
from collections import Counter

Counter(predictions)

Counter({5: 322, 4: 65, 1: 94, 2: 6, 3: 13})