In [None]:
!pip install -q tensorflow-addons
!pip install -q tensorflow
!pip install -q datasets
!pip install -q gensim
!pip install sklearn_crfsuite



In [None]:
import copy
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import datasets
from gensim.models import KeyedVectors
import gensim.downloader as gensim_api
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
from sklearn.model_selection import train_test_split

# Load the tab-separated token/tag table (one token per row) from the url.
url = "https://drive.google.com/uc?id=1LYi4J9yquBzyVE1op9_uyFLPdo9-jq8t"
train_data = pd.read_csv(
    url,
    delimiter='\t',
    header=None,
    names=['word', 'tag'],
    # Read every field as a plain string and switch off NaN sniffing.
    # With the defaults, tokens such as "NA" or "null" are parsed as float NaN,
    # which makes the separator check below fail with a TypeError.
    dtype=str,
    keep_default_na=False,
)

# Convert the words and tags into numerical representations.
# Sorting keeps the word -> index mapping reproducible across kernel restarts
# (the iteration order of a set of strings differs between Python processes).
word2idx = {w: i + 1 for i, w in enumerate(sorted(set(train_data['word'])))}
tag2idx = {'O': 0, 'B_PRODUCT': 1, 'I_PRODUCT': 2}
train_data['word_idx'] = train_data['word'].map(word2idx)
train_data['tag_idx'] = train_data['tag'].map(tag2idx)

# Per-example lists: tokens, numeric tag ids and string tags.
tokens = []
ids = []
ner_tags = []
# Accumulators for the example currently being read.
temp_token_list = []
temp_id_list = []
temp_ner_tag_list = []

# Walk the table once; a row whose word starts with "|" separates two examples.
for word, tag in zip(train_data['word'], train_data['tag']):
    if not isinstance(word, str) or not word:
        # Blank token cell: nothing to add and not a separator either.
        continue

    if word.startswith('|'):
        # Close the running example (if any) and start a new one.
        if temp_token_list:
            tokens.append(temp_token_list)
            ids.append(temp_id_list)
            ner_tags.append(temp_ner_tag_list)
        temp_token_list = []
        temp_id_list = []
        temp_ner_tag_list = []
        continue

    # Regular token row. An unknown tag raises KeyError on purpose so that
    # malformed rows are noticed instead of being silently mislabeled.
    temp_token_list.append(word)
    temp_id_list.append(tag2idx[tag])
    temp_ner_tag_list.append(tag)

# Add the last example, which has no trailing separator row.
if temp_token_list:
    tokens.append(temp_token_list)
    ids.append(temp_id_list)
    ner_tags.append(temp_ner_tag_list)

# Split the examples into train and test sets (80/20, fixed seed).
tokens_train, tokens_test, ids_train, ids_test, ner_tags_train, ner_tags_test = train_test_split(
    tokens, ids, ner_tags, test_size=0.2, random_state=42)

# Create a dictionary with 'tokens', 'id', and 'ner_tags' keys for train and test datasets
train_dict = {'tokens': tokens_train, 'id': ids_train, 'ner_tags': ner_tags_train}
test_dict = {'tokens': tokens_test, 'id': ids_test, 'ner_tags': ner_tags_test}

# Create Dataset objects with the train_dict and test_dict dictionaries
train_dataset = Dataset.from_dict(train_dict)
test_data = Dataset.from_dict(test_dict)

In [None]:
# Look at the first training example only.
for item in train_dataset:
    sample_tokens, sample_tag_ids = item['tokens'], item["id"]
    print(sample_tokens)
    print(sample_tag_ids)
    break

# Tag names in the same order as the numeric ids stored in the dataset.
raw_tags = ['O', 'B_PRODUCT', 'I_PRODUCT']
print(raw_tags)
print(type(raw_tags))

# Translate the sample's numeric ids back to tag names.
sample_tags = list(map(raw_tags.__getitem__, sample_tag_ids))

print(sample_tokens)
print(sample_tags)

# Model-side tag list: `<PAD>` takes index 0, real tags follow.
tags = ['<PAD>', *raw_tags]
print(tags)

TAG_SIZE = len(tags)
VOCAB_SIZE = 20000
EMBEDDING_DIM = 300  # embedding dimension (300, the width of the word2vec vectors)

# Lower-cased ragged tensor of all training tokens.
train_tokens = tf.map_fn(
    tf.strings.lower,
    tf.ragged.constant(train_dataset["tokens"]),
)

['CUTTING', 'SET', 'HSS', 'PIE']
[1, 2, 2, 2]
['O', 'B_PRODUCT', 'I_PRODUCT']
<class 'list'>
['CUTTING', 'SET', 'HSS', 'PIE']
['B_PRODUCT', 'I_PRODUCT', 'I_PRODUCT', 'I_PRODUCT']
['<PAD>', 'O', 'B_PRODUCT', 'I_PRODUCT']


In [None]:
# Pre-trained 300-dimensional Google News word2vec vectors (large download).
word2vec_model = gensim_api.load("word2vec-google-news-300")

# String -> integer id lookup over the full word2vec vocabulary.
# With mask_token=None no id is reserved for padding; the layer's vocabulary
# is whatever get_vocabulary() returns below.
lookup_layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    #vocabulary=list(word2vec_model.index_to_key)[:20000], mask_token=None
    vocabulary=list(word2vec_model.index_to_key), mask_token=None
)

# One row per lookup id. float32 is requested explicitly: np.zeros defaults to
# float64, which doubles the memory of this (vocabulary x 300) matrix.
embedding_matrix = np.zeros(
    (lookup_layer.vocabulary_size(), EMBEDDING_DIM), dtype=np.float32
)
for i, word in enumerate(lookup_layer.get_vocabulary()):
    # Entries without a word2vec vector keep an all-zero row.
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model.get_vector(word)

# Frozen embedding layer initialised from the word2vec matrix.
# NOTE(review): mask_zero=True treats id 0 as padding, but with mask_token=None
# id 0 looks like it is the lookup layer's out-of-vocabulary id - confirm that
# masking unknown words is intended before wiring this layer into a model.
embedding_layer = tf.keras.layers.Embedding(
    lookup_layer.vocabulary_size(), EMBEDDING_DIM, mask_zero=True,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False
)


In [None]:
def create_data_generator(dataset):
    """Build a zero-argument generator function over `dataset`.

    Args:
        dataset: an iterable of mappings that each have 'tokens' and 'id' keys.

    Returns:
        A callable that, each time it is called, returns a fresh generator
        yielding one `(tokens, id)` pair per item of `dataset`.
    """
    def data_generator():
        yield from ((example['tokens'], example['id']) for example in dataset)

    return data_generator

# Element signature of the generator-backed dataset: a 1-D string tensor of
# tokens and a 1-D int32 tensor of tag ids, both of variable length.
data_signature= (
        tf.TensorSpec(shape=(None,), dtype=tf.string),
        tf.TensorSpec(shape=(None, ), dtype=tf.int32)
)
# TODO: review this step (translated from the original note "look here").
# NOTE: `train_data` is rebound here to a tf.data.Dataset; earlier in the
# notebook the same name holds the pandas DataFrame read from the CSV.
train_data = tf.data.Dataset.from_generator(
    create_data_generator(train_dataset),
    output_signature=data_signature
)


In [None]:
def dataset_preprocess(tokens, tag_ids):
    """Prepare one (tokens, tag_ids) example for the model.

    Tokens go through `preprocess_tokens`. Tag ids are shifted up by one
    because `<PAD>` is the first element of the tags list.

    Returns:
        A `(preprocessed_tokens, preprocessed_tag_ids)` tuple.
    """
    return preprocess_tokens(tokens), tag_ids + 1

def preprocess_tokens(tokens):
    """Lower-case `tokens` and map them to integer ids with `lookup_layer`."""
    return lookup_layer(tf.strings.lower(tokens))

BATCH_SIZE = 512

# Map raw examples to ids, pad every batch to its longest sequence, then cache
# the padded batches so later epochs skip the generator and the lookup.
_mapped_examples = train_data.map(dataset_preprocess)
_padded_batches = _mapped_examples.padded_batch(batch_size=BATCH_SIZE)
train_dataset = _padded_batches.cache()

In [None]:
def build_embedding_bilstm_crf_model(
    vocab_size: int, embed_dims: int, lstm_unit: int, tag_size: int
) -> tf.keras.Model:
    """Build an Embedding -> BiLSTM -> CRF sequence tagging model.

    Args:
        vocab_size: currently ignored; the embedding's input dimension is taken
            from `lookup_layer.vocabulary_size()` so that every id produced by
            the lookup layer has a row.
        embed_dims: currently ignored; the embedding output dimension is fixed
            at 300.
        lstm_unit: number of units of each direction of the bidirectional LSTM.
        tag_size: number of tags passed to the CRF layer.

    Returns:
        A Keras model mapping a batch of int32 id sequences to the four CRF
        layer outputs `[decode_sequence, potentials, sequence_length, kernel]`.
    """
    x = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="x")

    # `vocabulary_size()` is the supported accessor (and the one used elsewhere
    # in this notebook); the older `vocab_size()` alias is deprecated.
    word2vec_embedding = tf.keras.layers.Embedding(
        input_dim=lookup_layer.vocabulary_size(),
        output_dim=300,
        embeddings_initializer="uniform",
        mask_zero=False,
        name="word2vec_embedding",
    )
    y = word2vec_embedding(x)

    y = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_unit, return_sequences=True)
    )(y)
    decode_sequence, potentials, sequence_length, kernel = tfa.layers.CRF(tag_size)(y)

    return tf.keras.Model(
        inputs=x, outputs=[decode_sequence, potentials, sequence_length, kernel]
    )

In [None]:
# Build the tagger (LSTM width 64) and run the sample through it once.
model = build_embedding_bilstm_crf_model(
    vocab_size=VOCAB_SIZE, embed_dims=32, lstm_unit=64, tag_size=TAG_SIZE
)

preprocessed_tokens = preprocess_tokens(sample_tokens)
# Add a leading batch axis: (seq_len,) -> (1, seq_len).
inputs = tf.expand_dims(preprocessed_tokens, 0)

# The first model output is the decoded tag-id sequence; the rest is unused here.
outputs, *_ = model(inputs)
print(outputs[0])

@tf.function
def crf_loss_func(potentials, sequence_length, kernel, y):
    """Mean negative CRF log-likelihood of the tag ids `y`.

    `potentials`, `sequence_length` and `kernel` are passed straight to
    `tfa.text.crf_log_likelihood` together with `y`.
    """
    log_likelihood = tfa.text.crf_log_likelihood(
        potentials, y, sequence_length, kernel
    )[0]
    # Negate so that maximising the likelihood becomes minimising the loss.
    return tf.reduce_mean(-1 * log_likelihood)

# Adam optimizer (0.02 passed as its first argument) used by train_step.
optimizer = tf.keras.optimizers.Adam(0.02)
# Running mean of the per-batch training loss; updated inside train_step.
train_loss = tf.keras.metrics.Mean(name="train_loss")

@tf.function(experimental_relax_shapes=True)
def train_step(x, y):
    """Run one optimisation step on a batch and record its loss in `train_loss`.

    Args:
        x: batch of token ids fed to `model`.
        y: batch of tag ids the CRF loss is computed against.
    """
    with tf.GradientTape() as tape:
        # The decoded sequence (first output) is not needed for the loss.
        _, potentials, sequence_length, kernel = model(x)
        loss = crf_loss_func(potentials, sequence_length, kernel, y) + tf.reduce_sum(
            model.losses
        )

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    train_loss(loss)




In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    # Start every epoch with a fresh running mean.
    train_loss.reset_states()

    for x, y in train_dataset:
        train_step(x, y)

    print(f"Epoch {epoch + 1}, Loss: {train_loss.result()}")

print("raw inputs: ", sample_tokens)

# Ids of the sample tokens, reused by the prediction cells below.
preprocessed_inputs = preprocess_tokens(sample_tokens)

In [None]:
# Reshape the sample ids into a batch of one sequence.
inputs = tf.reshape(preprocessed_inputs, shape=[1, -1])

# Only the first model output (decoded tag ids) is used.
outputs, *_ = model.predict(inputs)
prediction = list(map(tags.__getitem__, outputs[0]))

print("ground true tags: ", sample_tags)
print("predicted tags: ", prediction)

In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Prepare test data
test_tokens = tf.ragged.constant(test_data["tokens"])
test_tokens = tf.map_fn(tf.strings.lower, test_tokens)
test_dataset = (
    tf.data.Dataset.from_generator(
        create_data_generator(test_data), output_signature=data_signature
    )
    .map(dataset_preprocess)
    .padded_batch(batch_size=BATCH_SIZE)
)

# Evaluation: collect one list of tag names per test sequence.
y_true = []
y_pred = []

for x, y in test_dataset:
    decoded_sequence, *_ = model.predict(x)
    for true_ids, pred_ids in zip(y.numpy(), decoded_sequence):
        # padded_batch fills short sequences with 0, and real tag ids are >= 1
        # after the +1 shift in dataset_preprocess. Drop the padded positions so
        # that trivially "correct" <PAD> labels do not inflate the scores.
        real_positions = true_ids != 0
        # Convert tag IDs to tags
        y_true.append([tags[i] for i in true_ids[real_positions]])
        y_pred.append([tags[i] for i in pred_ids[real_positions]])

# Flatten the sequences
y_true_flat = [tag for seq in y_true for tag in seq]
y_pred_flat = [tag for seq in y_pred for tag in seq]

# Score the real tags only; index 0 of `tags` is the padding label.
eval_labels = tags[1:]

# Calculate metrics
f1_score = metrics.flat_f1_score(y_true, y_pred, average='weighted', labels=eval_labels)
accuracy = metrics.flat_accuracy_score(y_true, y_pred)
precision = metrics.flat_precision_score(y_true, y_pred, average='weighted', labels=eval_labels)

# Print metrics
print("F1 Score:", f1_score)
print("Accuracy:", accuracy)
print("Precision:", precision)

In [None]:
# A second, hand-written example with its expected tag ids.
sample_tokens_two = ['JEWELLERY', 'MAKING', 'TOOLS', 'WOODEN', 'PIN']
sample_tag_two_ids = [0, 0, 0, 1, 2]

sample_tags_two = list(map(raw_tags.__getitem__, sample_tag_two_ids))

print(sample_tokens_two)
print(sample_tags_two)

print("raw inputs: ", sample_tokens_two)

preprocessed_inputs_two = preprocess_tokens(sample_tokens_two)
# Expand the batch dimension: one sequence per batch.
inputs = tf.reshape(preprocessed_inputs_two, shape=[1, -1])

# Only the first model output (decoded tag ids) is used.
outputs, *_ = model.predict(inputs)
prediction = list(map(tags.__getitem__, outputs[0]))
print(preprocessed_inputs_two)
print("ground true tags: ", sample_tags_two)
print("predicted tags: ", prediction)


print(outputs)


In [None]:
# Expand the batch dimension: one sequence per batch.
inputs = tf.reshape(preprocessed_inputs, shape=[1, -1])

# Only the first model output (decoded tag ids) is used.
outputs, *_ = model.predict(inputs)
prediction = list(map(tags.__getitem__, outputs[0]))
print(preprocessed_inputs)
print("raw inputs: ", sample_tokens)
print("ground true tags: ", sample_tags)
print("predicted tags: ", prediction)


print(outputs)