In [None]:
!pip install simpletransformers

In [None]:
import tensorflow as tf
import tensorflow.compat.v1 as tf1
import numpy as np
from simpletransformers.ner import NERModel, NERArgs

# Switch TensorFlow to TF1 behaviour: BiLSTMModel below is built from
# tf1 placeholders and is evaluated through session.run().
tf1.disable_v2_behavior()

class BiLSTMModel():
    """Bidirectional LSTM token tagger expressed as a TF1 graph.

    Constructing an instance adds placeholders, layers, the loss and the
    training op to the default graph. ``train_batch`` and ``predict_batch``
    then evaluate those tensors inside a caller-supplied session.
    """

    def __init__(self, vocab_size, n_tags, embedding_dim, n_hidden,
                 padding_idx):
        """Build the complete graph.

        Args:
            vocab_size: number of rows of the embedding matrix.
            n_tags: number of output classes per token.
            embedding_dim: width of each embedding vector.
            n_hidden: number of units in each LSTM cell.
            padding_idx: token id treated as padding; positions holding it
                contribute zero to the loss.
        """
        self.declare_placeholders()
        self.build_layers(vocab_size, embedding_dim, n_hidden, n_tags)
        self.compute_predictions()
        self.compute_loss(n_tags, padding_idx)
        self.optimize()

    def declare_placeholders(self):
        """Create the graph inputs.

        ``input_batch`` and ``true_tags`` are 2-D int32 placeholders,
        ``lengths`` is 1-D int32, ``learn_rate`` is a float scalar and
        ``dropout`` is a float scalar that defaults to 1.0 when not fed.
        """
        self.input_batch = tf1.placeholder(dtype=tf.int32, shape=[None, None],
                                           name='input_batch')
        self.true_tags = tf1.placeholder(dtype=tf.int32, shape=[None, None],
                                         name='true_tags')
        self.lengths = tf1.placeholder(dtype=tf.int32, shape=[None],
                                       name='lengths')
        self.dropout = tf1.placeholder_with_default(tf.cast(1.0, tf.float32),
                                                    shape=[])
        self.learn_rate = tf1.placeholder(dtype=tf.float32, shape=[])

    def build_layers(self, vocab_size, embedding_dim, n_hidden, n_tags):
        """Create embedding, BiLSTM and dense layers and store ``self.logits``."""
        # Random normal initialisation scaled by 1 / sqrt(embedding_dim).
        embedding_matrix = np.random.randn(
            vocab_size, embedding_dim) / np.sqrt(embedding_dim)
        embedding_matrix_variable = tf.Variable(initial_value=embedding_matrix,
                                                dtype=tf.float32)

        # self.dropout is handed to DropoutWrapper as both its second and
        # third positional argument (input and output keep probability).
        forward_cell = tf1.nn.rnn_cell.DropoutWrapper(
            tf1.nn.rnn_cell.LSTMCell(n_hidden), self.dropout, self.dropout)
        backward_cell = tf1.nn.rnn_cell.DropoutWrapper(
            tf1.nn.rnn_cell.LSTMCell(n_hidden), self.dropout, self.dropout)

        embeddings = tf.nn.embedding_lookup(embedding_matrix_variable,
                                            self.input_batch)

        (rnn_output_fw, rnn_output_bw), _ = tf1.nn.bidirectional_dynamic_rnn(
            forward_cell, backward_cell, embeddings,
            sequence_length=self.lengths, dtype=tf.float32)
        # Join forward and backward outputs along the feature axis.
        rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

        self.logits = tf1.layers.dense(rnn_output, n_tags, activation=None)

    def compute_predictions(self):
        """Derive per-token class probabilities and arg-max class ids."""
        self.softmax_output = tf.nn.softmax(self.logits)
        self.predictions = tf.argmax(self.softmax_output, axis=-1)

    def compute_loss(self, n_tags, padding_idx):
        """Define ``self.loss``: cross-entropy with padded positions zeroed."""
        one_hot_true_tags = tf.one_hot(self.true_tags, n_tags)
        loss_tensor = tf.nn.softmax_cross_entropy_with_logits(
            one_hot_true_tags, self.logits)

        # 1.0 where the input token differs from padding_idx, else 0.0.
        mask = tf.cast(tf.not_equal(self.input_batch, padding_idx), tf.float32)
        # NOTE(review): reduce_mean divides by every position of the batch,
        # padded ones included, so the loss scale shrinks as padding grows.
        self.loss = tf.reduce_mean(mask * loss_tensor)

    def optimize(self):
        """Create the Adam training op with per-gradient norm clipping."""
        self.optimizer = tf1.train.AdamOptimizer(learning_rate=self.learn_rate)
        self.gradient_var = self.optimizer.compute_gradients(self.loss)

        # Each gradient tensor is clipped on its own to a norm of at most 1.0.
        clip_norm = tf.cast(1.0, tf.float32)
        self.gradient_var = [(tf.clip_by_norm(grad, clip_norm), var) for
                             (grad, var) in self.gradient_var]
        self.train_op = self.optimizer.apply_gradients(self.gradient_var)

    def train_batch(self, session, tokens, tags, lengths,
                    learn_rate, dropout):
        """Run one optimisation step on a batch.

        Args:
            session: object whose ``run`` evaluates graph tensors.
            tokens: value fed to ``input_batch``.
            tags: value fed to ``true_tags``.
            lengths: value fed to ``lengths``.
            learn_rate: value fed to the learning-rate placeholder.
            dropout: value fed to the dropout placeholder.
        """
        placeholders = {self.input_batch: tokens,
                        self.true_tags: tags,
                        self.learn_rate: learn_rate,
                        self.dropout: dropout,
                        self.lengths: lengths}

        session.run(self.train_op, feed_dict=placeholders)

    def predict_batch(self, session, tokens, lengths):
        """Return ``(predictions, softmax_output)`` for a batch.

        Both tensors are fetched in a single ``session.run`` call so the
        network is evaluated once rather than once per fetched tensor.
        Dropout is not fed, so its default of 1.0 applies.
        """
        placeholders = {self.input_batch: tokens,
                        self.lengths: lengths}

        predictions, softmax_output = session.run(
            [self.predictions, self.softmax_output], feed_dict=placeholders)

        return predictions, softmax_output


class TransformerNER():
    """Thin wrapper around a simpletransformers ``NERModel``.

    The output directory and the pretrained checkpoint are parameters whose
    defaults reproduce the previous hard-coded values. Instances that share
    an ``output_dir`` write to the same location with
    ``overwrite_output_dir=True``, so pass distinct directories when several
    models must keep their own checkpoints.
    """

    def __init__(self, epochs, batch_size, labels,
                 output_dir='/output/transformerNER',
                 model_type='bert',
                 model_name='cahya/bert-base-indonesian-1.5G'):
        """Create the model arguments and the underlying NER model.

        Args:
            epochs: value for ``num_train_epochs``.
            batch_size: used for both training and evaluation batches.
            labels: list of tag names, passed to the args and the model.
            output_dir: directory for checkpoints; the best model goes to
                ``<output_dir>/best``.
            model_type: model family name handed to ``NERModel``.
            model_name: pretrained checkpoint name handed to ``NERModel``.
        """
        # Strip a trailing slash so the default yields exactly
        # '/output/transformerNER/best'.
        best_model_dir = output_dir.rstrip('/') + '/best'

        self.model_args = NERArgs(num_train_epochs=epochs,
                                  train_batch_size=batch_size,
                                  eval_batch_size=batch_size,
                                  evaluate_during_training=True,
                                  output_dir=output_dir,
                                  best_model_dir=best_model_dir,
                                  overwrite_output_dir=True,
                                  fp16=False,
                                  labels_list=labels,
                                  do_lower_case=True)

        self.model = NERModel(
            model_type, model_name,
            labels=labels, args=self.model_args
        )

    def train(self, train_data, eval_data):
        """Train the wrapped model, passing ``eval_data`` for evaluation."""
        self.model.train_model(train_data, eval_data=eval_data)

    def evaluate(self, data):
        """Return whatever the wrapped model's ``eval_model`` returns."""
        return self.model.eval_model(data)

    def predict(self, test_data):
        """Return whatever the wrapped model's ``predict`` returns."""
        return self.model.predict(test_data)


In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import sys
import logging

# Emit INFO-level log records in general, but restrict the "transformers"
# logger to warnings and above.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Read the training data, detach the 'POI/street' label column, and tokenise
# both the labels and the first remaining non-id column into runs of word
# characters or single non-space symbols.
train = pd.read_csv('/kaggle/input/scl-address-element-extraction/train.csv')
label = train.pop('POI/street').values

address_rows = train.drop(columns=['id'], axis=1).values
token_pattern = re.compile(r'[\w]+|[^\s\w]')

# Each label is split on '/': part 0 is tokenised as the POI, part 1 as the
# street.
label_parts = [entry.split('/') for entry in label]
poi_tokens = [token_pattern.findall(parts[0]) for parts in label_parts]
street_tokens = [token_pattern.findall(parts[1]) for parts in label_parts]

# Leading/trailing spaces are removed before tokenising the address text.
raw_addr = [token_pattern.findall(row[0].strip(' ')) for row in address_rows]

tokens = []
tags = []

# raw token -> {label token -> number of times it replaced the raw token}
token_dict = {}

tag_set = ['O', 'B-POI', 'B-STREET', 'I-POI', 'I-STREET']

# Placeholder substituted for every all-digit token.
NUMBER_PLACEHOLDER = '15'


def _tag_entity(tokens_lst, tags_lst, entity_tokens, entity, token_dict):
    """Tag the first occurrence of ``entity_tokens`` inside ``tokens_lst``.

    A window matches when every position is still tagged 'O' and each label
    token starts with the address token at that position (the address token
    is a prefix of the label token). On the first match the window is tagged
    ``B-<entity>`` / ``I-<entity>``, each replacement is counted in
    ``token_dict`` and the address tokens are overwritten with the label
    tokens. All three containers are modified in place.

    Returns:
        True if a window was tagged, False otherwise (including when
        ``entity_tokens`` is empty).
    """
    span = len(entity_tokens)
    if span == 0:
        return False

    for start in range(len(tokens_lst) - span + 1):
        window = range(start, start + span)
        fits = all(
            tags_lst[pos] == 'O' and target.startswith(tokens_lst[pos])
            for pos, target in zip(window, entity_tokens)
        )
        if not fits:
            continue

        for offset, (pos, target) in enumerate(zip(window, entity_tokens)):
            tags_lst[pos] = ('B-' if offset == 0 else 'I-') + entity
            # Count the replacement under the token as it was before
            # being overwritten.
            replacements = token_dict.setdefault(tokens_lst[pos], {})
            replacements[target] = replacements.get(target, 0) + 1
            tokens_lst[pos] = target
        return True

    return False


for addr_idx, addr_tokens in enumerate(raw_addr):
    tokens_lst = addr_tokens.copy()
    tags_lst = ['O'] * len(tokens_lst)

    # POI is matched first; the street pass then only considers positions
    # that are still tagged 'O'.
    _tag_entity(tokens_lst, tags_lst, poi_tokens[addr_idx], 'POI', token_dict)
    _tag_entity(tokens_lst, tags_lst, street_tokens[addr_idx], 'STREET',
                token_dict)

    tokens.append([NUMBER_PLACEHOLDER if tok.isdigit() else tok
                   for tok in tokens_lst])
    tags.append(tags_lst)

# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42


def _to_ner_frame(sentences, sentence_tags):
    """Flatten parallel token/tag lists into one row per token.

    Returns a DataFrame with columns 'sentence_id' (position of the sentence
    in ``sentences``), 'words' and 'labels'.
    """
    rows = [
        (sentence_id, word, tag)
        for sentence_id, (words, word_tags)
        in enumerate(zip(sentences, sentence_tags))
        for word, tag in zip(words, word_tags)
    ]
    return pd.DataFrame(rows, columns=['sentence_id', 'words', 'labels'])


train_tokens, valid_tokens, train_tags, valid_tags = train_test_split(
    tokens, tags, test_size=0.2, random_state=SPLIT_SEED)

train_df = _to_ner_frame(train_tokens, train_tags)
valid_df = _to_ner_frame(valid_tokens, valid_tags)


In [None]:
# First tagger: 4 training epochs, batch size 64 (train and eval), using the
# label list in tag_set.
model = TransformerNER(4, 64, tag_set)

In [None]:
# Train on whatever train_df / valid_df currently hold; when cells are run in
# order these come from the POI-first tagging pass.
model.train(train_df, valid_df)

In [None]:
# Re-read the training data from disk, detach the 'POI/street' label column,
# and tokenise labels and address text into runs of word characters or single
# non-space symbols.
train = pd.read_csv('/kaggle/input/scl-address-element-extraction/train.csv')
label = train.pop('POI/street').values

address_rows = train.drop(columns=['id'], axis=1).values
token_pattern = re.compile(r'[\w]+|[^\s\w]')

# Each label is split on '/': part 0 is tokenised as the POI, part 1 as the
# street.
label_parts = [entry.split('/') for entry in label]
poi_tokens = [token_pattern.findall(parts[0]) for parts in label_parts]
street_tokens = [token_pattern.findall(parts[1]) for parts in label_parts]

# Leading/trailing spaces are removed before tokenising the address text.
raw_addr = [token_pattern.findall(row[0].strip(' ')) for row in address_rows]

tokens = []
tags = []

# raw token -> {label token -> number of times it replaced the raw token}
token_dict = {}

tag_set = ['O', 'B-POI', 'B-STREET', 'I-POI', 'I-STREET']

# Placeholder substituted for every all-digit token.
NUMBER_PLACEHOLDER = '15'


def _tag_entity(tokens_lst, tags_lst, entity_tokens, entity, token_dict):
    """Tag the first occurrence of ``entity_tokens`` inside ``tokens_lst``.

    A window matches when every position is still tagged 'O' and each label
    token starts with the address token at that position (the address token
    is a prefix of the label token). On the first match the window is tagged
    ``B-<entity>`` / ``I-<entity>``, each replacement is counted in
    ``token_dict`` and the address tokens are overwritten with the label
    tokens. All three containers are modified in place.

    Returns:
        True if a window was tagged, False otherwise (including when
        ``entity_tokens`` is empty).
    """
    span = len(entity_tokens)
    if span == 0:
        return False

    for start in range(len(tokens_lst) - span + 1):
        window = range(start, start + span)
        fits = all(
            tags_lst[pos] == 'O' and target.startswith(tokens_lst[pos])
            for pos, target in zip(window, entity_tokens)
        )
        if not fits:
            continue

        for offset, (pos, target) in enumerate(zip(window, entity_tokens)):
            tags_lst[pos] = ('B-' if offset == 0 else 'I-') + entity
            # Count the replacement under the token as it was before
            # being overwritten.
            replacements = token_dict.setdefault(tokens_lst[pos], {})
            replacements[target] = replacements.get(target, 0) + 1
            tokens_lst[pos] = target
        return True

    return False


for addr_idx, addr_tokens in enumerate(raw_addr):
    tokens_lst = addr_tokens.copy()
    tags_lst = ['O'] * len(tokens_lst)

    # Street is matched first here; the POI pass then only considers
    # positions that are still tagged 'O'.
    _tag_entity(tokens_lst, tags_lst, street_tokens[addr_idx], 'STREET',
                token_dict)
    _tag_entity(tokens_lst, tags_lst, poi_tokens[addr_idx], 'POI', token_dict)

    tokens.append([NUMBER_PLACEHOLDER if tok.isdigit() else tok
                   for tok in tokens_lst])
    tags.append(tags_lst)

# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42


def _to_ner_frame(sentences, sentence_tags):
    """Flatten parallel token/tag lists into one row per token.

    Returns a DataFrame with columns 'sentence_id' (position of the sentence
    in ``sentences``), 'words' and 'labels'.
    """
    rows = [
        (sentence_id, word, tag)
        for sentence_id, (words, word_tags)
        in enumerate(zip(sentences, sentence_tags))
        for word, tag in zip(words, word_tags)
    ]
    return pd.DataFrame(rows, columns=['sentence_id', 'words', 'labels'])


train_tokens, valid_tokens, train_tags, valid_tags = train_test_split(
    tokens, tags, test_size=0.2, random_state=SPLIT_SEED)

train_df = _to_ner_frame(train_tokens, train_tags)
valid_df = _to_ner_frame(valid_tokens, valid_tags)

In [None]:
# Second tagger: 3 training epochs, batch size 64, same label list.
# NOTE(review): it is created without a separate output location, so it uses
# the same output directory ('/output/transformerNER') as `model`, with
# overwrite_output_dir=True.
model2 = TransformerNER(3, 64, tag_set)

In [None]:
import torch

# Ask PyTorch to release its cached, currently unused GPU memory before the
# second model is trained in the next cell.
torch.cuda.empty_cache()

In [None]:
# Train on whatever train_df / valid_df currently hold; when cells are run in
# order these were rebuilt by the street-first tagging pass.
model2.train(train_df, valid_df)

In [None]:
# Collapse each raw token's replacement counts to its most frequent
# replacement (ties go to the replacement that was recorded first).
# Entries that are no longer dicts are kept as they are, so running this cell
# again is harmless, and no loop variable overwrites the global `tokens` list.
token_dict = {
    raw_token: (max(candidates, key=candidates.get)
                if isinstance(candidates, dict) else candidates)
    for raw_token, candidates in token_dict.items()
}

In [None]:
# Load the test set, drop 'id', and keep the first remaining column of each
# row with leading/trailing spaces removed.
test = pd.read_csv('/kaggle/input/scl-address-element-extraction/test.csv').drop(columns=['id'], axis=1).values
test = [i[0].lstrip(' ').rstrip(' ') for i in test]

# Two independent tokenisations of the same strings: test_split keeps the
# original tokens, test_split_corrected receives the replacements below.
test_split = [re.findall(r'[\w]+|[^\s\w]', i) for i in test]
test_split_corrected = [re.findall(r'[\w]+|[^\s\w]', i) for i in test]

# Replace every token that is a key of token_dict by its mapped value.
# NOTE(review): .isdigit() is called on these values further down, so
# token_dict must already map to strings (collapse cell run beforehand).
for i in range(len(test_split)):
    for j in range(len(test_split[i])):
        if test_split[i][j] in token_dict:
            test_split_corrected[i][j] = token_dict[test_split[i][j]]

# Per sentence: the run of spaces found before each token in the raw string.
test_split_intermediary = []
# Per sentence: the text handed to the model.
test_text = []

for i in range(len(test_split)):
  # pos is a cursor into the raw string test[i].
  pos = 0
  test_split_list = []
  text = ''
  original_text = ''
  for j in range(len(test_split[i])):
      # Collect the spaces that precede token j in the raw string.
      tmp = ''
      while test[i][pos] == ' ':
          pos += 1
          tmp += ' '
      test_split_list.append(tmp)
      # Skip over the original (uncorrected) token.
      pos += len(test_split[i][j])
      # Every token is emitted as: original spaces + one extra space + token.
      # In `text`, all-digit tokens are replaced by the placeholder '15';
      # `original_text` keeps the corrected token itself.
      if test_split_corrected[i][j].isdigit():
          text += tmp + ' 15'
      else:
          text += tmp + ' ' + test_split_corrected[i][j]
      original_text += tmp + ' ' + test_split_corrected[i][j]
    
  # The raw string is overwritten in place by the corrected, re-spaced text.
  test[i] = original_text
  test_text.append(text)
  test_split_intermediary.append(test_split_list)


In [None]:
# Run the first model on the prepared test text and keep element 0 of what
# predict() returns as `predictions`.
raw_predictions = model.predict(test_text)
predictions = raw_predictions[0]

In [None]:
# Preview only the first few sentences instead of dumping every prediction
# into the cell output.
predictions[:5]

In [None]:
# For every predicted word, recover the spacing that preceded it and the
# text to output for it (real digits instead of the '15' placeholder).
# Each predictions[i][j] is read as a dict whose first key is the word and
# whose first value is the tag.
prediction_intermediary = []
prediction_tokens = []

for i in range(len(predictions)):
    # pos walks test_text[i] (placeholder text); text_pos walks test[i]
    # (same layout but with the actual digits).
    pos = 0
    text_pos = 0
    intermed = []
    tokens = []
    for j in range(len(predictions[i])):
        # Count the spaces in front of word j, advancing both cursors.
        tmp = ''
        while test_text[i][pos] == ' ':
            pos += 1
            text_pos += 1
            tmp += ' '
        pred = list(predictions[i][j].keys())[0]
        pos += len(pred)
        if pred == '15':
            # Placeholder: read the consecutive digits found at text_pos in
            # test[i] and use them as the word.
            pred = ''
            for k in range(text_pos, len(test[i])):
                if test[i][k].isdigit() is True:
                    pred += test[i][k]
                    text_pos = k+1
                else:
                    break
        else:
            text_pos += len(pred)
        # Drop one space from the collected run (the preprocessing cell
        # prepends one extra space to every token).
        tmp = tmp[:-1]
        intermed.append(tmp)
        tokens.append(pred)
    prediction_intermediary.append(intermed)
    prediction_tokens.append(tokens)

In [None]:
def _first_entity_text(sentence_preds, gaps, words, entity):
    """Return the text of the first ``entity`` span in one sentence.

    The span starts at the first ``B-<entity>`` tag and extends over the
    ``I-<entity>`` tags that immediately follow it. Each word is preceded by
    its recorded spacing; leading spaces of the result are stripped. Returns
    '' when the sentence has no ``B-<entity>`` tag.

    Args:
        sentence_preds: list of single-entry dicts mapping word -> tag.
        gaps: spacing string recorded before each word.
        words: output text for each word.
        entity: entity name without prefix, e.g. 'POI'.
    """
    tag_seq = [next(iter(pred.values())) for pred in sentence_preds]
    begin_tag = 'B-' + entity
    inside_tag = 'I-' + entity

    if begin_tag not in tag_seq:
        return ''

    start = tag_seq.index(begin_tag)
    end = start + 1
    while end < len(tag_seq) and tag_seq[end] == inside_tag:
        end += 1

    span_text = ''.join(gap + word
                        for gap, word in zip(gaps[start:end], words[start:end]))
    return span_text.lstrip(' ')


result_poi = [
    _first_entity_text(predictions[idx], prediction_intermediary[idx],
                       prediction_tokens[idx], 'POI')
    for idx in range(len(predictions))
]

# Preview only; the full list is used when the submission is assembled.
result_poi[:10]

In [None]:
!nvidia-smi

In [None]:
# Street spans come from the second model. Calling `model` here again would
# only repeat `predictions` and leave the trained `model2` unused.
# The words are the same for both models because both receive test_text, so
# prediction_intermediary / prediction_tokens still line up by index.
raw_predictions2 = model2.predict(test_text)
predictions2 = raw_predictions2[0]

In [None]:
def _first_entity_text(sentence_preds, gaps, words, entity):
    """Return the text of the first ``entity`` span in one sentence.

    The span starts at the first ``B-<entity>`` tag and extends over the
    ``I-<entity>`` tags that immediately follow it. Each word is preceded by
    its recorded spacing; leading spaces of the result are stripped. Returns
    '' when the sentence has no ``B-<entity>`` tag.

    Args:
        sentence_preds: list of single-entry dicts mapping word -> tag.
        gaps: spacing string recorded before each word.
        words: output text for each word.
        entity: entity name without prefix, e.g. 'STREET'.
    """
    tag_seq = [next(iter(pred.values())) for pred in sentence_preds]
    begin_tag = 'B-' + entity
    inside_tag = 'I-' + entity

    if begin_tag not in tag_seq:
        return ''

    start = tag_seq.index(begin_tag)
    end = start + 1
    while end < len(tag_seq) and tag_seq[end] == inside_tag:
        end += 1

    span_text = ''.join(gap + word
                        for gap, word in zip(gaps[start:end], words[start:end]))
    return span_text.lstrip(' ')


# Every entry starts with the '/' that separates POI and street in the
# submission format, followed by the street text (possibly empty).
result_street = [
    '/' + _first_entity_text(predictions2[idx], prediction_intermediary[idx],
                             prediction_tokens[idx], 'STREET')
    for idx in range(len(predictions2))
]

# Preview only; the full list is used when the submission is assembled.
result_street[:10]

In [None]:
# Pair each POI string with its street string ('/'-prefixed) and concatenate.
result = [poi_part + street_part
          for poi_part, street_part in zip(result_poi, result_street)]

In [None]:
# One-column frame holding the combined 'POI/street' strings; displayed as
# the cell's output.
df = pd.DataFrame({'POI/street': result}, columns=['POI/street'])
df

In [None]:
# Write the submission file into the current working directory; the frame's
# row index is emitted as the 'id' column next to 'POI/street'.
df.to_csv('submission.csv', index=True, index_label='id', columns=['POI/street'])

In [None]:
import os
# Change into ../working (relative to the current directory) so the relative
# path below is resolved there.
# NOTE(review): submission.csv was written before this chdir; the link only
# finds it if the previous working directory resolves to the same place.
os.chdir(r'../working')
from IPython.display import FileLink
# Last expression of the cell: displays a link to submission.csv.
FileLink(r'submission.csv')