# Predicting on test set

In [1]:
# Mount Google Drive so the data and model checkpoints under /content/drive are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install seqeval

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 6.5MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 21.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 31.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256

In [3]:
import re
import string
import pickle

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertForTokenClassification, BertTokenizerFast
from datasets import load_metric
from transformers import EarlyStoppingCallback

In [4]:
# Location of the test set on the mounted Drive
test_path = '/content/drive/MyDrive/shopee_code_league/test.csv'

# Read the CSV into a DataFrame; later cells use its 'raw_address' and 'id' columns
test_df = pd.read_csv(filepath_or_buffer=test_path)

In [5]:
# Show the first rows of the loaded test set as a sanity check
test_df.head()

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


In [6]:
# (rows, columns) of the test set
test_df.shape

(50000, 2)

# Preprocessing

## Define pretrained tokenizer


In [7]:
# Pretrained checkpoint whose tokenizer is used to encode the addresses
model_name = 'indobenchmark/indobert-base-p1'
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_name,
    do_lower_case=True,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229167.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




## Preparing data

Steps:
1. Split each raw address into individual words and punctuation marks, then apply the pretrained tokenizer
1. Convert the list of per-address encodings (dicts) into a single dict of lists
1. Perform padding
1. Create torch dataset

In [8]:
# 1. Split raw address into individual word/punctuation and apply pretrained tokenizer
# A "word" is a run of word characters; every other non-space character is its own token.
word_or_punct = re.compile(r"[\w]+|[^\s\w]")

tokenized_raw_addresses = []
for raw_address in test_df['raw_address']:
    words = word_or_punct.findall(raw_address)
    # The address is passed as a list of words, hence is_split_into_words=True
    tokenized_raw_addresses.append(tokenizer(words, is_split_into_words=True))

# Number of tokens per address, not counting the two special tokens
# ([cls] and [sep]); needed later to trim the model output.
tokens_len = [len(encoding['input_ids']) - 2 for encoding in tokenized_raw_addresses]

In [9]:
# 2. Convert list of dicts to dict
# Regroup the per-address encodings into one list per model input.
# The rows still have different lengths at this point.
encoding_keys = ('input_ids', 'attention_mask', 'token_type_ids')
X_dict = {
    key: [encoding[key] for encoding in tokenized_raw_addresses]
    for key in encoding_keys
}

In [10]:
# 3. Perform padding

# Fixed sequence length of every row after padding
MAX_SEQ_LEN = 100

# Pad every model input with 0 on the right, up to MAX_SEQ_LEN.
# truncating='post' matters for over-long rows: the pad_sequences default
# ('pre') would cut tokens from the START of the row, dropping the leading
# [CLS] token that the later trimming steps skip at position 0.
for k, v in X_dict.items():
    X_dict[k] = pad_sequences(
        v,
        maxlen=MAX_SEQ_LEN,
        value=0,
        dtype="long",
        padding='post',
        truncating='post',
    )

In [11]:
# 4. Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings, with optional labels.

    Args:
        encodings: mapping from input name (must include "input_ids") to an
            indexable collection holding one row per example.
        labels: optional indexable collection with one entry per example.
            Leave as None for inference.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus "labels" if labels were given)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None rather than for np.ndarray, so labels supplied as a
        # plain list are not silently dropped.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" entry."""
        return len(self.encodings["input_ids"])

# Wrap the padded test encodings; no labels are passed because this notebook only predicts
test_dataset = Dataset(X_dict)

# Model 1: POI/Street extraction model

## Making prediction

Post processing steps:
1. Get the argmax of the prediction
1. Trim each row to its real token length to drop the special tokens ([CLS]/[SEP]) and padding
1. Decode input ids into words

In [12]:
# load trained model
# Token classifier with 3 output labels
# (the decoding step below treats label 1 as POI and label 2 as street).
model_path = '/content/drive/MyDrive/shopee_code_league/best_pos_model/checkpoint-27000'
model = BertForTokenClassification.from_pretrained(model_path, num_labels=3)

# The Trainer is only used to run predict() on the test set
trained_pos_trainer = Trainer(model=model)

In [13]:
# Making prediction
# Run the POI/street model over the test set and keep only the predictions field
pos_output = trained_pos_trainer.predict(test_dataset)
predictions = pos_output.predictions

# Index of the highest score along the last axis, i.e. the predicted label id
predictions_argmax = np.argmax(predictions, axis=2)

In [14]:
# Trim length to remove unwanted tokens and padding
# Position 0 is skipped and only the next tokens_len[i] positions are kept.
pred_tokens = []
input_ids = []
for row_idx, row_pred in enumerate(predictions_argmax):
    keep = slice(1, tokens_len[row_idx] + 1)
    pred_tokens.append(row_pred[keep])
    input_ids.append(X_dict['input_ids'][row_idx][keep])

# Decode into words
# Label 1 marks POI tokens and label 2 marks street tokens; any other label is ignored.
y_pred = []
for row_tags, row_ids in zip(pred_tokens, input_ids):
    poi_ids = [token_id for tag, token_id in zip(row_tags, row_ids) if tag == 1]
    street_ids = [token_id for tag, token_id in zip(row_tags, row_ids) if tag == 2]
    # Output format is "<poi>/<street>"; either side may be empty
    y_pred.append(tokenizer.decode(poi_ids) + '/' + tokenizer.decode(street_ids))

In [17]:
# Add to df
# assign() returns a new frame, so test_df itself is left unchanged
submission_df = test_df.assign(y_pred=y_pred)

In [18]:
# Compare the first raw addresses with the decoded "poi/street" predictions
submission_df.head()

Unnamed: 0,id,raw_address,y_pred
0,0,s. par 53 sidanegara 4 cilacap tengah,/s. par
1,1,"angg per, baloi indah kel. lubuk baja",/angg per
2,2,"asma laun, mand imog,",asma laun/mand imog
3,3,"ud agung rej, raya nga sri wedari karanganyar",ud agung rej/raya nga
4,4,"cut mutia, 35 baiturrahman",/cut mutia


# Model 2: Abbrev model

## Making prediction

Post processing steps:
1. Get argmax
1. Trim length to remove unwanted tokens and padding
1. Collect the token ids that the model tagged as abbreviations and decode them into words
1. Using abbreviation mapping created from training data, create a list of expansion required for each instance in the test data

In [19]:
# load trained model
model_path = '/content/drive/MyDrive/shopee_code_league/abbrev_model/checkpoint-18000'
model =  BertForTokenClassification.from_pretrained(model_path, num_labels=2)
trained_abbrev_trainer = Trainer(model)

In [20]:
# Making prediction
# Run the abbreviation model over the same test set and keep only the predictions field
abbrev_output = trained_abbrev_trainer.predict(test_dataset)
predictions = abbrev_output.predictions

# Index of the highest score along the last axis, i.e. the predicted label id
predictions_argmax = np.argmax(predictions, axis=2)

In [21]:
# Trim every row of abbreviation predictions to its real tokens:
# skip position 0 and keep the next tokens_len[i] positions.
pred_tokens = [
    row_pred[1: tokens_len[row_idx] + 1]
    for row_idx, row_pred in enumerate(predictions_argmax)
]

In [22]:
# load abbreviation mapping learnt from training data
# NOTE: despite the .txt extension the file is read with pickle, hence binary mode.
# pickle.load can execute arbitrary code, so only load files you trust.
abbrev_dict_path = '/content/drive/MyDrive/shopee_code_league/submission_pred/abbrev_model/derived_abbrev_dict.txt'
with open(abbrev_dict_path, "rb") as abbrev_file:  # Unpickling
    derived_abbrev_dict = pickle.load(abbrev_file)

In [23]:
# find words needed to expand
token_ids_to_expand = []  # flagged token ids, one entry per row that has any
abbrev_to_expand = []
abbrev_expansion = []  # one {abbreviation: expansion} dict per test row
word_or_punct = re.compile(r"[\w]+|[^\s\w]")

for row_num, row in enumerate(pred_tokens):
    # Rows without any token tagged 1 need no expansion
    if 1 not in row:
        abbrev_expansion.append({})
        continue

    # ids aligned with `row`: exclude cls token and sep
    row_input_ids = X_dict['input_ids'][row_num][1: len(row) + 1]
    flagged_ids = [row_input_ids[pos] for pos, tag in enumerate(row) if tag == 1]
    token_ids_to_expand.append(flagged_ids)

    # decode - tokens include subwords
    decoded_abbrevs = tokenizer.decode(flagged_ids)
    candidates = word_or_punct.findall(decoded_abbrevs)

    # Keep only the candidates that have an entry in the learnt mapping
    abbrev_expansion.append({
        candidate: derived_abbrev_dict[candidate]
        for candidate in candidates
        if candidate in derived_abbrev_dict
    })

In [24]:
# Preview the expansion mappings collected for the last 10 test rows
# (an empty dict means no known abbreviation was found for that row)
abbrev_expansion[-10:]

[{'indon': 'indonesia', 'tre': 'trengg'},
 {},
 {},
 {},
 {},
 {},
 {},
 {'mart': 'martabak'},
 {},
 {}]

# Combining model prediction

Steps:
1. Apply abbreviation mapping predicted by abbrev model onto the prediction of the POI/street model

In [25]:
# perform abbrev expansion
def expand_abbreviations(text, expansions):
    """Replace every abbreviation in `text` with its expansion.

    Word-like abbreviations are replaced only where they stand as a whole
    word. A plain ``str.replace`` would also rewrite the same letters inside
    a longer word (e.g. 'tre' inside 'street').

    Args:
        text: predicted "poi/street" string.
        expansions: dict mapping abbreviation -> expansion for this row.

    Returns:
        The text with the abbreviations expanded, applied in dict order.
    """
    for abbrev, expansion in expansions.items():
        if re.fullmatch(r"\w+", abbrev):
            # Lookarounds instead of \b so the match is bounded by non-word
            # characters (spaces, '/', punctuation) or the string edges
            whole_word = r"(?<!\w)" + re.escape(abbrev) + r"(?!\w)"
            # A function replacement keeps any backslash in the expansion literal
            text = re.sub(whole_word, lambda _match, repl=expansion: repl, text)
        else:
            # Punctuation abbreviations have no word boundary to respect
            text = text.replace(abbrev, expansion)
    return text


poi_street_pred = list(submission_df['y_pred'])

# One output per test row; rows with an empty mapping come back unchanged
expanded_poi_street_pred = [
    expand_abbreviations(poi_street_pred[row_id], abbrev_expansion_row)
    for row_id, abbrev_expansion_row in enumerate(abbrev_expansion)
]

In [26]:
# Store the expanded predictions under the 'POI/street' column
submission_df['POI/street'] = expanded_poi_street_pred

# Keep only the two columns that go into the submission file
final_df = submission_df.loc[:, ['id', 'POI/street']]

In [27]:
# Check the first rows of the final submission frame
final_df.head()

Unnamed: 0,id,POI/street
0,0,/s. par
1,1,/angg per
2,2,asma laundry/mand imog
3,3,ud agung rejeki/raya nga
4,4,/cut mutia


In [None]:
# Write the submission file to the current working directory, without the row index
final_df.to_csv(path_or_buf='submission.csv', index=False)