# Import Library

In [1]:
import numpy as np
import pandas as pd

import spacy
from spacy.util import minibatch, compounding
import random
from pathlib import Path

# Data Pre-Processing

In [2]:
# read file
df = pd.read_csv("train.csv")

In [3]:
# split via /
df[['POI','street']] = df['POI/street'].str.split('/',expand=True)

In [4]:
df.head()

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/,,
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru


In [5]:
# Make the data into a format where it can be passed into spacy for training
df_copy = df.copy()
training_data = []

for i in df_copy.index:
    #entities = []
    if i < len(df_copy):
        raw_address = df_copy['raw_address'][i]
        label_street = 'street'
        label_POI = 'POI'
        
        if df_copy['street'][i] != '' and df_copy['street'][i] is not None:
            start_index_street = raw_address.find(df_copy['street'][i])
            end_index_street = start_index_street + len(df_copy['street'][i])
        else:
            start_index_street = 0
            end_index_street = 0
        
        if df_copy['POI'][i] != '' and df_copy['POI'][i] is not None:
            start_index_POI = raw_address.find(df_copy['POI'][i])
            end_index_POI = start_index_POI + len(df_copy['POI'][i])
        else:
            start_index_POI = 0
            end_index_POI = 0
        
        entity = [(start_index_POI, end_index_POI, label_POI), (start_index_street, end_index_street, label_street)]
        
 
        
        #print (df_copy['street'][i])
        #entities.append(entity)
        training_data.append((raw_address, {'entities': entity}))

In [6]:
training_data[0:40]

[('jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat',
  {'entities': [(0, 0, 'POI'), (0, 40, 'street')]}),
 ('aye, jati sampurna', {'entities': [(0, 0, 'POI'), (0, 0, 'street')]}),
 ('setu siung 119 rt 5 1 13880 cipayung',
  {'entities': [(0, 0, 'POI'), (5, 10, 'street')]}),
 ('toko dita, kertosono', {'entities': [(0, 9, 'POI'), (0, 0, 'street')]}),
 ('jl. orde baru', {'entities': [(0, 0, 'POI'), (0, 13, 'street')]}),
 ('raya samb gede, 299 toko bb kids',
  {'entities': [(20, 32, 'POI'), (0, 14, 'street')]}),
 ('kem mel raya, no 4 bojong rawalumbu rt 1 36 rawalumbu',
  {'entities': [(0, 0, 'POI'), (0, 12, 'street')]}),
 ('tela keuramat kuta alam', {'entities': [(0, 0, 'POI'), (0, 4, 'street')]}),
 ('gg. i wates magersari', {'entities': [(0, 0, 'POI'), (0, 5, 'street')]}),
 ('bunga ncole ix 2', {'entities': [(0, 0, 'POI'), (0, 14, 'street')]}),
 ('cikahuripan sd neg boj 02 klap boj, no 5 16877',
  {'entities': [(-1, 18, 'POI'), (26, 34, 'street')]}),
 ('yaya atohar,', 

# Building the Model

In [7]:
# Create the model
nlp = spacy.blank('en')  # create blank Language class
print("Created blank 'en' model")
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

Created blank 'en' model


In [8]:
# Add new entity labels to entity recognizer

LABEL = ['POI', 'street']

for i in LABEL:
    ner.add_label(i)
    
# Inititalizing optimizer
nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
optimizer = nlp.begin_training()


In [None]:
# Get names of other pipes to disable them during training to train only NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(20):
        random.shuffle(training_data)
        losses = {}
        batches = minibatch(training_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.60,
                       losses=losses)
        print('Losses', losses)

Losses {'ner': 677.0289769749318}
Losses {'ner': 351.63775604157865}
Losses {'ner': 314.8814412750272}
Losses {'ner': 297.9176560843053}
Losses {'ner': 286.7003926519788}
Losses {'ner': 277.6180079357415}
Losses {'ner': 276.82253913640835}
Losses {'ner': 273.05797386966077}
Losses {'ner': 272.240336209524}
Losses {'ner': 266.6980887021742}
Losses {'ner': 265.3995578565067}


In [None]:
# Save Model

# Test some data

In [None]:
# Test the trained model
test_text = 'ngupasan sur 33 gondomanan'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

In [None]:
# Test the trained model
test_text = 'sepa raya no 13 periuk rt 1 7 periuk'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

In [None]:
# Test the trained model
test_text = 'sd n 1 gumu ahmad yani,'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

In [None]:
# Test the trained model
test_text = 'mie ayam bintang, kali rejo wonosobo'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

In [None]:
# Test the trained model (testing data)
test_text = 's. par 53 sidanegara 4 cilacap tengah'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

In [None]:
# Test the trained model (testing data)
test_text = 'angg per, baloi indah kel. lubuk baja'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

In [None]:
# Test the trained model (testing data)
test_text = 'tebet barat tebet barat vi no 12 4 tebet'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    if ent.label_ == 'street':
        print("STREET")
    print(ent.label_, ent.text)
    print(type(ent.label_))
    print(type(ent.text))
    

# Export testing result into submission.csv

In [None]:
# read file
df_test = pd.read_csv("test.csv")

In [None]:
df_test.head()

In [None]:
df_test['POI'] = ''
df_test['street'] = ''

In [None]:
df_test.head()

In [None]:
for i in df_test.index:
    if i < len(df_test):
        raw_address = df_test['raw_address'][i]
        doc = nlp(raw_address)    
        
        for ent in doc.ents:
            if ent.label_ == 'POI':
                print(ent.label_, ent.text)
                df_test['POI'][i] = df_test['POI'][i] + ent.text
            if ent.label_ == 'street':
                print(ent.label_, ent.text)
                df_test['street'][i] = df_test['street'][i] + ent.text
            #print(ent.label_, ent.text)

In [None]:
df_test.head(20)

In [None]:
df_submission = df_test.copy()

In [None]:
df_submission.head(20)

In [None]:
df_submission['POI/street'] = df_submission[['POI', 'street']].agg('/'.join, axis=1)

In [None]:
del df_submission['raw_address']
del df_submission['street']
del df_submission['POI']

In [None]:
df_submission.head(20)

In [None]:
df_submission.to_csv('submission21.csv', index=False)