In [1]:
import spacy
from tqdm import tqdm
import warnings
# Ask spaCy to run on the GPU for everything that follows.
# NOTE(review): per the spaCy docs this raises when no GPU is available, so the
# notebook stops here on a CPU-only machine — confirm that is intended.
spacy.require_gpu()

In [23]:
import pandas as pd

# Read the three competition files (train, test, sample submission) in one pass.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/scl-2021-ds/train.csv",
        "../input/scl-2021-ds/test.csv",
        "../input/scl-2021-ds/sampleSubmission.csv",
    )
)

In [24]:
# Write working copies of both frames to the current directory.
# to_csv() is called with its defaults, so the row index is written as the first column.
for frame, csv_name in ((train, "shopee_train.csv"), (test, "shopee_test.csv")):
    frame.to_csv(csv_name)

In [4]:
# Preview the first rows of the training frame.
train.head()

In [5]:
# Preview the first rows of the test frame.
test.head()

In [6]:
# Preview the sample submission to see the expected output format.
ss.head()

## Split the label string into two separate tasks (POI & street)

In [25]:
# Split each "POI/street" label at the first "/" into three columns:
# 0 = text before the slash, 1 = the slash itself, 2 = text after it.
splitstring = train["POI/street"].str.partition(sep="/", expand=True)

In [26]:
# Attach the two halves of the label as their own columns (POI first, then street).
train = train.assign(POI=splitstring[0], street=splitstring[2])

In [27]:
# Inspect the first 11 rows to check the new POI / street columns.
train.head(11)

In [9]:
# Drop rows whose label is just "/" (neither a POI nor a street is annotated).
# The explicit .copy() makes the result an independent frame, so the columns that
# later cells add to `train` do not trigger pandas' SettingWithCopyWarning.
train = train[train["POI/street"] != '/'].copy()

In [10]:
import json

# Tiny sample inputs for the smoke-test call at the bottom of this cell.
a = "aaaa bbb cc"
b = "aaaaa/bb"


def _find_spans(text, needle):
    """Return every (start, end) character span where `needle` occurs in `text`.

    An empty `needle` yields no spans. Occurrences that overlap each other are
    all reported, in order of their start offset.
    """
    spans = []
    if not needle:
        return spans
    start = text.find(needle)
    while start != -1:
        spans.append((start, start + len(needle)))
        start = text.find(needle, start + 1)
    return spans


def jsonTransformer(raw, target):
    """Turn one address and its "POI/street" label into a spaCy-style NER example.

    Parameters
    ----------
    raw : str
        The raw address text.
    target : str
        The label in "POI/street" form; either side may be empty.

    Returns
    -------
    tuple
        ``(raw, {"entities": [...]})`` where each entity is a
        ``(start, end, label)`` tuple with ``label`` in ``{"POI", "street"}``.
        A part that is empty or does not occur verbatim in ``raw`` is omitted.

    Notes
    -----
    When both parts occur in ``raw``, the first pair of occurrences that do not
    overlap is used (the earliest POI occurrence is preferred). Previously only
    the first occurrence of each part was considered and both entities were
    dropped whenever those two overlapped, even if a non-overlapping occurrence
    existed later in the address. If no non-overlapping pair exists the entity
    list is still empty, as before.
    """
    poi_text, _, street_text = target.partition('/')
    poi_spans = _find_spans(raw, poi_text.strip())
    street_spans = _find_spans(raw, street_text.strip())

    if poi_spans and street_spans:
        # Two half-open spans are disjoint when one ends before the other starts.
        candidates = (
            (poi, street)
            for poi in poi_spans
            for street in street_spans
            if poi[1] <= street[0] or street[1] <= poi[0]
        )
        pair = next(candidates, None)
        if pair is None:
            entities = []
        else:
            entities = [pair[0] + ("POI",), pair[1] + ("street",)]
    elif poi_spans:
        entities = [poi_spans[0] + ("POI",)]
    elif street_spans:
        entities = [street_spans[0] + ("street",)]
    else:
        entities = []

    return raw, {"entities": entities}


jsonTransformer(a, b)

In [11]:
# Build the training set: one (text, {"entities": [...]}) tuple per labelled address.
# Zipping the two columns avoids building a Series per row as iterrows() does,
# and passing `total` lets tqdm show a real progress bar instead of a bare counter.
TRAIN_DATA = [
    jsonTransformer(raw_address, label.strip())
    for raw_address, label in tqdm(
        zip(train["raw_address"], train["POI/street"]), total=len(train)
    )
]
    


## Train Model

In [19]:
# Path of a saved pipeline to start from; None means "start from scratch".
model = None

# Pick the starting pipeline: an empty English one, or the saved one at `model`.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

# Get a handle on the NER component, appending a new one if the pipeline has none.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

# Register the two entity types this notebook predicts.
LABEL = ['POI', 'street']
for label in LABEL:
    ner.add_label(label)

# A from-scratch run calls begin_training(); a loaded pipeline only gets a new
# optimizer created from its NER component.
optimizer = nlp.begin_training() if model is None else nlp.entity.create_optimizer()

In [20]:

from spacy.gold import GoldParse
from spacy.util import minibatch, compounding
import random

# Train only the NER component: every other pipe is disabled for the duration.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():  # only train NER
    # Silence all warnings raised inside this block.
    warnings.filterwarnings("ignore")
    for itn in range(50):
        # Reshuffle every epoch so batches differ between iterations.
        random.shuffle(TRAIN_DATA)
        losses = {}
        # Batch size follows compounding(4., 32., 1.001): it starts at 4 and
        # grows towards 32.
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in tqdm(batches):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer,
                       drop=0.35, losses=losses)
        # Include the iteration number (as the continued-training cell does) so
        # the 50 loss lines can be told apart.
        print('Losses', itn, " ", losses)


## Save Model

In [None]:
# Save the trained pipeline to disk.
# `new_model_name` is only stored in the pipeline's meta; the files themselves are
# written to `output_dir`.
# NOTE(review): the continued-training cell later also saves to "./" — presumably
# that overwrites what is written here; confirm whether both models are needed.
output_dir = "./"
new_model_name = "spacy_50iter"


nlp.meta['name'] = new_model_name  # rename model
nlp.to_disk(output_dir)
print("Saved model to", output_dir)


## Load Model

In [12]:
# Path of the previously trained pipeline to resume from (None would start from scratch).
model = "../input/ner-spacy-model/NER/100iter"

# Pick the starting pipeline: an empty English one, or the saved one at `model`.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

# Get a handle on the NER component, appending a new one if the pipeline has none.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

# Register the two entity types this notebook predicts.
LABEL = ['POI', 'street']
for label in LABEL:
    ner.add_label(label)

# A from-scratch run calls begin_training(); a loaded pipeline only gets a new
# optimizer created from its NER component.
optimizer = nlp.begin_training() if model is None else nlp.entity.create_optimizer()

## Continue Training

In [None]:
import warnings
from spacy.gold import GoldParse
from spacy.util import minibatch, compounding
import random

# Continue training the loaded pipeline for 30 more passes, updating only the NER
# component, then save the result.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    # Silence all warnings raised inside this block.
    warnings.filterwarnings("ignore")
    for itn in range(30):
        losses = {}
        random.shuffle(TRAIN_DATA)
        batch_sizes = compounding(4., 32., 1.001)
        for batch in tqdm(minibatch(TRAIN_DATA, size=batch_sizes)):
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                sgd=optimizer,
                drop=0.35,
                losses=losses,
            )
        print('Losses', itn," ", losses)


# Store the new name in the pipeline meta and write the pipeline to `output_dir`.
new_model_name = "spacy_130iter"
output_dir = "./"

nlp.meta['name'] = new_model_name
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
        

## Check Model

In [28]:
# Run the trained pipeline over every training address and collect, per row, the
# predicted street and POI text ("" when no entity of that type is found).
# nlp.pipe() processes the texts in batches instead of one nlp() call per row, and
# the predictions are gathered in lists so each column is assigned once rather than
# cell by cell with .at.
street_preds = []
poi_preds = []

for doc2 in tqdm(nlp.pipe(train["raw_address"].tolist()), total=len(train)):
    street_text = ""
    poi_text = ""
    for ent in doc2.ents:
        # If several entities share a label, the last one wins (unchanged behaviour).
        if ent.label_ == "street":
            street_text = ent.text
        elif ent.label_ == "POI":
            poi_text = ent.text
    street_preds.append(street_text)
    poi_preds.append(poi_text)

train["street_pred"] = street_preds
train["POI_pred"] = poi_preds

train.head()

In [29]:
# Rebuild the "POI/street" label format from the two predicted parts.
train['POI/street_pred'] = train["POI_pred"].add("/").add(train["street_pred"])
train.head()

In [30]:
import numpy as np
# Exact-match accuracy on the training set: the fraction of rows whose predicted
# "POI/street" string equals the label.
arrActual = train['POI/street'].to_numpy()
arrPred = train['POI/street_pred'].to_numpy()
correct = arrActual == arrPred
np.mean(correct)

In [31]:
# Collect the rows where the prediction differs from the label, keeping only the
# columns needed to compare them by eye.
cols = ['raw_address', 'POI/street', 'POI/street_pred']
mismatch = train["POI/street"].ne(train["POI/street_pred"])
falsedf = train.loc[mismatch, cols]
falsedf.head()

In [32]:
# Wrong rows where the model predicted nothing at all (prediction is just "/").
falsedf.loc[falsedf['POI/street_pred'].eq("/")]

In [33]:
# Keep only the wrong rows where the model did predict something, then inspect a sample.
falsedf = falsedf.loc[falsedf['POI/street_pred'].ne("/")]
falsedf.head(20)

In [40]:
# Word counts of the labelled and predicted street / POI strings.
# The columns are named countword_*, but apply(len) returned the number of
# characters. Splitting on whitespace and taking the list length gives the
# number of words; an empty string yields 0, and missing values are counted as 0.
for source_col in ("street", "POI", "street_pred", "POI_pred"):
    train["countword_" + source_col] = (
        train[source_col].str.split().str.len().fillna(0).astype(int)
    )

train.head()

In [51]:
import seaborn as sns

# Distribution of countword_street over rows where it is non-zero.
sns.displot(data=train.loc[train['countword_street'].ne(0)], x="countword_street", log=True)

In [52]:
# Distribution of countword_street_pred over rows where it is non-zero.
sns.displot(data=train.loc[train['countword_street_pred'].ne(0)], x="countword_street_pred", log=True)

In [55]:
# Distribution of countword_POI over rows where it is non-zero.
sns.displot(data=train.loc[train['countword_POI'].ne(0)], x="countword_POI", log=True)

In [56]:
# Distribution of countword_POI_pred over rows where it is non-zero.
sns.displot(data=train.loc[train['countword_POI_pred'].ne(0)], x="countword_POI_pred", log=True)

## Create Prediction

In [None]:
# Run the trained pipeline over every test address and collect, per row, the
# predicted street and POI text ("" when no entity of that type is found).
# nlp.pipe() processes the texts in batches instead of one nlp() call per row, and
# the predictions are gathered in lists so each column is assigned once rather than
# cell by cell with .at.
street_preds = []
poi_preds = []

for doc2 in tqdm(nlp.pipe(test["raw_address"].tolist()), total=len(test)):
    street_text = ""
    poi_text = ""
    for ent in doc2.ents:
        # If several entities share a label, the last one wins (unchanged behaviour).
        if ent.label_ == "street":
            street_text = ent.text
        elif ent.label_ == "POI":
            poi_text = ent.text
    street_preds.append(street_text)
    poi_preds.append(poi_text)

test["street_pred"] = street_preds
test["POI_pred"] = poi_preds

test.head(1000)

## Submission

In [None]:
# Build the submission column in the required "POI/street" format.
test["POI/street"] = test["POI_pred"].add("/").add(test["street_pred"])
test.head()

In [None]:
# Write the submission file with only the id and predicted label, without the index.
# NOTE(review): the file name is spelled "submoission"; left unchanged in case
# anything refers to it by that name.
test.loc[:, ["id", "POI/street"]].to_csv("submoission130iter.csv", index=False)