In [3]:
import pandas as pd
import os
import json

import numpy as np

from nltk import word_tokenize
from nltk.tag import pos_tag

import spacy

import sklearn_crfsuite

## Load models

We will use both simulataneously. This is possible, and probably an overkill, since the two languages are quite similar in regards to location naming

In [24]:
model_uk = spacy.load("uk_core_news_sm")
model_ru = spacy.load("ru_core_news_sm")

## Spans to Token Labels

In [25]:
def string_to_indices(s):
    if len(s) < 3:
        return []
    ints = [int(l[:-1]) if i%2 == 1 else int(l[1:]) for i,l in enumerate(s[1:-1].split(", "))]
    return [ints[i:i+2] for i in range(0,len(ints),2)]

In [29]:
dataset_dir = "./nlp-telegram-locations-extractions"
ru_geo_dataset_path = os.path.join(dataset_dir, "ru_geo_dataset.csv")
uk_geo_dataset_path = os.path.join(dataset_dir, "uk_geo_dataset.csv")

## Extract Features
- label
- token uk, pos tag uk for uk train, run for ru train
- pos tag ru for test
- token uk, pos tag uk - from Ukrainian model
- token ru, pos tag ru - from russian model

In [4]:
def to_data(df,nlp):
    Samples = df['text'].values
    Markers = df["loc_markers"].values
    
    DATA = []
    
    for i,(sample,markers,pos) in enumerate(zip(Samples,Markers,nlp.pipe(Samples))):
        if i%1000 == 0:
            print(i,len(Samples))
        marker_indices = string_to_indices(markers)
    
        if marker_indices:
            lbl = []
            s = 0
            for ms,me in marker_indices:
                lbl.append(((s,ms),0))
                lbl.append(((ms,me),1))
                s = me
            if marker_indices[0][0] == 0:
                lbl.pop(0)
            if lbl[-1][0][1] != len(sample):
                lbl.append(((s,len(sample)),0))

            label = []
            for i,((s,e),k) in enumerate(lbl):
                t = nlp(sample[s:e])
                if i == 0:
                    label += ["O"]*len(t)
                else:
                    #label += ["B"]+["B"]*(len(t)-1)
                    label += ["B"]*len(t)
        else:
            #tokenized = word_tokenize(sample)
            label = ["O"]*len(pos)
        
        try:
            assert(len(label)==len(word_tokenize(sample)))
            DATA.append([(l,p.text,p.pos_) for l,p in zip(label,pos)])
        except AssertionError as e:
            continue

    return DATA



In [6]:
df_uk = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: (x!=0 and x < 20000) or x > 70000)
df_ru = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: (x!=0 and x < 20000) or x > 70000)

data_uk = to_data(df_uk, model_uk)
data_ru = to_data(df_ru, model_ru)

import json
with open("data_uk_train.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_uk))
with open("data_ru_train.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_ru))

0 50001
1000 50001
2000 50001
3000 50001
4000 50001
5000 50001
6000 50001
7000 50001
8000 50001
9000 50001
10000 50001
11000 50001
12000 50001
13000 50001
14000 50001
15000 50001
16000 50001
17000 50001
18000 50001
19000 50001
20000 50001
21000 50001
22000 50001
23000 50001
24000 50001
25000 50001
26000 50001
27000 50001
28000 50001
29000 50001
30000 50001
31000 50001
32000 50001
33000 50001
34000 50001
35000 50001
36000 50001
37000 50001
38000 50001
39000 50001
40000 50001
41000 50001
42000 50001
43000 50001
44000 50001
45000 50001
46000 50001
47000 50001
48000 50001
49000 50001
50000 50001
0 50001
1000 50001
2000 50001
3000 50001
4000 50001
5000 50001
6000 50001
7000 50001
8000 50001
9000 50001
10000 50001
11000 50001
12000 50001
13000 50001
14000 50001
15000 50001
16000 50001
17000 50001
18000 50001
19000 50001
20000 50001
21000 50001
22000 50001
23000 50001
24000 50001
25000 50001
26000 50001
27000 50001
28000 50001
29000 50001
30000 50001
31000 50001
32000 50001
33000 50001
34000 

In [9]:
df_uk_test = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: x > 20000)
df_ru_test = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: x > 20000)

#df_uk_test = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: x > 2000)
#df_ru_test = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: x > 2000)

data_uk_test = to_data(df_uk_test, model_ru)
data_ru_test = to_data(df_ru_test, model_ru)

import json
with open("data_uk_test.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_uk_test))
with open("data_ru_test.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_ru_test))

0 20000
1000 20000
2000 20000
3000 20000
4000 20000
5000 20000
6000 20000
7000 20000
8000 20000
9000 20000
10000 20000
11000 20000
12000 20000
13000 20000
14000 20000
15000 20000
16000 20000
17000 20000
18000 20000
19000 20000
0 20000
1000 20000
2000 20000
3000 20000
4000 20000
5000 20000
6000 20000
7000 20000
8000 20000
9000 20000
10000 20000
11000 20000
12000 20000
13000 20000
14000 20000
15000 20000
16000 20000
17000 20000
18000 20000
19000 20000


In [13]:
data_ru[17]

[['O', '«', 'PUNCT'],
 ['O', 'Она', 'PRON'],
 ['O', 'всегда', 'ADV'],
 ['O', 'была', 'AUX'],
 ['O', 'уставшей', 'VERB'],
 ['O', ',', 'PUNCT'],
 ['O', 'периодически', 'ADV'],
 ['O', 'болела', 'VERB'],
 ['O', 'простудой', 'NOUN'],
 ['O', ',', 'PUNCT'],
 ['O', 'а', 'CCONJ'],
 ['O', 'на', 'ADP'],
 ['O', 'ее', 'DET'],
 ['O', 'ногах', 'NOUN'],
 ['O', 'вечно', 'ADV'],
 ['O', 'появлялись', 'VERB'],
 ['O', 'синяки', 'NOUN'],
 ['O', '.', 'PUNCT']]

In [17]:
data_uk[17]

[['O', 'Луки', 'NOUN'],
 ['O', ',', 'PUNCT'],
 ['O', 'ідея', 'NOUN'],
 ['O', 'якого', 'DET'],
 ['O', 'належить', 'VERB'],
 ['O', 'Владиці', 'NOUN'],
 ['O', 'Володимиру', 'PROPN'],
 ['O', '(', 'PUNCT'],
 ['B', 'Війтишину', 'PROPN'],
 ['B', ')', 'PUNCT'],
 ['B', '.', 'PUNCT']]

## Use both ru and ua pos tags

In [30]:
def to_data_multiple(df):
    Samples = df['text'].values
    Markers = df["loc_markers"].values
    
    DATA = []
    
    for i,(sample,markers,pos_uk,pos_ru) in enumerate(zip(Samples,Markers,model_uk.pipe(Samples),model_ru.pipe(Samples))):
        if i%1000 == 0:
            print(i,len(Samples))
        marker_indices = string_to_indices(markers)
    
        if marker_indices:
            lbl = []
            s = 0
            for ms,me in marker_indices:
                lbl.append(((s,ms),0))
                lbl.append(((ms,me),1))
                s = me
            if marker_indices[0][0] == 0:
                lbl.pop(0)
            if lbl[-1][0][1] != len(sample):
                lbl.append(((s,len(sample)),0))

            label = []
            tokenized = []
            pos = []
            for i,((s,e),k) in enumerate(lbl):
                t = word_tokenize(sample[s:e])
                if i == 0:
                    label += ["O"]*len(t)
                else:
                    #label += ["B"]+["B"]*(len(t)-1)
                    label += ["B"]*len(t)
                tokenized += t
        else:
            tokenized = word_tokenize(sample)
            label = ["O"]*len(tokenized)
        
        try:
            assert(len(label)==len(word_tokenize(sample)))
            DATA.append([(l,pu.text,pu.pos_,pr.text,pr.pos_) for t,l,pu,pr in zip(tokenized,label,pos_uk,pos_ru)])
        except AssertionError as e:
            continue

    return DATA



In [31]:
df_uk = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: (x!=0 and x < 20000) or x > 70000)
df_ru = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: (x!=0 and x < 20000) or x > 70000)

data_uk = to_data_multiple(df_uk)
data_ru = to_data_multiple(df_ru)

import json
with open("data_uk_train_multiple.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_uk))
with open("data_ru_train_multiple.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_ru))

0 50001
1000 50001
2000 50001
3000 50001
4000 50001
5000 50001
6000 50001
7000 50001
8000 50001
9000 50001
10000 50001
11000 50001
12000 50001
13000 50001
14000 50001
15000 50001
16000 50001
17000 50001
18000 50001
19000 50001
20000 50001
21000 50001
22000 50001
23000 50001
24000 50001
25000 50001
26000 50001
27000 50001
28000 50001
29000 50001
30000 50001
31000 50001
32000 50001
33000 50001
34000 50001
35000 50001
36000 50001
37000 50001
38000 50001
39000 50001
40000 50001
41000 50001
42000 50001
43000 50001
44000 50001
45000 50001
46000 50001
47000 50001
48000 50001
49000 50001
50000 50001
0 50001
1000 50001
2000 50001
3000 50001
4000 50001
5000 50001
6000 50001
7000 50001
8000 50001
9000 50001
10000 50001
11000 50001
12000 50001
13000 50001
14000 50001
15000 50001
16000 50001
17000 50001
18000 50001
19000 50001
20000 50001
21000 50001
22000 50001
23000 50001
24000 50001
25000 50001
26000 50001
27000 50001
28000 50001
29000 50001
30000 50001
31000 50001
32000 50001
33000 50001
34000 

In [32]:
import json
with open("data_uk_train_multiple.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_uk))
with open("data_ru_train_multiple.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_ru))

In [33]:
df_uk_test = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: x > 20000)
df_ru_test = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: x > 20000)

data_uk_test = to_data_multiple(df_uk_test)
data_ru_test = to_data_multiple(df_ru_test)

0 20000
1000 20000
2000 20000
3000 20000
4000 20000
5000 20000
6000 20000
7000 20000
8000 20000
9000 20000
10000 20000
11000 20000
12000 20000
13000 20000
14000 20000
15000 20000
16000 20000
17000 20000
18000 20000
19000 20000
0 20000
1000 20000
2000 20000
3000 20000
4000 20000
5000 20000
6000 20000
7000 20000
8000 20000
9000 20000
10000 20000
11000 20000
12000 20000
13000 20000
14000 20000
15000 20000
16000 20000
17000 20000
18000 20000
19000 20000


In [34]:
import json
with open("data_uk_test_multiple.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_uk_test))
with open("data_ru_test_multiple.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_ru_test))