# 2. Передобробка даних

In [2]:
import pycld2 as cld2
import pandas as pd
import numpy as np
from tqdm import tqdm
from tokenize_uk import tokenize_uk
from utils import load_1551, load_stopwords, run_experiment, dump_data
from vectors import vectorize
import warnings
import re

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. SettingWithCopyWarning)
# raised by the column assignments in later cells — consider narrowing it.
warnings.filterwarnings('ignore')
# Register tqdm's pandas integration; the `progress_apply` calls below depend on it.
tqdm.pandas()

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

def run_experiment_v2(name, x, y):
    """Run the experiment ``name`` on ``x``/``y`` with the notebook's fixed k-NN classifier.

    A ``KNeighborsClassifier`` with 10 neighbours, cosine metric and ``n_jobs=-1`` is
    built and handed to ``run_experiment`` together with the data; whatever
    ``run_experiment`` returns is returned unchanged.
    """
    knn_params = {'n_neighbors': 10, 'metric': 'cosine', 'n_jobs': -1}
    return run_experiment(name, KNeighborsClassifier(**knn_params), x, y)

In [5]:
# Load the dataset through the project helper `load_1551`; later cells read its
# 'text' and 'category' columns.
data = load_1551()

100%|██████████| 127329/127329 [01:05<00:00, 1947.21it/s]


## 2.1 Фільтрація мови

In [6]:
def detect_lang(text):
    """Return the language code cld2 detects for ``text``, or ``'error'`` on failure.

    Non-printable characters are removed before detection because cld2 fails on
    some of them (see the linked issue). Any ordinary exception (wrong input type,
    cld2 error) yields the sentinel string ``'error'``. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately NOT caught, so a long ``progress_apply`` run
    can still be interrupted.
    """
    try:
        # BUG: https://github.com/mikemccand/chromium-compact-language-detector/issues/22
        sanitized = ''.join(ch for ch in text if ch.isprintable())
        # detect() returns a tuple whose third item lists the detected languages;
        # take the code (index 1) of the first entry.
        return cld2.detect(sanitized)[2][0][1]
    except Exception:
        return 'error'
    
# Detect the language of every message and store the code in a new 'lang' column
# ('error' marks rows where detection failed).
data['lang'] = data['text'].progress_apply(detect_lang)

100%|██████████| 126789/126789 [00:17<00:00, 7158.64it/s]


In [7]:
# Show how many messages were assigned each language code.
data['lang'].value_counts()

uk    69407
ru    52633
un     4586
sr      147
uz        8
en        4
bg        3
mk        1
Name: lang, dtype: int64

In [8]:
# Keep only messages detected as Ukrainian, then drop every category that has fewer
# than 50 such messages.
# .copy() makes uk_data an independent frame: later cells assign new/overwritten
# columns on it, and without the copy those writes target an object derived from
# `data` (pandas' chained-assignment situation, currently hidden by the warning filter).
uk_data = (
    data[data['lang'] == 'uk']
    .groupby('category')
    .filter(lambda group: len(group) >= 50)
    .copy()
)

In [9]:
# Eyeball five randomly chosen Ukrainian messages.
# No random_state is passed, so a different sample is printed on every run.
print(uk_data.text.sample(n=5).values)

['В другому під’їзді будинку не працює ліфт'
 'Прошу пофарбувати двері під’їзду 4 з зовнішньої сторони. Низ дверей ржавіє.'
 'Доброго дня! В моєму будинку(Ольжича 11) станом на 19.10 досі немає опалення.\r\nТелефонували неодноразово в диспетчерську Шевченківського району, там кажуть, що наш будинок підключено, але по факту це не є так. Я опитала сусідів з сусідніх підїздів, у них опалення немає(тобто зрозуміло, що проблема не у нас, а у всіх).\r\nЯ телефонувала в кол-центр Київенерго, там кажуть, що до них не поступало жодної заявки, що по нашему будинку є якісь проблеми. \r\nБудь-ласка, допоможіть нам у вирішенні цієї проблеми, вдома реально дуже холодно. Я задовбалась одягати вологий одяг, купляти ліки пачками і спати під двома ковдрами.\r\nБуду вдячна за оперативне втручання.'
 'Провести дератизацію в підвальному приміщенні.'
 'Вимагаю демонтувати мегаборд(900м.кв.) за адресою Наддніпрянське шосе між залізничним мостом та мостом ”Патона” праворуч в напрямку Подолу. Відписки надсилат

In [38]:
# Clean URLs: delete every run of non-whitespace characters that starts with "http".
uk_data['text'] = uk_data['text'].progress_apply(lambda x: re.sub(r'http\S+', '', x))

100%|██████████| 63101/63101 [00:00<00:00, 116477.06it/s]


In [39]:
# Clean HTML: strip tags (<...>) and character references (&name;, &#123;, &#x1f;).
# IGNORECASE is required because entity names and hex digits may be upper-case
# (e.g. &Auml;, &#x1F4A9;, &#X1F;); the lower-case-only character classes alone
# would leave those references in the text.
html_pattern = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)
uk_data['text'] = uk_data['text'].progress_apply(lambda x: html_pattern.sub('', x))

100%|██████████| 63101/63101 [00:01<00:00, 47732.22it/s]


In [42]:
from sklearn.preprocessing import LabelEncoder

# Encode the category labels as integers; the fitted encoder stays available as `le`.
le = LabelEncoder()

# 'encoded_category' holds the integer id of each row's 'category'.
uk_data['encoded_category'] = le.fit_transform(uk_data['category'])

In [43]:
# Split every cleaned message into word tokens and keep them in a 'tokens' column.
uk_data['tokens'] = uk_data['text'].progress_apply(tokenize_uk.tokenize_words)

100%|██████████| 63101/63101 [00:22<00:00, 2781.94it/s]


In [44]:
# Short aliases reused by the experiments below: per-message token lists and integer targets.
tokens, categories = uk_data.tokens, uk_data.encoded_category

In [45]:
# Turn each token list into a vector with the project's `vectorize` helper
# (the results are stacked with np.vstack in the next cell).
vectors = tokens.progress_apply(vectorize)

100%|██████████| 63101/63101 [02:54<00:00, 362.32it/s]


In [46]:
# Stack the per-message vectors row-wise into one feature matrix.
x = np.vstack(vectors)
# Flatten the encoded categories into a plain NumPy array of targets (reused by later experiments).
y = np.hstack(categories)
# First experiment: language filtering only, no stop-word removal or lemmatization.
run_experiment_v2('uk', x, y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,197,198,199,200,201,202,203,accuracy,macro avg,weighted avg
precision,0.305085,0.0,0.144796,0.029412,0.294118,0.2,0.107143,0.214286,0.47619,0.255396,...,0.625,0.285714,0.45,0.304348,0.263158,0.125,0.0,0.357136,0.25385,0.33913
recall,0.151261,0.0,0.424779,0.050633,0.416667,0.136364,0.09375,0.166667,0.625,0.696078,...,0.172414,0.090909,0.107143,0.159091,0.080645,0.047619,0.0,0.357136,0.173767,0.357136
f1-score,0.202247,0.0,0.215973,0.037209,0.344828,0.162162,0.1,0.1875,0.540541,0.373684,...,0.27027,0.137931,0.173077,0.208955,0.123457,0.068966,0.0,0.357136,0.186325,0.326044
support,119.0,14.0,226.0,79.0,24.0,22.0,32.0,18.0,32.0,204.0,...,29.0,22.0,84.0,220.0,62.0,21.0,22.0,0.357136,20824.0,20824.0


## 2.2 Видалення стоп-слів

In [17]:
# Stop-word collection loaded once via the project helper; remove_stopwords tests membership in it.
# NOTE(review): if load_stopwords returns a list, every `in` check is a linear scan —
# confirm it is a set (or wrap it in one).
STOPWORDS = load_stopwords()

def remove_stopwords(tokens):
    """Return a new list with the items of ``tokens`` that are not in ``STOPWORDS``.

    The relative order of the surviving tokens is preserved; matching is exact
    (no case folding is applied here).
    """
    kept = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

# Drop stop words from every message's token list.
no_stopwords_tokens = tokens.progress_apply(remove_stopwords)

100%|██████████| 63101/63101 [00:12<00:00, 5219.66it/s]


In [18]:
# Re-vectorize the stop-word-free tokens and rerun the experiment; `y` is reused from the first experiment.
x = np.vstack(no_stopwords_tokens.apply(vectorize))
run_experiment_v2('uk+stopwords', x, y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,197,198,199,200,201,202,203,accuracy,macro avg,weighted avg
precision,0.365854,0.0,0.157983,0.073684,0.277778,0.105263,0.103448,0.076923,0.512195,0.295011,...,0.625,0.142857,0.357143,0.460784,0.153846,0.3,0.0,0.373175,0.276645,0.354933
recall,0.252101,0.0,0.415929,0.088608,0.416667,0.090909,0.09375,0.055556,0.65625,0.666667,...,0.172414,0.045455,0.119048,0.213636,0.064516,0.142857,0.0,0.373175,0.180914,0.373175
f1-score,0.298507,0.0,0.228989,0.08046,0.333333,0.097561,0.098361,0.064516,0.575342,0.409023,...,0.27027,0.068966,0.178571,0.291925,0.090909,0.193548,0.0,0.373175,0.194965,0.340721
support,119.0,14.0,226.0,79.0,24.0,22.0,32.0,18.0,32.0,204.0,...,29.0,22.0,84.0,220.0,62.0,21.0,22.0,0.373175,20824.0,20824.0


## 2.3 Використання лем

In [19]:
import pymorphy2
# Morphological analyzer configured for Ukrainian; `lemmatize` below reads it as a global.
morph = pymorphy2.MorphAnalyzer(lang='uk')

# Memo of token -> lemma. The same word forms recur across messages and
# morph.parse is the expensive step, so each distinct token is analysed only once.
_LEMMA_CACHE = {}

def lemmatize(tokens):
    """Return the normal form (lemma) of every token in ``tokens``, in order.

    Each token is replaced by the ``normal_form`` of the first parse returned by
    ``morph.parse``. Results are memoized per token in ``_LEMMA_CACHE``, so the
    output is the same as parsing every occurrence individually, just faster.
    """
    lemmas = []
    for token in tokens:
        lemma = _LEMMA_CACHE.get(token)
        if lemma is None:  # not cached yet (an empty-string lemma is still a valid hit)
            lemma = morph.parse(token)[0].normal_form
            _LEMMA_CACHE[token] = lemma
        lemmas.append(lemma)
    return lemmas

# Lemmatize the stop-word-free token lists (the slowest preprocessing step in this notebook).
lemmatized_tokens = no_stopwords_tokens.progress_apply(lemmatize)

100%|██████████| 63101/63101 [09:54<00:00, 106.11it/s]


In [20]:
# Re-vectorize the lemmatized tokens and rerun the experiment; `y` is reused from the first experiment.
x = np.vstack(lemmatized_tokens.apply(vectorize))
run_experiment_v2('uk+stopwords+lemma', x, y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,197,198,199,200,201,202,203,accuracy,macro avg,weighted avg
precision,0.333333,0.0,0.157969,0.034783,0.419355,0.1875,0.041667,0.222222,0.44898,0.284314,...,0.545455,0.6,0.333333,0.358779,0.142857,0.25,0.076923,0.373031,0.274984,0.354413
recall,0.12605,0.0,0.495575,0.050633,0.541667,0.136364,0.03125,0.111111,0.6875,0.710784,...,0.206897,0.136364,0.047619,0.213636,0.016129,0.047619,0.045455,0.373031,0.184948,0.373031
f1-score,0.182927,0.0,0.239572,0.041237,0.472727,0.157895,0.035714,0.148148,0.54321,0.406162,...,0.3,0.222222,0.083333,0.267806,0.028986,0.08,0.057143,0.373031,0.199507,0.342093
support,119.0,14.0,226.0,79.0,24.0,22.0,32.0,18.0,32.0,204.0,...,29.0,22.0,84.0,220.0,62.0,21.0,22.0,0.373031,20824.0,20824.0


In [50]:
# Pair each lemmatized token list ('data') with its un-encoded category label ('target');
# both Series come from uk_data, so pandas aligns them on the shared index.
output = pd.DataFrame({ 'data': lemmatized_tokens, 'target': uk_data['category'] })

# Persist the cleaned dataset under the name 'cleaned' via the project's dump_data helper.
dump_data('cleaned', output)

**Далі:** [використання doc2vec](03-vectors-doc2vec.ipynb)