# 2. Передобробка даних

In [None]:
import pycld2 as cld2
import pandas as pd
import numpy as np
import spacy
from tokenize_uk import tokenize_uk
from utils import load_1551, load_stopwords

In [32]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

LOG = []

def run_model(x, y):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    le = LabelEncoder()

    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.fit_transform(y_test)

    neigh = KNeighborsClassifier(n_neighbors=10, metric='cosine', n_jobs=-1)
    neigh.fit(X_train, y_train_encoded)

    y_pred = neigh.predict(X_test)
    result = pd.DataFrame(classification_report(y_test_encoded, y_pred, output_dict=True))
    LOG.append(result)
    
    return result

In [3]:
nlp = spacy.load('/tmp/uk_vectors')

def vec(text):
    return nlp(text)[0].vector

def vectorize(text):
    v = vec('unk')
    
    tokens = tokenize_uk.tokenize_words(text)
    
    for t in tokens:        
        v += vec(t)
            
    v /= len(tokens)

    return v

In [4]:
data = load_1551()

100%|██████████| 127329/127329 [01:18<00:00, 1612.02it/s]


## 2.1 Фільтрація мови

In [5]:
def detect_lang(text):
    try:
        # BUG: https://github.com/mikemccand/chromium-compact-language-detector/issues/22
        sanitized = ''.join(x for x in text if x.isprintable())
        return cld2.detect(sanitized)[2][0][1]
    except:
        return 'error'
    
data['lang'] = data['text'].apply(detect_lang)

In [6]:
data['lang'].value_counts()

uk    69407
ru    52633
un     4586
sr      147
uz        8
en        4
bg        3
mk        1
Name: lang, dtype: int64

In [7]:
cleaned_data = data[data['lang'] == 'uk'].groupby('category').filter(lambda x: len(x) >= 10)

In [8]:
x = np.vstack(cleaned_data.text.apply(vectorize))
y = np.hstack(cleaned_data.category)

In [12]:
run_model(x, y)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,409,410,411,412,413,414,415,accuracy,macro avg,weighted avg
precision,0.666667,0.272727,0.268657,0.0,1.0,0.0,0.0,0.0,0.043478,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329024,0.186505,0.313016
recall,1.0,1.0,0.174757,0.0,0.666667,0.0,0.0,0.0,0.066667,0.095238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329024,0.113778,0.329024
f1-score,0.8,0.428571,0.211765,0.0,0.8,0.0,0.0,0.0,0.052632,0.108108,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329024,0.119517,0.296953
support,2.0,3.0,103.0,8.0,6.0,6.0,13.0,4.0,15.0,21.0,...,3.0,11.0,16.0,7.0,6.0,8.0,4.0,0.329024,22430.0,22430.0


## 2.2 Видалення стопслів

In [14]:
STOPWORDS = load_stopwords()

def vectorize_v2(text):
    v = vec('unk')
    
    tokens = tokenize_uk.tokenize_words(text)
    
    for t in tokens:
        if not t in STOPWORDS:   
            v += vec(t)
            
    v /= len(tokens)

    return v

x = np.vstack(cleaned_data.text.apply(vectorize_v2))
y = np.hstack(cleaned_data.category)

run_model(x, y)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,409,410,411,412,413,414,415,accuracy,macro avg,weighted avg
precision,0.25,0.272727,0.333333,0.2,1.0,0.0,0.166667,0.0,0.0,0.090909,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.343469,0.189803,0.323118
recall,0.5,1.0,0.223301,0.125,0.666667,0.0,0.076923,0.0,0.0,0.047619,...,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.343469,0.119252,0.343469
f1-score,0.333333,0.428571,0.267442,0.153846,0.8,0.0,0.105263,0.0,0.0,0.0625,...,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.343469,0.126628,0.309526
support,2.0,3.0,103.0,8.0,6.0,6.0,13.0,4.0,15.0,21.0,...,3.0,11.0,16.0,7.0,6.0,8.0,4.0,0.343469,22430.0,22430.0


## 2.3 Використання лем

In [21]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer(lang='uk')

def vectorize_v3(text):
    v = vec('unk')
    
    tokens = tokenize_uk.tokenize_words(text)
    
    for t in tokens:
        if not t in STOPWORDS:   
            l = morph.parse(t)[0].normal_form
            v += vec(l)
            
    v /= len(tokens)

    return v
    
x = np.vstack(cleaned_data.text.apply(vectorize_v3))
y = np.hstack(cleaned_data.category)

run_model(x, y)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,409,410,411,412,413,414,415,accuracy,macro avg,weighted avg
precision,0.0,0.25,0.218182,0.0,0.8,0.0,0.285714,0.0,0.0,0.111111,...,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.35078,0.198246,0.327597
recall,0.0,1.0,0.116505,0.0,0.666667,0.0,0.153846,0.0,0.0,0.047619,...,0.0,0.181818,0.125,0.0,0.0,0.0,0.0,0.35078,0.123473,0.35078
f1-score,0.0,0.4,0.151899,0.0,0.727273,0.0,0.2,0.0,0.0,0.066667,...,0.0,0.307692,0.2,0.0,0.0,0.0,0.0,0.35078,0.132085,0.315933
support,2.0,3.0,103.0,8.0,6.0,6.0,13.0,4.0,15.0,21.0,...,3.0,11.0,16.0,7.0,6.0,8.0,4.0,0.35078,22430.0,22430.0


In [29]:
import pickle

with open('data.pickle', 'wb') as f:  
    le = LabelEncoder()
    
    y_encoded = le.fit_transform(y)
    
    data = {
        'input': x,
        'target': y
    }
    
    pickle.dump(data, f)

**Далі:** [Використання довільних алгоритмів класифікації](03-vectors-algo.ipynb)