In [1]:
import pandas as pd
import numpy as np
import nltk

import re #регулярные выражения
import math
from collections import Counter
import requests
import time

alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'

# nltk.download('stopwords') # to use stopwords
# nltk.download('punkt') # to use word_tokenize

In [4]:
# служебные функция для расчета памяти, заниаемой dataframe и csr
BYTES_TO_MB_DIV = 0.000001
def print_memory_usage_of_data_frame(df):
    mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
    print("Memory usage is " + str(mem) + " MB")
    
def get_csr_memory_usage(matrix):
    mem = (matrix.data.nbytes + matrix.indptr.nbytes + matrix.indices.nbytes) * BYTES_TO_MB_DIV
    print("Memory usage is " + str(mem) + " MB")

In [5]:
# демонстрация идеи токенизации и стоп-слов
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as stopwords
from nltk.tokenize import word_tokenize

example_sent = """Это простое предложение показывает фильтрацию на стоп-слова"""
 
stop_words = set(stopwords.words('russian'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

from nltk.corpus import stopwords
print("Стоп-слова русского языка:")
print(stopwords.words('russian'))

['Это', 'простое', 'предложение', 'показывает', 'фильтрацию', 'на', 'стоп-слова']
['Это', 'простое', 'предложение', 'показывает', 'фильтрацию', 'стоп-слова']
Стоп-слова русского языка:
['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'мож

## Создадим функции, позволяющие генерировать следующие ошибки в словах:

1.   **Пропуск буквы:** ${резонанс - рзонанс}$
2.   **Дублирование буквы:** ${резонанс - реезонанс}$
3.   **Перестановка букв:** ${резонанс - рзеонанс}$
4.   **Опечатка:** ${резонанс - ркзонанс}$
5.   **Некоторые орфографические ошибки:**  ${резонанс - ризонанс}$


In [43]:
# все ошибки на расстоянии 2 (расстояние Левинштейна)
def distance2(word):
    return {e2 for e1 in distance1(word) if e1 for e2 in distance1(e1)}

# все ошибки на расстоянии 1 (расстояние Левинштейна)
def distance1(word):
    pairs      = splits(word)
    transposes = [a+b[1]+b[0]+b[2:]  for (a,b) in pairs if len(b)>1]                           # перестановки
    replaces   = [a+c+b[1:]          for (a,b) in pairs if b for c in replaces_set[b[0]] if b] # замены: 
                                                                                                        # опечатки, 
                                                                                                        # пропуски и дублирования букв, 
                                                                                                        # орфографические ошибки

    last_replaces = [word[0:-1] + c for c in replaces_set[word[-1]]]                           # замены в конце слова
    #return set(transposes + replaces + last_replaces)
    return set(replaces + last_replaces + transposes)

def splits(word):
    return [(word[:i], word[i:])
            for i in range (len(word)+1)]

# список возможных замен в слове
replaces_set = pd.Series([          
               ['с', 'в', 'у', 'к', 'е', 'п', 'м', 'аа', ''],
               ['ь', 'о', 'л', 'д', 'ю', 'бб', ''],
               ['ч', 'ы', 'ц', 'у', 'к', 'а', 'с', 'вв', ''],
               ['р', 'н', 'ш', 'л', 'о', 'гг', ''],
               ['б', 'л', 'з', 'ж', 'ю', 'дд', ''],
               ['а', 'к', 'н', 'р', 'п', 'ее', 'ё', 'и', ''],
               ['ёё', 'й', 'е', ''],
               ['ю', 'д', 'щ', 'з', 'х', 'э', 'жж', ''],
               ['д', 'щ', 'х', 'э', 'ж', 'зз', 'с', ''],
               ['м', 'а', 'п', 'р', 'т', 'ии', ''],
               ['ц', 'ы', 'ф', 'йй',''],
               ['а', 'в', 'у', 'е', 'п', 'а', 'кк', ''],
               ['ь', 'о', 'г', 'ш', 'щ', 'д', 'б', 'лл', ''],
               ['с', 'а', 'п', 'и', 'мм', ''],
               ['р', 'п', 'е', 'г', 'о', 'нн', ''],
               ['т', 'р', 'н', 'г', 'ш', 'л', 'ь', 'оо', ''],
               ['м', 'а', 'к', 'е', 'н', 'р', 'и', 'пп', ''],
               ['и', 'п', 'е', 'н', 'г', 'о', 'т', 'рр', 'р', ''],
               ['ч', 'в', 'а', 'м', 'сс', ''],
               ['и', 'п', 'р', 'о', 'ь', 'тт', ''],
               ['в', 'ы', 'ц', 'к', 'а', 'уу', ''],
               ['я', 'ч', 'ы', 'ц', 'й', 'фф', ''],
               ['ж', 'з', 'ъ', 'э', 'хх', ''],
               ['ы', 'ф', 'й', 'у', 'в', 'ы', 'цц', ''],
               ['я', 'ф', 'ы', 'в', 'с', 'чч', ''],
               ['л', 'о', 'г', 'щ', 'д', 'шш', 'шь', ''],
               ['л', 'ш', 'з', 'ж', 'д', 'щщ', 'шь', 'ж', ''],
               ['э', 'х', 'ъъ', 'ь', ''],
               ['ч', 'я', 'ф', 'й', 'ц', 'у', 'в', 'ыы', ''],
               ['т', 'о', 'л', 'б', 'ьь', ''],
               ['ж', 'з', 'х', 'ъ', ''],
               ['б', 'д', 'ж', 'юю', ''],
               ['ф', 'ы', 'ч', 'яя', ''],
               ], 
               index = ['а','б','в','г', 'д', 'е', 'ё', 
                            'ж', 'з', 'и', 'й', 'к', 
                            'л', 'м', 'н', 'о', 'п', 
                            'р', 'с', 'т', 'у', 'ф', 
                            'х', 'ц', 'ч', 'ш', 'щ', 
                            'ъ', 'ы', 'ь', 'э', 'ю', 'я'])


**Импортируем текст**

In [44]:
import codecs
with codecs.open('top10000.txt', 'r', encoding = 'utf-8') as file:
#with codecs.open('russian.txt', 'r', encoding = 'windows 1251') as file:
    TEXT = file.read().replace('\n', ' ') # для текста, в котором слова разделены '\n'
    # TEXT = file.read().replace(' ', ' ') # для текста, в котором слова разделены ' '
def tokens(text):
    return re.findall(r'[а-ё]+', text.lower())
list_of_words = tokens(TEXT)

# удалим все слова, короче 3 символов (потому что слова из 2х символов неинформативны):
tmp = [w for w in list_of_words if len(w) > 2]
list_of_words = tmp

# пока ограничимся 38701 словами
# del list_of_words[:38701]

# выведем первые 20 слов
print(list_of_words[:20])
print(len(list_of_words))

['что', 'тот', 'быть', 'весь', 'это', 'как', 'она', 'они', 'так', 'сказать', 'этот', 'который', 'может', 'человек', 'один', 'еще', 'такой', 'только', 'себя', 'свое']
8714


**Для каждого слова рассчитаем возможыне ошибки на расстоянии 1 или 2. Затем произведем токенизацию**

In [45]:
# words = []
corpus = [] # все сгенерированные слова
labels = [] # метки для классификации в порядке ошибочных слов



start = time.time()

for item in list_of_words:
    errors = distance1(item)#.union(distance2(item))# ??? как оптимизировать - неясно
    errors = errors.union(item)
    # words.append([item, errors])
    #tmp = [w for w in errors if len(w) > 2]
    #errors = tmp
    corpus += list(errors)
    labels += [item]*len(errors)

end = time.time()
duration = round(end - start, 2)
print("Set generation: " + str(duration) + " secs")
print("    Total size:", len(labels))

Set generation: 0.51 secs
    Total size: 517012


In [46]:
# сохранение данных, для быстрого анализа в дальнейшем
import pickle

with open('distance1.data', 'wb') as file:
    # сохраняем данные как двоичный поток
    pickle.dump([corpus, labels], file)

In [47]:
# токенизация слова на буквы
'''
def _tokenize(word):
    return [a + a for a in word]
'''
'''
def _tokenize(word):
    return re.findall(r'[а-ё]', word.lower())

'''
def _tokenize(word):
    out = []
    for i in range(len(word)):
        out.append(word[i])
        out.append(word[i] + str(i))
    return out



_tokenize('резонанс')

['р',
 'р0',
 'е',
 'е1',
 'з',
 'з2',
 'о',
 'о3',
 'н',
 'н4',
 'а',
 'а5',
 'н',
 'н6',
 'с',
 'с7']

**Векторизация текста посредством *TfidfVectorizer (TF-IDF)***

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:

vectorizer = TfidfVectorizer(tokenizer = lambda x: _tokenize(x))

start = time.time()
V = vectorizer.fit_transform(corpus)
end = time.time()
duration = round(end - start, 2)
print("Vectorization: " + str(duration) + " secs")

print("Найденных токенов:")
print(len(vectorizer.get_feature_names_out()))


Vectorization: 5.16 secs
Найденных токенов:
430


***bool* - векторизация:**

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_bin = CountVectorizer(binary = True, tokenizer = lambda x: _tokenize(x))
V_bin = vectorizer_bin.fit_transform(corpus)
print("Найденных токенов:")
print(len(vectorizer_bin.get_feature_names_out()))
#print(V_bin.shape())
#print(V_bin.toarray())

Найденных токенов:
430


***hash* - векторизация:**

In [51]:
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer_hash = HashingVectorizer(n_features = 430, tokenizer = lambda x: _tokenize(x))
V_hash = vectorizer_hash.fit_transform(corpus)
print("Shape:")
print(V_hash.shape)



Shape:
(517012, 430)


In [13]:
# для наглядности создадим dataframe с данными на основе векторизации TF-IDF
df = pd.DataFrame(V.toarray(), columns = vectorizer.get_feature_names_out())

In [14]:
print("Data size:", len(corpus))
#print(13573258/len(corpus))
print(corpus[:10])

Data size: 517012
['чоо', 'чт', 'ыто', 'сто', 'тчо', 'фто', 'вто', 'чио', 'ято', 'чтоо']


In [52]:
df['labels'] = labels
df

Unnamed: 0,а,а0,а1,а10,а11,а2,а3,а4,а5,а6,...,ё11,ё2,ё3,ё4,ё5,ё6,ё7,ё8,ё9,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко
517008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко
517009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко
517010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко


In [53]:
print_memory_usage_of_data_frame(df)
get_csr_memory_usage(V)

Memory usage is 1782.658 MB
Memory usage is 76.630592 MB


In [54]:
from sklearn.model_selection import train_test_split
from sklearn.metrics  import f1_score, accuracy_score, precision_score, recall_score, jaccard_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

## Перейдем к обучению модели

**Создадим pipeline для более простого тестирования в дальнейшем**

In [55]:
import pickle
# читаем сохраненные данные
with open('distance1.data', 'rb') as file:  
    # сохраняем данные как двоичный поток
    data = pickle.load(file)
corpus = data[0]
labels = data[1]

In [56]:
# варианты моделей:
# DecisionTreeClassifier()

# MultinomialNB()
# KNeighborsClassifier()
# LogisticRegression()

# требует доп настройки
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification

## pipline, основанный на TfidfVectorizer и DecisionTreeClassifier()
from sklearn.pipeline import Pipeline
model_pipeline = Pipeline([
    ("vecorizer", TfidfVectorizer(tokenizer = lambda x: _tokenize(x))),
    ("model",     MultinomialNB())
]
)
X = corpus
y = labels

#LogisticRegression(solver = 'sag', random_state = 42)

**обучим модель**

In [58]:
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
end = time.time()
duration = round(end - start, 2)
print("Train-test split: " + str(duration) + " secs")
start = time.time()
model_pipeline.fit(X_train, y_train)
end = time.time()
duration = round(end - start, 2)
print("Training: " + str(duration) + " secs")

Train-test split: 0.33 secs
Training: 17.07 secs


**Теперь протестируем обученную модель**

In [59]:
start = time.time()
pred = model_pipeline.predict(X_test)
end = time.time()
duration = round(end - start, 2)
print("Testing: " + str(duration) + " secs")

Testing: 4.33 secs


In [60]:
print(' accuracy:', accuracy_score(y_test, pred))
print('precision:', precision_score(y_test, pred, average = 'macro', zero_division=0))
print('   recall:', recall_score(y_test, pred, average = 'macro', zero_division=0))
print('  jaccard:', jaccard_score(y_test, pred, average = 'macro', zero_division=0))
print('       f1:', f1_score(y_test, pred, average = 'macro', zero_division=0))

 accuracy: 0.838011914586342
precision: 0.9051902923536509
   recall: 0.8031043114571247
  jaccard: 0.7373575537321745
       f1: 0.8292856930306153


**Финт ушами: тестируем модель на данных, в которых допустимо расстояние 2**

In [61]:
# читаем сохраненные данные
with open('distance2.data', 'rb') as file:  
    # сохраняем данные как двоичный поток
    data = pickle.load(file)

In [64]:
X2 = data[0]
y2 = data[1]
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.02, random_state = 42)

In [65]:
start = time.time()
pred2 = model_pipeline.predict(X_test2)
end = time.time()
duration = round(end - start, 2)
print("Testing: " + str(duration) + " secs")

Testing: 14.73 secs


In [67]:
print(' accuracy:', accuracy_score(y_test2, pred2))
print('precision:', precision_score(y_test2, pred2, average = 'macro', zero_division=0))
print('   recall:', recall_score(y_test2, pred2, average = 'macro', zero_division=0))
print('  jaccard:', jaccard_score(y_test2, pred2, average = 'macro', zero_division=0))
print('       f1:', f1_score(y_test2, pred2, average = 'macro', zero_division=0))

 accuracy: 0.8676181246879864
precision: 0.8058546754440171
   recall: 0.7610693370628323
  jaccard: 0.6656015158602356
       f1: 0.7674963388145969


## Проведем "живой" тест на модели

In [18]:
pins = 'быь иьи не бвть'
pins = re.findall(r'[а-ё]+', pins.lower())


for pin in pins:
    if len(pin) > 2:
        print(model_pipeline.predict([pin])[0], end = ' ')
    else:
        print(pin, end = ' ')

быть или не быть 

In [75]:
print(model_pipeline.predict(['грфпрут']))

['группа']


## bin

In [None]:
def convert_to_sparse_pandas(df, exclude_columns = []):
    df = df.copy()
    exclude_columns = set(exclude_columns)

    for (columnName, columnData) in df.iteritems():
        if columnName in exclude_columns:
            continue
        df[columnName] = pd.arrays.SparseArray(columnData.values, dtype = np.float16)

    return df

df_sparse = convert_to_sparse_pandas(df, exclude_columns = ["label"])
# display(df_sparse.dtypes)
print_memory_usage_of_data_frame(df_sparse)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred, average = 'macro'))

print(mean_absolute_error(y_test, pred))

0.6571622078283824
0.5426987035755111
53.354150671547565


In [None]:
y = df['label']
X = df[df.columns.difference(['label'])]

y_sparse = df_sparse['label']
X_sparse = df_sparse[df_sparse.columns.difference(['label'])]

y_csr = df['label']
X_csr = V

y_binary = df['label']
X_binary = V_bin

print('X:')
print_memory_usage_of_data_frame(X)
print('X_sparse:')
print_memory_usage_of_data_frame(X_sparse)
print('X_csr:')
get_csr_memory_usage(X_csr)
print('X_binary:')
get_csr_memory_usage(X_binary)

X:
Memory usage is 233.391 MB
X_sparse:
Memory usage is 175.044 MB
X_csr:
Memory usage is 68.581552 MB
X_binary:
Memory usage is 68.581552 MB


In [None]:
def test(mod, Xtest, Ytest):
    start = time.time()
    pred = mod.predict(Xtest)
    end = time.time()
    duration = round(end - start, 2)
    print("Testing: " + str(duration) + " secs")
    print('accuracy:', accuracy_score(Ytest, pred))
    print('      f1:', f1_score(Ytest, pred, average = 'macro'))
    print('    mean:', mean_absolute_error(Ytest, pred))
    print("\n")