In [None]:
!pip install pandas
!pip install nltk
!pip install sklearn

In [1]:
import pandas as pd
import numpy as np
import nltk

import re #регулярные выражения
import math
from collections import Counter
import requests
import time

alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'

nltk.download('stopwords') # to use stopwords
nltk.download('punkt') # to use word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wequalwo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wequalwo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# служебные функция для расчета памяти, заниаемой dataframe и csr
BYTES_TO_MB_DIV = 0.000001
def print_memory_usage_of_data_frame(df):
    mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
    print("Memory usage is " + str(mem) + " MB")
    
def get_csr_memory_usage(matrix):
    mem = (matrix.data.nbytes + matrix.indptr.nbytes + matrix.indices.nbytes) * BYTES_TO_MB_DIV
    print("Memory usage is " + str(mem) + " MB")

In [3]:
# десмонтсранция идеи токенизации и стоп-слов
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as stopwords
from nltk.tokenize import word_tokenize

example_sent = """Это простое предложение показывает фильтрацию на стоп-слова"""
 
stop_words = set(stopwords.words('russian'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

from nltk.corpus import stopwords
print("Стоп-слова русского языка:")
print(stopwords.words('russian'))

['Это', 'простое', 'предложение', 'показывает', 'фильтрацию', 'на', 'стоп-слова']
['Это', 'простое', 'предложение', 'показывает', 'фильтрацию', 'стоп-слова']
Стоп-слова русского языка:
['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'мож

## Создадим функции, позволяющие генерировать следующие ошибки в словах:

1.   **Пропуск буквы:** ${резонанс - рзонанс}$
2.   **Дублирование буквы:** ${резонанс - реезонанс}$
3.   **Перестановка букв:** ${резонанс - рзеонанс}$
4.   **Опечатка:** ${резонанс - ркзонанс}$
5.   **Некоторые орфографические ошибки:**  ${резонанс - ризонанс}$


In [4]:
# все ошибки на расстоянии 2 (расстояние Левинштейна)
def distance2(word):
    return {e2 for e1 in distance1(word) if e1 for e2 in distance1(e1)}

# все ошибки на расстоянии 1 (расстояние Левинштейна)
def distance1(word):
    pairs      = splits(word)
    transposes = [a+b[1]+b[0]+b[2:]  for (a,b) in pairs if len(b)>1]                           # перестановки
    replaces   = [a+c+b[1:]          for (a,b) in pairs if b for c in replaces_set[b[0]] if b] # замены: 
                                                                                                        # опечатки, 
                                                                                                        # пропуски и дублирования букв, 
                                                                                                        # орфографические ошибки

    last_replaces = [word[0:-1] + c for c in replaces_set[word[-1]]]                           # замены в конце слова
    return set(transposes + replaces + last_replaces)

def splits(word):
    return [(word[:i], word[i:])
            for i in range (len(word)+1)]

# список возможных замен в слове
replaces_set = pd.Series([          
               ['с', 'в', 'у', 'к', 'е', 'п', 'м', 'о', 'аа', ''],
               ['ь', 'о', 'л', 'д', 'ж', 'ю', 'п', 'бб', ''],
               ['ч', 'ы', 'ц', 'у', 'к', 'а', 'с', 'вв', 'ф', ''],
               ['р', 'н', 'ш', 'л', 'о', 'гг', 'к', ''],
               ['б', 'л', 'ш', 'щ', 'з', 'ж', 'ю', 'дд', 'т', ''],
               ['а', 'к', 'н', 'р', 'п', 'ее', 'и', 'ё', 'о', ''],
               ['ёё', 'й', 'йо', 'е', ''],
               ['ю', 'д', 'щ', 'з', 'х', 'э', 'жж', 'ш', ''],
               ['д', 'щ', 'х', 'э', 'ж', 'зз', 'с', ''],
               ['м', 'а', 'п', 'р', 'т', 'ии', 'е', 'ы', ''],
               ['ц', 'ы', 'ф', 'йй',''],
               ['а', 'в', 'у', 'е', 'п', 'п', 'а', 'кк', 'г', ''],
               ['ь', 'о', 'г', 'ш', 'щ', 'д', 'б', 'лл', ''],
               ['с', 'а', 'п', 'и', 'мм', 'н', ''],
               ['р', 'п', 'е', 'г', 'о', 'нн', 'м', ''],
               ['т', 'р', 'н', 'г', 'ш', 'л', 'ь', 'оо', 'а', 'е', 'ё', 'у', ''],
               ['м', 'а', 'к', 'е', 'н', 'р', 'и', 'пп', 'б', ''],
               ['и', 'п', 'е', 'н', 'г', 'о', 'т', 'рр', 'р', ''],
               ['ч', 'в', 'а', 'м', 'сс', 'з', ''],
               ['и', 'п', 'р', 'о', 'ь', 'тт', 'д', ''],
               ['в', 'ы', 'ц', 'к', 'а', 'уу', 'ю', 'о', ''],
               ['я', 'ч', 'ы', 'ц', 'й', 'фф', 'в', ''],
               ['ж', 'з', 'ъ', 'э', 'хх', ''],
               ['ы', 'ф', 'й', 'у', 'в', 'ы', 'цц', 'ч', ''],
               ['я', 'ф', 'ы', 'в', 'с', 'чч', ''],
               ['л', 'о', 'г', 'щ', 'д', 'шш', 'щ', 'шь', 'щь', 'ж', ''],
               ['л', 'ш', 'з', 'ж', 'д', 'щщ', 'щь', 'шь', 'ж', ''],
               ['э', 'х', 'ъъ', 'ь', ''],
               ['ч', 'я', 'ф', 'й', 'ц', 'у', 'в', 'и', 'ыы', ''],
               ['т', 'о', 'л', 'б', 'ьь', 'ъ', ''],
               ['ж', 'з', 'х', 'ъ', ''],
               ['б', 'л', 'д', 'ж', 'э', 'у', 'йу', ''],
               ['ф', 'ы', 'ч', 'яя', 'а', 'йа', ''],
               ], 
               index = ['а','б','в','г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'])


**Импортируем текст**

In [52]:
import codecs
with codecs.open('top10000.txt', 'r', encoding = 'utf-8') as file:
    TEXT = file.read().replace('\n', ' ') # для текста, в котором слова разделены '\n'
    # TEXT = file.read().replace(' ', ' ') # для текста, в котором слова разделены ' '
def tokens(text):
    return re.findall(r'[а-я]+', text.lower())
list_of_words = tokens(TEXT)

# удалим все слова, короче 3 символов (потому что слова из 2х символов неинформативны):
tmp = [w for w in list_of_words if len(w) > 2]
list_of_words = tmp

# пока ограничимся 200 словами
del list_of_words[200:]

# выведем первые 20 слов
print(list_of_words[:20])
print(len(list_of_words))

['что', 'тот', 'быть', 'весь', 'это', 'как', 'она', 'они', 'так', 'сказать', 'этот', 'который', 'может', 'человек', 'один', 'еще', 'такой', 'только', 'себя', 'свое']
200


**Для каждого слова рассчитаем возможыне ошибки на расстоянии 1 или 2. Затем произведем токенизацию**

In [53]:
# words = []
corpus = [] # все сгенерированные слова
labels = [] # метки для классификации в порядке ошибочных слов

for item in list_of_words:
    errors = distance1(item).union(distance2(item))# ??? как оптимизировать - неясно
    corpus += list(errors)
    # words.append([item, errors])
    labels += [item]*len(errors)

In [54]:
# токенизация слова на буквы
'''
def _tokenize(word):
    return [a + a for a in word]
'''
def _tokenize(word):
    return re.findall(r'[а-ё]', word.lower())
_tokenize('резонанс')

['р', 'е', 'з', 'о', 'н', 'а', 'н', 'с']

**Векторизация текста посредством *TfidfVectorizer (TF-IDF)***

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer = lambda x: _tokenize(x))
V = vectorizer.fit_transform(corpus)
print("Найденные токены:")
print(vectorizer.get_feature_names_out())

Найденные токены:
['а' 'б' 'в' 'г' 'д' 'е' 'ж' 'з' 'и' 'й' 'к' 'л' 'м' 'н' 'о' 'п' 'р' 'с'
 'т' 'у' 'ф' 'х' 'ц' 'ч' 'ш' 'щ' 'ъ' 'ы' 'ь' 'э' 'ю' 'я' 'ё']


***bool* - векторизация:**

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_bin = CountVectorizer(binary = True, tokenizer = lambda x: _tokenize(x))
V_bin = vectorizer_bin.fit_transform(corpus)
print("Найденные токены:")
print(vectorizer_bin.get_feature_names_out())
#print(V_bin.toarray())

Найденные токены:
['а' 'б' 'в' 'г' 'д' 'е' 'ж' 'з' 'и' 'й' 'к' 'л' 'м' 'н' 'о' 'п' 'р' 'с'
 'т' 'у' 'ф' 'х' 'ц' 'ч' 'ш' 'щ' 'ъ' 'ы' 'ь' 'э' 'ю' 'я' 'ё']


In [57]:
# для наглядности создадим dataframe с данными на основе векторизации TF-IDF
df = pd.DataFrame(V.toarray(), columns = vectorizer_bin.get_feature_names_out())

In [58]:
df['labels'] = labels
df

Unnamed: 0,а,б,в,г,д,е,ж,з,и,й,...,ш,щ,ъ,ы,ь,э,ю,я,ё,labels
0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,что
1,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.678188,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,что
2,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00000,0.517981,0.0,0.0,0.0,0.000000,что
3,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,что
4,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.58915,0.000000,0.0,0.0,0.0,0.741603,что
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273085,0.000000,0.0,0.0,0.611472,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,утро
273086,0.000000,0.0,0.0,0.000000,0.522181,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,утро
273087,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.664696,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,утро
273088,0.422471,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,утро


In [59]:
from sklearn.model_selection import train_test_split
from sklearn.metrics  import f1_score, accuracy_score
from sklearn.metrics import mean_absolute_error

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

## Перейдем к обучению модели

**Создадим pipeline для более простого тестирования в дальнейшем**

In [60]:
print("Data size:", len(corpus))

Data size: 273090


In [61]:
# варианты моделей:
# DecisionTreeClassifier()

# MultinomialNB()
# KNeighborsClassifier()
# LogisticRegression()

# требует доп настройки
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification

## pipline, основанный на TfidfVectorizer и DecisionTreeClassifier()
from sklearn.pipeline import Pipeline
model_pipeline = Pipeline([
    ("vecorizer", TfidfVectorizer(tokenizer = lambda x: _tokenize(x))),
    ("model", DecisionTreeClassifier())
]
)
X = corpus
y = labels

**обучим модель**

In [62]:
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
end = time.time()
duration = round(end - start, 2)
print("Train-test split: " + str(duration) + " secs")

start = time.time()
model_pipeline.fit(X_train, y_train)
end = time.time()
duration = round(end - start, 2)
print("Training: " + str(duration) + " secs")

Train-test split: 0.3 secs
Training: 22.08 secs


In [63]:
print(model_pipeline.predict(['мама'])[0])

сам


In [41]:
from sklearn.pipeline import Pipeline


class OnlinePipeline(Pipeline):
    def partial_fit(self, X, y = None):
        for i, step in enumerate(self.steps):
            name, est = step
            est.partial_fit(X, y)
            if i < len(self.steps) - 1:
                X = est.transform(X)
        return self

model_pipeline = OnlinePipeline([
    ("vecorizer", TfidfVectorizer(tokenizer = lambda x: _tokenize(x))),
    ("model", DecisionTreeClassifier())
]
)

X = corpus
y = labels

In [83]:
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
end = time.time()
duration = round(end - start, 2)
print("Train-test split: " + str(duration) + " secs")

a = int(len(X_train)/3)
sum_t = 0


div = 100
part = int(len(X_train)/div)

'''for i in range(1, div + 1):
    start = time.time()
    print((i*(part) - part), ":", i*(part))
    model_pipeline.fit(X_train[(i*(part) - part): i*(part)], y_train[(i*(part) - part): i*(part)])
    end = time.time()
    duration = round(end - start, 2)
    sum_t += duration
    if(i % 10 == 1):
        print("Training round " + str(i) + ": " + str(sum_t) + " secs")'''
    

#print("        Training: " + str(sum_t) + " secs")
t = int(len(X_train)/2)
print(len(X_train[0: t]))
print(len(X_train[t:]))
'''t = 273090/2
model_pipeline.fit(X_train[0: t], y_train[t: 273090]'''


Train-test split: 0.09 secs
109236
109236


't = 273090/2\nmodel_pipeline.fit(X_train[0: t], y_train[t: 273090]'

**Теперь протестируем обученную модель**

In [74]:
start = time.time()
pred = model_pipeline.predict(X_test)
end = time.time()
duration = round(end - start, 2)

print("Testing: " + str(duration) + " secs")
print('accuracy:', accuracy_score(y_test, pred))
print('      f1:', f1_score(y_test, pred, average = 'macro'))
# print('    mean:', mean_absolute_error(y_test, pred))

Testing: 0.34 secs
accuracy: 0.4090592844849683
      f1: 0.2998841157518703


    Testing: 0.37 secs
    accuracy: 0.721703467721264
          f1: 0.5964781878693777

## Проведем "живой" тест на модели

In [24]:
pins = input()
pins = re.findall(r'[а-я]+', pins.lower())

for pin in pins:
    if len(pin) > 2:
        print(model_pipeline.predict([pin])[0], end = ' ')
    else:
        print(pin, end = ' ')

быть или не ыиь
быть или не или 

In [32]:
# для проверки, есть ли слово в исходном словаре
print('привет' in list_of_words)

False


## bin

In [303]:
def convert_to_sparse_pandas(df, exclude_columns = []):
    df = df.copy()
    exclude_columns = set(exclude_columns)

    for (columnName, columnData) in df.iteritems():
        if columnName in exclude_columns:
            continue
        df[columnName] = pd.arrays.SparseArray(columnData.values, dtype = np.float16)

    return df

df_sparse = convert_to_sparse_pandas(df, exclude_columns = ["label"])
# display(df_sparse.dtypes)
print_memory_usage_of_data_frame(df_sparse)

Memory usage is 182.116 MB


In [105]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred, average = 'macro'))

print(mean_absolute_error(y_test, pred))

0.6571622078283824
0.5426987035755111
53.354150671547565


In [304]:
y = df['label']
X = df[df.columns.difference(['label'])]

y_sparse = df_sparse['label']
X_sparse = df_sparse[df_sparse.columns.difference(['label'])]

y_csr = df['label']
X_csr = V

y_binary = df['label']
X_binary = V_bin

print('X:')
print_memory_usage_of_data_frame(X)
print('X_sparse:')
print_memory_usage_of_data_frame(X_sparse)
print('X_csr:')
get_csr_memory_usage(X_csr)
print('X_binary:')
get_csr_memory_usage(X_binary)

X:
Memory usage is 233.391 MB
X_sparse:
Memory usage is 175.044 MB
X_csr:
Memory usage is 68.581552 MB
X_binary:
Memory usage is 68.581552 MB


In [307]:
for i in range(len(models)):
    print(list_of_names[i])
    test(models[i], X_tests_arr[i], Y_tests_arr[i])

Scipy sparse matrix
Testing: 0.32 secs
accuracy: 0.011158744881569125
      f1: 0.014289106223557658
    mean: 188.53618532678777


Scipy sparse matrix binary
Testing: 0.49 secs
accuracy: 0.708187227111282
      f1: 0.5981014691555474
    mean: 45.30782978530869




In [306]:
def test(mod, Xtest, Ytest):
    start = time.time()
    pred = mod.predict(Xtest)
    end = time.time()
    duration = round(end - start, 2)
    print("Testing: " + str(duration) + " secs")
    print('accuracy:', accuracy_score(Ytest, pred))
    print('      f1:', f1_score(Ytest, pred, average = 'macro'))
    print('    mean:', mean_absolute_error(Ytest, pred))
    print("\n")

In [313]:
models = []
list_of_names = []
X_tests_arr = []
Y_tests_arr = []

model = DecisionTreeClassifier()
# model = BernoulliNB()

# model = GaussianNB()
# model = MultinomialNB()
# msodel = GaussianNB()
# model = MultinomialNB()
# model = KNeighborsClassifier()
# model = LogisticRegression()


#     'Pandas dataframe': [X, y],
vector_dict = {
     'Scipy sparse matrix': [X_csr, y_csr],
     'Scipy sparse matrix binary': [X_binary, y_binary]
    }

for key, item in vector_dict.items():
    print(key)
    list_of_names.append(key)
    start = time.time()
    if (key != 'Pandas dataframe'):
        XX = item[0].toarray()
    else:
        XX = X
        
    X_train, X_test, y_train, y_test = train_test_split(XX, y, test_size = 0.2, random_state = 42)
    end = time.time()
    duration = round(end - start, 2)
    print("Train-test split: " + str(duration) + " secs")

    start = time.time()
    model.fit(X_train, y_train)

    end = time.time()
    duration = round(end - start, 2)
    
    models.append(model)
    X_tests_arr.append(X_test)
    Y_tests_arr.append(y_test)
    print("Training: " + str(duration) + " secs")
    print("\n")

Scipy sparse matrix
Train-test split: 0.26 secs


KeyboardInterrupt: 

In [None]:
tokets_list = vectorizer_bin.get_feature_names_out()

def _tokenize2(pin):
    return [a + a for a in pin]

pins = input()
pins = re.findall(r'[а-я]+', pins.lower())

for pin in pins:
    if(len(pin) < 3):
        print(pin, end = ' ')
        continue
    pin_vec = [0]*33
    pin = _tokenize2(pin)
    for i in range(33):
        if tokets_list[i] in pin:
            pin_vec[i] = 1

    predict = models[0].predict([pin_vec])
    print(list_of_words[predict[0]], end = ' ')