## Импортики, ничего интересного

In [43]:
import os
import re

from functools import lru_cache, partial
from json import load as json_load

from nltk import download as nltk_download
from nltk.corpus import stopwords
from pandarallel import pandarallel
from pandas import DataFrame, concat, read_csv
from pymorphy2 import MorphAnalyzer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from tqdm.notebook import tqdm

# Alternative kept for reference (Linux-only): read the 'cpu cores' value from
# /proc/cpuinfo and pass it to pandarallel as the worker count.
# cores_count = os.popen("grep -m 1 'cpu cores' /proc/cpuinfo").read().split()[-1]
# pandarallel.initialize(progress_bar=False, nb_workers=int(cores_count))
# Start pandarallel without a progress bar; the worker count is left to pandarallel.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Лемматизация и клининг регулярками

Для начала напишем лемматизацию и клинер, воспользовавшись «знаменитым» списком стоп-слов от Артемия Лебедева.

На самом деле я добавил его на всякий случай — уже после того, как при обучении LSTM обнаружил, что неправильно обработал данные. Пусть лучше он будет, чем его не будет.

In [44]:
# Make sure the NLTK stop-word corpora are available locally.
nltk_download('stopwords')

# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is pinned so the JSON is read the same way on every platform.
with open('../lib/models/stopwords.json', encoding='utf-8') as json_file:
    additional_stopwords = json_load(json_file)

# Single lookup set: NLTK Russian + the extra project list + NLTK English.
my_stopwords = set(stopwords.words('russian'))
my_stopwords.update(additional_stopwords)
my_stopwords.update(stopwords.words('english'))

# One shared analyzer instance, reused by every lemmatization call.
morph = MorphAnalyzer()

[nltk_data] Downloading package stopwords to /home/yk4r2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def regexp_replacer(cyrillic: str, symbol: str, string: str) -> str:
    """Replace every match of the regular expression `cyrillic` in `string`.

    Args:
        cyrillic: regular-expression pattern to search for.
        symbol: replacement text (may contain backreferences such as ``\\1``).
        string: text to process.

    Returns:
        A copy of `string` with all matches substituted by `symbol`.
    """
    pattern = re.compile(cyrillic)
    return pattern.sub(symbol, string)

def clean(string: str) -> str:
    """Normalize raw text to lower-case words separated by single spaces.

    The substitutions are applied in order:

    1. every run of characters other than Latin/Cyrillic letters, digits and
       ``. , ( )`` becomes one space;
    2. a space is inserted before each remaining punctuation character;
    3. runs of spaces are collapsed;
    4. one leading space, if present, is removed;
    5. every run of non-word characters or underscores becomes one space.

    The result is then lower-cased. Because step 5 runs after step 4, the
    output can still begin or end with a single space (for example when the
    input starts or ends with punctuation).

    Args:
        string: text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    # The former third step substituted each match with itself (a no-op),
    # so it is omitted; the output is unchanged.
    substitutions = (
        (r'[^0-9a-zA-Zа-яА-ЯёЁ\.,\(\)]+', ' '),
        (r'([^\w ])', r' \1'),
        (r' +', r' '),
        (r'^ ', r''),
        (r'[\W_]+', ' '),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.lower()

def find_from_dict(searcher: dict, string: str) -> list:
    """Build a 0/1 flag for every regular expression in `searcher`.

    Args:
        searcher: mapping whose values are regular-expression patterns; the
            keys are not used here.
        string: text to search.

    Returns:
        A list with one integer per mapping entry, in the mapping's iteration
        order: 1 if the pattern is found anywhere in `string`, otherwise 0.
    """
    return [
        1 if re.search(pattern, string) else 0
        for pattern in searcher.values()
    ]

def replace_from_dict(replacer: dict, string: str) -> str:
    """Apply every pattern -> replacement pair of `replacer` to `string`.

    Substitutions run sequentially in the mapping's iteration order, so a
    later pattern sees the text produced by the earlier ones.

    Args:
        replacer: mapping of regular-expression pattern to replacement value;
            each value is converted with ``str`` before use.
        string: text to process.

    Returns:
        The text after all substitutions.
    """
    result = string
    for pattern, replacement in replacer.items():
        result = regexp_replacer(pattern, str(replacement), result)
    return result

@lru_cache(maxsize=100000)
def lemmatizer(word: str, morph) -> str:
    """Return the normal form of `word` according to its first parse.

    Results are memoized by ``lru_cache``; both arguments are part of the
    cache key, so `morph` must be hashable.

    Args:
        word: token to lemmatize.
        morph: analyzer object exposing ``parse(word)``, whose first result
            has a ``normal_form`` attribute.

    Returns:
        The ``normal_form`` of the first parse (a string, not a list).
    """
    return morph.parse(word)[0].normal_form

In [5]:
def process_text(text: str) -> str:
    """Clean `text`, drop stop words and lemmatize the remaining tokens.

    The input is converted with ``str`` first, then passed through ``clean``
    and split on whitespace. Tokens found in the module-level ``my_stopwords``
    set are removed before lemmatization, and each remaining token is
    lemmatized with the module-level ``morph`` analyzer.

    Args:
        text: raw text; any value accepted by ``str`` works.

    Returns:
        The lemmatized tokens joined by single spaces (a string).
    """
    words = clean(str(text)).split()
    # `my_stopwords` is the only stop-word collection this notebook defines;
    # it already combines the Russian, English and additional lists.
    return ' '.join(
        lemmatizer(word, morph)
        for word in words
        if word not in my_stopwords
    )

## Очистка датасетов и поиск контактов регулярочками

In [23]:
%%time
# Number of CSV rows parsed per chunk.
CHUNK_SIZE = 100000

# Read each file chunk by chunk and concatenate the chunks ONCE. Growing the
# frame with concat() inside the loop re-copies all accumulated rows on every
# iteration.
df_val = concat(list(read_csv('../data/val.csv', chunksize=CHUNK_SIZE)))
df_train = concat(list(read_csv('../data/train.csv', chunksize=CHUNK_SIZE)))

CPU times: user 9.99 s, sys: 267 ms, total: 10.3 s
Wall time: 10.3 s


Собственно, регулярочки

In [24]:
%%time
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('../lib/models/regexps/regexp.json', encoding='utf-8') as json_file:
    regexps = json_load(json_file)

our_find_from_dict = partial(find_from_dict, regexps)
regexp_names = list(regexps.keys())

# The regexps are matched against the raw descriptions, i.e. before `clean`
# rewrites them further down. Missing descriptions are treated as empty
# strings so that re.search never receives a non-string value.
regexps_train = df_train['description'].fillna('').parallel_apply(our_find_from_dict)
regexps_val = df_val['description'].fillna('').parallel_apply(our_find_from_dict)

# One 0/1 column per regexp. The source index is reused so the axis=1 concat
# below lines the flags up with their rows whatever the frame's index is.
regexps_train = DataFrame(regexps_train.tolist(), index=df_train.index, columns=regexp_names)
regexps_val = DataFrame(regexps_val.tolist(), index=df_val.index, columns=regexp_names)

# Clean both text columns of both splits (same order as before: val, train).
for frame in (df_val, df_train):
    for column in ('description', 'title'):
        frame[column] = frame[column].fillna('').parallel_apply(clean)

df_train = concat([df_train, regexps_train], axis=1)
df_val = concat([df_val, regexps_val], axis=1)

CPU times: user 5.23 s, sys: 4.76 s, total: 9.99 s
Wall time: 42.6 s


Тут я заменяю слова типа «собака», «точка» и подобные на нормальные аналоги, чтобы потом было проще их искать.

Я знаю, что это можно было более оптимально написать как функцию clean, но я уже 4 часа обрабатываю датасет, поэтому пусть будет так.

In [25]:
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('../lib/models/regexps/numbers_and_punctuation.json', encoding='utf-8') as f:
    punctuation = json_load(f)

# Apply the pattern -> replacement table to both text columns of both splits
# (same order as before: val, then train; description, then title).
replace_punctuation = partial(replace_from_dict, punctuation)
for frame in (df_val, df_train):
    for column in ('description', 'title'):
        frame[column] = frame[column].parallel_apply(replace_punctuation)

In [26]:
!free -mh

              total        used        free      shared  buff/cache   available
Mem:           31Gi        10Gi       9,8Gi        11Mi        10Gi        19Gi
Swap:         2,0Gi       6,0Mi       2,0Gi


Лемматизация и клининг

In [27]:
%%time
# Validation split: stop-word removal and lemmatization of both text columns.
df_val['description'] = df_val['description'].parallel_apply(process_text)
df_val['title'] = df_val['title'].parallel_apply(process_text)
print('val done!')

# Training split: the progress bar is enabled only for the description
# column, then switched off again before the titles are processed.
pandarallel.initialize(progress_bar=True)
df_train['description'] = df_train['description'].parallel_apply(process_text)
pandarallel.initialize(progress_bar=False)
df_train['title'] = df_train['title'].parallel_apply(process_text)

val done!
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=61531), Label(value='0 / 61531')))…

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 7.55 s, sys: 3.58 s, total: 11.1 s
Wall time: 1min 30s


In [28]:
# Spot check: processed title and description of one training row.
' '.join(df_train.iloc[457437][['title', 'description']])

'девочка белый девочка белый цена документ документ дорогой 200 миникнуть привить подрастать беленький короткий носик очень пушистый ласковый дополнительный инфо тело фото наш доставка мочь организовать'

In [29]:
# (Author's aside) In Scala, for example, you could not do this, because
# static typing is better than dynamic typing.

# Columns that are no longer needed once the combined text column exists.
DROPPED_COLUMNS = [
    'title', 'description',
    'subcategory', 'price', 'region', 'city', 'datetime_submitted',
]

# Merge title and description into one text field, separated by a space.
df_train['title_and_description'] = df_train['title'] + ' ' + df_train['description']
df_val['title_and_description'] = df_val['title'] + ' ' + df_val['description']

# A single non-in-place drop per frame instead of two `inplace=True` calls.
df_train = df_train.drop(columns=DROPPED_COLUMNS)
df_val = df_val.drop(columns=DROPPED_COLUMNS)

In [30]:
df_train.columns

Index(['category', 'is_bad', 'phone_normal', 'phone_biased', 'email',
       'youtube', 'home_phone', 'site', 'phone_operators',
       'title_and_description'],
      dtype='object')

## Вот так я "отбирал" регулярки и прочие полезности

In [31]:
def formatter(x):
    """Render a number with four digits after the decimal point."""
    return '{:.4f}'.format(x)


target = df_train['is_bad']
table = []

# Score every regexp flag column as if it were a prediction of `is_bad`.
for regexp_name in tqdm(regexps.keys()):
    flags = df_train[regexp_name]
    accuracy, f1, auc = (
        metric(target, flags)
        for metric in (accuracy_score, f1_score, roc_auc_score)
    )
    table.append([regexp_name] + [formatter(score) for score in (accuracy, f1, auc)])

print(tabulate(table, headers=['name', 'accuracy', 'f1', 'auc'], tablefmt='orgtbl'))

  0%|          | 0/7 [00:00<?, ?it/s]

| name            |   accuracy |     f1 |    auc |
|-----------------+------------+--------+--------|
| phone_normal    |     0.7023 | 0.5208 | 0.6907 |
| phone_biased    |     0.8026 | 0.3482 | 0.6036 |
| email           |     0.7575 | 0.0401 | 0.5068 |
| youtube         |     0.7582 | 0.0129 | 0.5024 |
| home_phone      |     0.6146 | 0.5041 | 0.6808 |
| site            |     0.7186 | 0.2075 | 0.5258 |
| phone_operators |     0.7573 | 0.0166 | 0.5024 |


Оставил те, у которых auc выше 0.5, номера и числа заменил на цифры, соцсети объединил и потестил ещё раз.

## Сохранение датафрейма

In [32]:
# Write both preprocessed splits to CSV without the index column.
df_train.to_csv(path_or_buf='../data/train_preprocessed.csv', index=False)
df_val.to_csv(path_or_buf='../data/val_preprocessed.csv', index=False)