# Бейзлайн функциональности "Определение обсценной лексики"

## Загрузки и импорты

### Библиотеки

In [1]:
# Pin gensim to the release this notebook was executed with (3.8.3), so a fresh
# run does not silently pick up a different version. %pip installs into the
# environment of the running kernel.
%pip install --upgrade gensim==3.8.3

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 81.5MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3


In [2]:
# Pin pymorphy2 to the release this notebook was executed with (0.9.1).
# The requirement is quoted so the shell never treats "[fast]" as a glob pattern.
%pip install "pymorphy2[fast]==0.9.1"

Collecting pymorphy2[fast]
[?25l  Downloading https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl (55kB)
[K     |██████                          | 10kB 15.8MB/s eta 0:00:01[K     |███████████▉                    | 20kB 9.3MB/s eta 0:00:01[K     |█████████████████▊              | 30kB 5.6MB/s eta 0:00:01[K     |███████████████████████▋        | 40kB 5.2MB/s eta 0:00:01[K     |█████████████████████████████▌  | 51kB 4.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.5MB/s 
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/3a/79/bea0021eeb7eeefde22ef9e96badf174068a2dd20264b9a378f2be1cdd9e/pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2MB)
[K     |████████████████████████████████| 8.2MB 5.2MB/s 
Collecting dawg-python>=0.7.1
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85

### Загрузка предобученной векторной модели FastText Araneum Russicum

In [71]:
# -c resumes an interrupted download and does nothing when the archive is already
# complete, so re-running this cell does not fetch a second copy of the 2.5 GB file.
!wget -c https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz

--2021-03-01 15:19:59--  https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz
Resolving rusvectores.org (rusvectores.org)... 116.203.104.23
Connecting to rusvectores.org (rusvectores.org)|116.203.104.23|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2691248108 (2.5G) [application/x-gzip]
Saving to: ‘araneum_none_fasttextcbow_300_5_2018.tgz’


2021-03-01 15:22:15 (19.0 MB/s) - ‘araneum_none_fasttextcbow_300_5_2018.tgz’ saved [2691248108/2691248108]



### Импорты

In [72]:
import tarfile
import gensim
import pandas as pd
import numpy as np
import nltk
# Fetch the NLTK 'punkt' data package (reported as up-to-date when already present).
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from pymorphy2 import MorphAnalyzer
from tqdm import tqdm
from string import punctuation

# Notebook-level text-processing objects.
morph = MorphAnalyzer()  # pymorphy2 morphological analyzer
tokenizer = nltk.tokenize.WhitespaceTokenizer()  # NLTK whitespace tokenizer
# ASCII punctuation followed by extra typographic symbols (quotes, dashes, ellipsis, №).
punct = ''.join((punctuation, '«»—…“”*№–'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
# Location configuration. Every directory and the model base name are spelled out
# exactly once, so relocating the notebook means editing one line per location
# instead of six separate literals. The resulting path strings are unchanged.
WORK_DIR = "/content"  # presumably the Colab working directory — confirm
DATASETS_DIR = "/content/drive/MyDrive/thesis_datasets"
FASTTEXT_NAME = "araneum_none_fasttextcbow_300_5_2018"

# Pre-trained FastText vectors: downloaded archive and the model file loaded later.
vector_archive_path = f"{WORK_DIR}/{FASTTEXT_NAME}.tgz"
vector_model_path = f"{WORK_DIR}/{FASTTEXT_NAME}.model"

# Classifier and corpus files kept in the datasets directory.
model_path = f"{DATASETS_DIR}/obscene_detection.model"
obscene_data_path = f"{DATASETS_DIR}/obscene_preprocessed_corpus.txt"
not_obscene_data_path = f"{DATASETS_DIR}/2ch_corpus_preprocessed.txt"
corpus_path = f"{DATASETS_DIR}/obscene_data.csv"

In [74]:
# Unpack the FastText archive into the current working directory.
# The context manager closes the archive handle even if extraction fails
# (the original left the file open).
# NOTE(review): extractall() trusts the member paths stored in the archive, and
# this archive comes from a remote download — only use it with a trusted source.
with tarfile.open(vector_archive_path, "r") as fasttext_file:
    fasttext_file.extractall()

## Подготовка корпуса слов для обучения

In [9]:
# with open(obscene_data_path, "r") as obscene_data_file:
#   obscene_data = obscene_data_file.readlines()

In [10]:
# obscene_data[:20]

['6ля\n',
 '6ляд\n',
 'fuck\n',
 'pizd\n',
 'pizda\n',
 'pizdah\n',
 'pizdakh\n',
 'pizdam\n',
 'pizdami\n',
 'pizdax\n',
 'pizde\n',
 'pizdoi\n',
 'pizdoj\n',
 'pizdoy\n',
 'pizdu\n',
 'pizdy\n',
 'scheise\n',
 'scheisse\n',
 'shit\n',
 'suck\n']

In [11]:
# len(obscene_data)

7352

In [39]:
# with open(not_obscene_data_path, "r") as not_obscene_data_file:
#   not_obscene_data = not_obscene_data_file.readlines()

In [40]:
# not_obscene_data[:20]

['приближённый\n',
 'хаскеле-то\n',
 'лье\n',
 'плейофф\n',
 'привязывать\n',
 'атб\n',
 '3128\n',
 'комуфляж\n',
 'голова-туловище\n',
 'чикаго-кун\n',
 'уберняша\n',
 'редактироваться\n',
 'погаснуть\n',
 'ньюфажига\n',
 'лолмимо\n',
 'aodv\n',
 'efkz@yandex\n',
 '60кт\n',
 'military\n',
 'многопользовательский\n']

In [41]:
# len(not_obscene_data)

114660

In [42]:
# def corpus_preprocess(words: list) -> list:
    
#     words = [word.strip('\n').strip(punct) for word in words]

#     return words

In [43]:
# obscene_data = corpus_preprocess(obscene_data)
# obscene_data[:10]

['хyе',
 'вымандошиваться',
 'поебется',
 'пизdатого',
 'хуёвый',
 'коблуха',
 'уебское',
 'клиторман',
 'малоебучая',
 'выпиздит']

In [44]:
# not_obscene_data = corpus_preprocess(not_obscene_data)
# not_obscene_data[:10]

['приближённый',
 'хаскеле-то',
 'лье',
 'плейофф',
 'привязывать',
 'атб',
 '3128',
 'комуфляж',
 'голова-туловище',
 'чикаго-кун']

In [45]:
# obscene_data = list(set(obscene_data))
# not_obscene_data = list(set(not_obscene_data))

In [46]:
# obscene_df = pd.DataFrame(data = obscene_data, columns = ['words'])
# not_obscene_df = pd.DataFrame(data = not_obscene_data, columns = ['words'])

In [55]:
# obscene_df['labels'] = [1 for _ in range(len(obscene_df))]
# not_obscene_df['labels'] = [0 for _ in range(len(not_obscene_df))]

In [56]:
# obscene_df.head()

Unnamed: 0,words,labels
0,хyе,1
1,вымандошиваться,1
2,поебется,1
3,пизdатого,1
4,хуёвый,1


In [58]:
# not_obscene_df.head()

Unnamed: 0,words,labels
0,такое?клон,0
1,бдсм'щий,0
2,445771,0
3,сюжеты/приём,0
4,заутра,0


In [59]:
# corpus = pd.concat([obscene_df, not_obscene_df], axis=0)

In [60]:
# corpus.head()

Unnamed: 0,words,labels
0,хyе,1
1,вымандошиваться,1
2,поебется,1
3,пизdатого,1
4,хуёвый,1


In [61]:
# len(corpus)

121706

In [62]:
# len(corpus.labels.unique())

2

In [63]:
# corpus.labels.unique()

array([1, 0])

In [64]:
# corpus = corpus.sample(frac=1)

In [65]:
# corpus.head()

Unnamed: 0,words,labels
24647,снький,0
1680,хуячиться,1
55719,негейт,0
88143,парсер,0
92740,прекрасном:3,0


In [68]:
# corpus.to_csv(corpus_path)

In [69]:
# len(corpus)

121706

## Бейзлайн

In [75]:
# Load the pre-trained FastText vectors stored at vector_model_path
# (presumably the file produced by unpacking the downloaded archive — confirm).
fasttext = gensim.models.KeyedVectors.load(vector_model_path)

In [93]:
# Read the labelled word corpus from CSV and keep only the rows without missing values.
corpus = pd.read_csv(corpus_path).dropna()

In [94]:
# Number of corpus rows that remain after rows with missing values were dropped.
len(corpus)

121705

In [77]:
def get_embeddings(words: list, vector_model=None):
    """Stack the embedding of every word into one 2-D float array.

    Args:
        words: Words to embed; each one is looked up as ``vector_model[word]``.
        vector_model: Mapping-like model used for the lookups. When omitted,
            the notebook-level ``fasttext`` model is used. It is resolved at
            call time, so reloading ``fasttext`` does not require redefining
            this function.

    Returns:
        A ``numpy`` array of shape ``(len(words), dim)`` whose i-th row is the
        vector of ``words[i]``. ``dim`` is ``vector_model.vector_size`` when the
        model exposes that attribute and 300 otherwise.

    Raises:
        Whatever ``vector_model[word]`` raises for a word it cannot embed.
    """
    if vector_model is None:
        # Late binding: a default of ``vector_model=fasttext`` would freeze
        # whichever object ``fasttext`` pointed to when this cell was run.
        vector_model = fasttext
    # Take the width from the model instead of hard-coding it; 300 is kept as
    # the fallback for models that do not expose ``vector_size``.
    dim = getattr(vector_model, "vector_size", 300)
    vectors = np.zeros((len(words), dim))
    for i, word in enumerate(words):
        vectors[i] = vector_model[word]
    return vectors

In [97]:
# Feature matrix: one embedding row per corpus word, in corpus order.
X = get_embeddings(list(corpus["words"]))
# Target vector: the label stored alongside each word.
y = corpus["labels"].values

In [98]:
# Sanity check: the feature matrix and the label vector should have the same number of rows.
X.shape, y.shape

((121705, 300), (121705,))

In [104]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline classifier: logistic regression with scikit-learn's default settings.
# NOTE(review): the split is not stratified and no class weighting is used,
# although the positive class is rare — worth revisiting after the baseline.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [105]:
# Predict labels for the held-out feature rows.
y_pred = logreg.predict(X_test)

In [108]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     22826
           1       0.58      0.14      0.23      1515

    accuracy                           0.94     24341
   macro avg       0.76      0.57      0.60     24341
weighted avg       0.92      0.94      0.92     24341



In [110]:
# F1 with scikit-learn's default settings, i.e. for the positive class (label 1) only.
f1_score(y_test, y_pred)

0.2252922422954304