In [1]:
import gensim
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

Чтобы использовать модель, нужно скачать [архив](https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz) и распаковать его.

In [2]:
data = pd.read_csv('better_words_2.csv', encoding='utf-8', sep='\t', names=['word', 'info', 'modified'])

In [3]:
model = gensim.models.KeyedVectors.load("araneum_none_fasttextcbow_300_5_2018.model")

In [4]:
model.most_similar('раскритиковать')

[('раскритиковывать', 0.6009185314178467),
 ('критиковать', 0.4883984327316284),
 ('критика', 0.4764508605003357),
 ('покритиковать', 0.47199517488479614),
 ('высказывание', 0.46378931403160095),
 ('интерпретировать', 0.45879337191581726),
 ('критик', 0.45685070753097534),
 ('самокритика', 0.4509151577949524),
 ('ритик', 0.45008182525634766),
 ('телекритика', 0.4466124176979065)]

In [5]:
# with open('forms.txt') as file: # словарь словоформ
#     forms = file.readlines()
# forms = [word.strip('\"»«\n)(') for word in forms]
# forms = [word.lower() for word in forms if word]

# forms = pd.Series(forms)

In [6]:
# forms.to_csv('forms.csv', encoding='utf-8', index=False)

In [5]:
with open('forms.txt') as file: # словарь словоформ
    forms = file.readlines()
forms = [word.strip('\"»«\n)(') for word in forms]
forms = [word.lower() for word in forms if word]

with open('slovar_edited.csv', encoding='utf-8') as file: # словарь англицизмов
    slovar = file.readlines()
slovar = [word.strip('\"»«\n)(') for word in slovar]
slovar = [word.lower() for word in slovar if len(word) > 1]


Данные очень несбалансированные, поэтому сделаем выборку.

In [6]:
len(forms), len(slovar)

(2334516, 16107)

Также отдельно отложим данные для валидации.

In [7]:
forms = pd.DataFrame({'word': forms, 'label': 0}).sample(60000)
forms_train = forms.head(50000)
forms_valid = forms.tail(10000)
slovar = pd.DataFrame({'word': slovar, 'label': 1}).sample(frac=1)
slovar_train = slovar.head(13107)
slovar_valid = slovar.tail(3000)
training = pd.concat([forms_train, slovar_train])
validation = pd.concat([forms_valid, slovar_valid]) 

In [8]:
training['model'] = training.word.apply(lambda x: model[x])
validation['model'] = validation.word.apply(lambda x: model[x])

In [9]:
X = np.vstack(training.model.values)
y = training['label'].values

In [10]:
X.shape

(63107, 300)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
clf_1 = LogisticRegression(random_state=0, n_jobs=-1, C=4)

In [14]:
clf_1.fit(X_train, y_train)

LogisticRegression(C=4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=0,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [17]:
def sample_data(num, clf):
    for word in data.sample(num).modified.values:
        res = clf.predict(model[word].reshape(1, -1))
        if res[0] == 1:
            print(f"{word}: {res}")

In [18]:
sample_data(200, clf_1)

эксодарлинг: [1]
цемус: [1]
сигулдской: [1]
эбсолют: [1]
хурум: [1]
эвергрины: [1]
джонхерт: [1]
эмплом: [1]
бентаму: [1]
шаолин: [1]
пейджик: [1]
тьса: [1]
хьюзмолер: [1]
уитстон: [1]
хайвелд: [1]
дигов: [1]
нихон: [1]


### F1 Score

In [19]:
y_pred = clf_1.predict(X_test)
f1_score(y_test, y_pred)

0.8031142781736607

In [20]:
validation.loc[validation.label == 1].head()

Unnamed: 0,word,label,model
1449,бифуркация,1,"[0.06252903, -0.02508882, -0.023080893, -0.080..."
4696,имиджмейкинг,1,"[0.014999646, -0.044523843, 0.07029855, 0.0075..."
8132,необычный,1,"[-0.07833143, -0.10165572, 0.06670411, 0.00448..."
14272,фреймлайт,1,"[0.003905304, -0.026356818, -0.001231998, -0.0..."
3294,гросс,1,"[0.04398974, 0.0130719915, 0.016659278, 0.0120..."


In [21]:
def predict_word(word, classifier):
    return classifier.predict(model[word].reshape(1, -1))[0]

In [99]:
validation['prediction'] = validation.word.apply(predict_word, args=(clf_1,))

In [100]:
f1_score(validation.label.values, validation.prediction.values)

0.8132654983983418

In [78]:
from sklearn.linear_model import SGDClassifier

In [74]:
sgd = SGDClassifier(class_weight='balanced')

In [75]:
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [76]:
validation['prediction'] = validation.word.apply(predict_word, args=(sgd, ))

In [117]:
sample_data(200, sgd)

ритмовай: [1]
ринмо: [1]
нипи: [1]
свм: [1]
жут: [1]
симсоноф: [1]
фрайята: [1]
баретер: [1]
вайть: [1]
литчман: [1]
плэнинг: [1]
манеджемент: [1]
сифуэнтесом: [1]
еврохолуев: [1]
босовой: [1]
презентосал: [1]
фотоприкол: [1]
бобато: [1]
альхага: [1]
скайпику: [1]
синатрыча: [1]
диайвайщика: [1]
кенас: [1]
криптосистемой: [1]
алкобиологи: [1]
читлом: [1]
арнего: [1]
мегакризисом: [1]
камандуе: [1]
сиаму: [1]
пухкете: [1]
бимиш: [1]
логистиктрансэкспрес: [1]
пауэле: [1]


In [116]:
f1_score(validation.label.values, validation.prediction.values)

0.8470781893004116

In [81]:
from sklearn.neural_network import MLPClassifier

In [102]:
clf_2 = MLPClassifier(max_iter=300)

In [103]:
%%time

clf_2.fit(X_train, y_train)

Wall time: 2min 29s


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [104]:
validation['prediction'] = validation.word.apply(predict_word, args=(clf_2, ))

In [105]:
f1_score(validation.label.values, validation.prediction.values)

0.8519462859085501

In [118]:
sample_data(200, clf_2)

эйлатскую: [1]
сутинен: [1]
кать: [1]
сейачс: [1]
раджай: [1]
антияхвизм: [1]
афроамериканопопыми: [1]
офигительный: [1]
бодоне: [1]
унгард: [1]
спэшиэл: [1]
псевдочастники: [1]
фьюитьнулась: [1]
вэйдерам: [1]
чстота: [1]
тамуридис: [1]
неймингу: [1]
хиповата: [1]
линкбейтинга: [1]
фури: [1]
тейа: [1]
ганен: [1]
свинго: [1]
фэйрбразер: [1]
онтакт: [1]
нифъаль: [1]
сиэн: [1]
нол: [1]
шутерну: [1]
макгоуну: [1]
роканрол: [1]
амеркианцы: [1]
котманду: [1]
аруактар: [1]
мейдфорвардера: [1]
сильверкрос: [1]
френды: [1]
уолдману: [1]
максимаркет: [1]
фебруаре: [1]


In [106]:
from sklearn.svm import LinearSVC

In [107]:
clf_3 = LinearSVC(class_weight='balanced')

In [108]:
clf_3.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [109]:
validation['prediction'] = validation.word.apply(predict_word, args=(clf_3, ))

In [110]:
f1_score(validation.label.values, validation.prediction.values)

0.8470781893004116

In [119]:
sample_data(200, clf_3)

украиноцентричные: [1]
театър: [1]
гинчо: [1]
штекно: [1]
рамштиайнов: [1]
дзун: [1]
шлемоподобный: [1]
фамлией: [1]
паркують: [1]
берсеркоподобная: [1]
постопив: [1]
мэнамской: [1]
амадан: [1]
мицубиську: [1]
надувака: [1]
интелектуальными: [1]
меркаванем: [1]
экуной: [1]
бокены: [1]
уфал: [1]
евродэнсом: [1]
блоджы: [1]
рашнбизнес: [1]
тренхарда: [1]
авиатура: [1]
френджу: [1]
райн: [1]
пингарея: [1]
волтер: [1]
имьютабл: [1]
шиладжит: [1]
актерство: [1]
вэбсайта: [1]
бихевиористкие: [1]
гиперкапния: [1]
хелмеру: [1]
вильяфафила: [1]
пактеик: [1]


In [120]:
# from sklearn.svm import OneClassSVM

In [26]:
# index = []
# word = []

# for word in data.modified:
#     res = clf.predict(model[word].reshape(1, -1))
#     if res[0] == 1:
        

In [27]:
# with open('stand_angl.csv', 'w', encoding='utf-8') as file:
#     file.writelines('word\n')
#     for word in data.sample(frac=1).modified.values:
#         res = clf.predict(model[word].reshape(1, -1))
#         if res[0] == 1:
#             line = word + '\n'
#             file.writelines(line)

In [41]:
# with open('stand_angl_2.csv', 'w', encoding='utf-8') as file:
#     file.writelines('word\n')
#     for word in data.sample(frac=1).modified.values:
#         res = clf.predict(model[word].reshape(1, -1))
#         if res[0] == 1:
#             line = word + '\n'
#             file.writelines(line)