In [1]:
import gensim
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

Чтобы использовать модель, нужно скачать [архив](https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz) и распаковать его.

In [2]:
# Tab-separated, header-less word table; the column names are supplied explicitly.
data = pd.read_csv(
    'better_words.csv',
    sep='\t',
    encoding='utf-8',
    names=['index', 'word', 'info', 'modified'],
)

In [3]:
# Load the pre-trained word vectors from the local .model file as gensim KeyedVectors.
# `model[word]` is used below to look up the embedding of a single word.
# NOTE(review): presumably 300-dimensional fastText CBOW vectors, judging by the file name — confirm.
model = gensim.models.KeyedVectors.load("araneum_none_fasttextcbow_300_5_2018.model")

In [4]:
# Word-form dictionary (used below as the negative class, label 0).
# The encoding is now explicit so decoding no longer depends on the platform locale;
# NOTE(review): assumes forms.txt is UTF-8 like the other inputs of this notebook — confirm.
with open('forms.txt', encoding='utf-8') as file:
    # Strip quotes, guillemets, parentheses and the trailing newline from each entry.
    forms = [line.strip('\"»«\n)(') for line in file]
# Lower-case and drop entries that became empty after stripping.
forms = [word.lower() for word in forms if word]

# Anglicism dictionary (used below as the positive class, label 1).
with open('slovar_edited.csv', encoding='utf-8') as file:
    slovar = [line.strip('\"»«\n)(') for line in file]
# Lower-case and keep only entries longer than one character.
slovar = [word.lower() for word in slovar if len(word) > 1]


Данные очень несбалансированные: словоформ на два порядка больше, чем англицизмов, поэтому из словаря словоформ возьмём случайную подвыборку.

In [5]:
# Number of entries in the word-form list vs. the anglicism list (shows the class imbalance).
len(forms), len(slovar)

(2334516, 16107)

Также отдельно отложим данные для валидации.

In [6]:
# Fixed seed: without it every run draws different rows, so the reported scores
# cannot be reproduced after Restart & Run All.
SEED = 42
N_FORMS_TRAIN, N_FORMS_VALID = 50_000, 10_000
N_SLOVAR_VALID = 3_000

# Negative class (label 0): random subsample of the word-form list, split into
# disjoint train / validation parts (first N_FORMS_TRAIN rows vs. last N_FORMS_VALID rows).
forms = pd.DataFrame({'word': forms, 'label': 0}).sample(N_FORMS_TRAIN + N_FORMS_VALID, random_state=SEED)
forms_train = forms.head(N_FORMS_TRAIN)
forms_valid = forms.tail(N_FORMS_VALID)

# Positive class (label 1): shuffle the whole anglicism list, hold out the last
# N_SLOVAR_VALID rows for validation and train on everything before them.
# Slicing relative to the end keeps the two parts disjoint and exhaustive for any
# dictionary size (a hard-coded head(13107) is only correct for exactly 16107 rows).
slovar = pd.DataFrame({'word': slovar, 'label': 1}).sample(frac=1, random_state=SEED)
slovar_train = slovar.iloc[:-N_SLOVAR_VALID]
slovar_valid = slovar.tail(N_SLOVAR_VALID)

training = pd.concat([forms_train, slovar_train])
validation = pd.concat([forms_valid, slovar_valid])

In [7]:
# Look up the embedding vector of every word and store it in a 'model' column.
training['model'] = training['word'].apply(model.__getitem__)
validation['model'] = validation['word'].apply(model.__getitem__)

In [8]:
# Stack the per-word vectors row-wise into one 2-D feature matrix.
X = np.vstack(training['model'].tolist())
# Class labels, in the same row order as X.
y = training['label'].values

In [9]:
# Sanity check: (number of training words, embedding dimension).
X.shape

(63107, 300)

In [10]:
# Hold out 33% of the training pool as an internal test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [11]:
# Logistic regression on the embedding features; fit() returns the fitted estimator itself.
clf = LogisticRegression(random_state=0)
clf = clf.fit(X_train, y_train)

In [19]:
# Spot-check: draw 100 rows from `data` and print every `modified` word that the
# classifier assigns to class 1. The seed keeps the printed examples stable
# across Restart & Run All (an unseeded sample shows different words each run).
for word in data.sample(100, random_state=42).modified.values:
    # predict() expects a 2-D array, so the single vector is reshaped to (1, n_features).
    res = clf.predict(model[word].reshape(1, -1))
    if res[0] == 1:
        print(f"{word}: {res}")

вэлентайн: [1]
федаи: [1]
вартс: [1]
логфайла: [1]
аутнау: [1]
фаел: [1]
эдв: [1]
анрексист: [1]


### F1 Score

In [20]:
# F1 on the internal hold-out split produced by train_test_split.
y_pred = clf.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.7820669104699988

In [21]:
# Preview the first few positive-class (label == 1) validation rows.
validation[validation['label'] == 1].head()

Unnamed: 0,word,label,model,prediction
5367,каучсёрфинг,1,"[0.010669713, -0.0027955084, -0.007030979, 0.0...",1
1548,блудхаунд,1,"[0.022531068, -0.0035569028, 0.009551105, -0.0...",0
14452,футер,1,"[-0.02691076, -0.09293588, 0.06733015, -0.0311...",1
2414,виггер,1,"[-0.036411233, -0.006401194, 0.02648523, -0.00...",1
16038,юнайт,1,"[-0.015594717, -0.0003088545, -0.0026805222, -...",1


In [22]:
def predict_word(word):
    """Return the class label that `clf` predicts for a single word.

    The word is embedded with the global `model`, reshaped into a one-row
    2-D array (the shape `clf.predict` is called with), and the single
    predicted label is returned.
    """
    features = model[word].reshape(1, -1)
    labels = clf.predict(features)
    return labels[0]

In [23]:
# Predict all validation rows in one batched call, reusing the embeddings already
# stored in validation['model'], instead of re-embedding every word and calling
# clf.predict once per row.
validation['prediction'] = clf.predict(np.vstack(validation['model'].values))

In [24]:
# tp = len(validation.loc[(validation.label == 1) & (validation.prediction == 1)])
# fp = len(validation.loc[(validation.label == 0) & (validation.prediction == 1)])
# tn = len(validation.loc[(validation.label == 0) & (validation.prediction == 0)])
# fn = len(validation.loc[(validation.label == 1) & (validation.prediction == 0)])

In [25]:
# tp, fp, tn, fn

In [26]:
# precision = tp / (tp + fp)
# precision

In [27]:
# NOTE: recall is tp / (tp + fn); tn / (tn + fp) is specificity, not recall.
# recall = tp / (tp + fn)
# recall

In [28]:
# f1 = 2 * (precision * recall) / (precision + recall)
# f1

In [29]:
# F1 on the separately held-out validation set (true labels vs. stored predictions).
f1_score(validation['label'].values, validation['prediction'].values)

0.7950454809367138