In [172]:
import gensim
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

Чтобы использовать модель, нужно скачать [архив](https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz) и распаковать его.

In [173]:
data = pd.read_csv('better_words.csv', encoding='utf-8', sep='\t', names=['index', 'word', 'info', 'modified'])

In [174]:
model = gensim.models.KeyedVectors.load("araneum_none_fasttextcbow_300_5_2018.model")

In [175]:
with open('forms.txt') as file: # словарь словоформ
    forms = file.readlines()
forms = [word.strip('\"»«\n)(') for word in forms]
forms = [word.lower() for word in forms if word]

with open('slovar_edited.csv', encoding='utf-8') as file: # словарь англицизмов
    slovar = file.readlines()
slovar = [word.strip('\"»«\n)(') for word in slovar]
slovar = [word.lower() for word in slovar if len(word) > 1]


Данные очень несбалансированные, поэтому сделаем выборку.

In [176]:
len(forms), len(slovar)

(2334516, 16107)

Также отдельно отложим данные для валидации.

In [177]:
forms = pd.DataFrame({'word': forms, 'label': 0}).sample(60000)
forms_train = forms.head(50000)
forms_valid = forms.tail(10000)
slovar = pd.DataFrame({'word': slovar, 'label': 1}).sample(frac=1)
slovar_train = slovar.head(13107)
slovar_valid = slovar.tail(3000)
training = pd.concat([forms_train, slovar_train])
validation = pd.concat([forms_valid, slovar_valid]) 

In [178]:
training['model'] = training.word.apply(lambda x: model[x])
validation['model'] = validation.word.apply(lambda x: model[x])

In [179]:
X = np.vstack(training.model.values)
y = training['label'].values

In [180]:
X.shape

(63107, 300)

In [181]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [182]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [187]:
for word in data.sample(100).modified.values:
    res = clf.predict(model[word].reshape(1, -1))
    if res[0] == 1:
        print(f"{word}: {res}")

ипере: [1]
гуанкси: [1]
лавэр: [1]
дрэма: [1]
укулель: [1]
голдену: [1]


### F1 Score

In [188]:
y_pred = clf.predict(X_test)
f1_score(y_test, y_pred)

0.7736287785528656

In [189]:
i = 0
for word in neo.loc[neo.label == 1].sample(1000).word.values:
    pred = clf.predict(model[word].reshape(1, -1))
    #print(word, pred)
    if pred[0] == 1:
        i += 1
print(i/1000)

0.678


In [190]:
validation.loc[validation.label == 1].head()

Unnamed: 0,word,label,model
8149,неопанк,1,"[0.0037404108, -0.008987354, 0.045442805, 0.01..."
6681,легисигн,1,"[-0.0014465022, -0.01831446, -0.006882749, 0.0..."
10783,сейвер,1,"[-0.013358746, -0.0002563931, 0.021314256, -0...."
14421,фудстилист,1,"[0.009526102, -0.015211814, 0.008865715, 0.030..."
14487,фэйслифтинг,1,"[0.017608402, 0.00046651773, 0.010723985, -0.0..."


In [191]:
def predict_word(word):
    return clf.predict(model[word].reshape(1, -1))[0]

In [192]:
validation['prediction'] = validation.word.apply(predict_word)

In [193]:
# tp = len(validation.loc[(validation.label == 1) & (validation.prediction == 1)])
# fp = len(validation.loc[(validation.label == 0) & (validation.prediction == 1)])
# tn = len(validation.loc[(validation.label == 0) & (validation.prediction == 0)])
# fn = len(validation.loc[(validation.label == 1) & (validation.prediction == 0)])

In [194]:
# tp, fp, tn, fn

In [195]:
# precision = tp / (tp + fp)
# precision

In [196]:
# recall = tn / (tn + fp)
# recall

In [197]:
# f1 = 2 * (precision * recall) / (precision + recall)
# f1

In [198]:
f1_score(validation.label.values, validation.prediction.values)

0.790896712701809