In [2]:
import pandas as pd
import numpy as np
from string import ascii_lowercase

In [3]:
deformed_1 = pd.read_csv('false_deforming.csv')[['words', 'values']]

In [4]:
deformed_1.columns = ['word', 'is_deformed']

In [5]:
deformed_1

Unnamed: 0,word,is_deformed
0,мнооооооооооогие,0
1,боооооожееееее,0
2,теплыевещички,0
3,грозитъ,0
4,доооожджииикиии,0
...,...,...
2201,янышевский,0
2202,дилнара,0
2203,роздруковках,0
2204,дрбрались,0


In [6]:
deformed_2 = pd.read_csv('train_1.csv')[['words', 'values']]

In [7]:
deformed_2.columns = ['word', 'is_deformed']

In [8]:
deformed = pd.concat([deformed_1, deformed_2])

In [9]:
deformed.head()

Unnamed: 0,word,is_deformed
0,мнооооооооооогие,0
1,боооооожееееее,0
2,теплыевещички,0
3,грозитъ,0
4,доооожджииикиии,0


In [10]:
expressive_1 = pd.read_csv('expressive_words_train.txt', names=['word'])
expressive_1['is_expressive'] = 1

In [11]:
expressive_2 = pd.read_csv('zaliznyak.txt', names=['word'])
expressive_2['is_expressive'] = 0

In [12]:
expressive = pd.concat([expressive_1, expressive_2])

In [13]:
expressive.head()

Unnamed: 0,word,is_expressive
0,справулечек,1
1,любопытнеько,1
2,мегадоклад,1
3,хняшечка,1
4,особнячково,1


In [14]:
loanwords_1 = pd.read_csv('slovar_edited.csv')

In [15]:
loanwords_1.columns = ['word']

In [16]:
loanwords_1.word = loanwords_1.word.apply(lambda x: x.lower().lstrip('(').rstrip(')').rstrip('»').lstrip('«').strip(' '))

In [17]:
loanwords_1 = loanwords_1.loc[loanwords_1.word.str.len() > 2]

In [18]:
loanwords_1 = loanwords_1.drop_duplicates()

In [19]:
loanwords_1 = loanwords_1.loc[~loanwords_1.word.apply(lambda x: bool(set(x).intersection(set(ascii_lowercase))))]

In [20]:
loanwords_1['is_loanword'] = 1

In [21]:
loanwords_2 = pd.read_csv('forms.csv').sample(10_000) # здесь нужно подумать над числом

In [22]:
loanwords_2.columns = ['word']

In [23]:
loanwords_2['is_loanword'] = 0

In [24]:
loanwords = pd.concat([loanwords_1, loanwords_2])

In [25]:
data = pd.concat([loanwords, expressive, deformed])

In [32]:
data.fillna(0, inplace=True)
data.is_loanword = data.is_loanword.astype(int)
data.is_expressive = data.is_expressive.astype(int)
data.is_deformed = data.is_deformed.astype(int)

In [33]:
data

Unnamed: 0,word,is_loanword,is_expressive,is_deformed
0,бонг,1,0,0
18,дженерал,1,0,0
19,колл,1,0,0
21,лонг,1,0,0
22,медведей,1,0,0
...,...,...,...,...
1031,пашпорт,0,0,1
1032,нуступало,0,0,1
1033,вирусид,0,0,1
1034,розшарив,0,0,1


In [35]:
from sklearn.metrics import average_precision_score, hamming_loss, f1_score
from sklearn.linear_model import SGDClassifier
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
import gensim

In [36]:
model = gensim.models.KeyedVectors.load("/mnt/f/data/models/fasttext/araneum_none_fasttextcbow_300_5_2018.model")

In [37]:
X = data['word']
y = data.iloc[:, 1:]

In [38]:
%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


In [39]:
sgd = SGDClassifier()

In [40]:
clf = MultiOutputClassifier(sgd)

In [41]:
X_train_vec = X_train.apply(lambda x: model[x])
X_test_vec = X_test.apply(lambda x: model[x])

In [42]:
X_train_vec = np.vstack(X_train_vec.values)
X_test_vec = np.vstack(X_test_vec.values)

In [43]:
%time
clf.fit(X_train_vec, y_train)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.58 µs


MultiOutputClassifier(estimator=SGDClassifier())

In [44]:
preds = clf.predict(X_test_vec)

In [45]:
average_precision_score(y_test, preds)

0.3357394003961723

In [46]:
hamming_loss(y_test, preds)

0.05670603875850079

In [47]:
f1_score(y_test.values[:, 0], preds[:, 0])

0.8613065326633166

In [48]:
f1_score(y_test.values[:, 1], preds[:, 1])

0.25000000000000006

In [49]:
f1_score(y_test.values[:, 2], preds[:, 2])

0.0