In [1]:
import numpy as np
import pandas
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.grid_search import GridSearchCV
from scipy import misc

from imblearn.over_sampling import RandomOverSampler
import codecs



# Загрузка данных

In [264]:
df = pandas.read_csv('.\\rlc_featured_mod9.csv', sep='\t')
df.head()

Unnamed: 0,sentid,sent,start,finish,error,correction,tag,levenstein,lemmaequal,grammequal,...,predyduschej_c,ob'jasnjat'_c,tschatel'no_c,vazhny_c,doktor_c,zhukova_c,privodjat_c,chem_c,mihail_c,vstretila_c
0,9227,"Например, в « Метеле» рассказчик горорит, что ...",5,5,горорит,говорит,ortho,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,57750,"Площадь России составляет 17,1 млн. км 2. Росс...",14,14,"Польшой,","Польшей,",altern infl,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,58941,"Но я помню, как она заботится обо мне всю мою ...",6,6,заботится,заботилась,tense,3,1,0,...,0,0,0,0,0,0,0,0,0,0
3,52313,Я гаvарou та русскy с мyе мама и с мyеме дрouг...,11,11,дрouгеме,другими,graph orpho,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10278,В понедельник 10 мая мы брали интервью у&l,19,19,современной,в современной,lex del,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [146]:
tag_freq = {}
for tag in order:
    tag_freq[tag] = df[tag].value_counts()[1]
for tag in sorted(tag_freq, key=tag_freq.get, reverse=True):
    print(tag, '(встретился ' + str(tag_freq[tag]) + ' раз)')

ortho (встретился 4277 раз)
lex (встретился 4075 раз)
constr (встретился 2155 раз)
del (встретился 1842 раз)
gov (встретился 1412 раз)
subst (встретился 1146 раз)
insert (встретился 1104 раз)
agrcase (встретился 986 раз)
transfer (встретился 874 раз)
asp (встретился 783 раз)
conj (встретился 713 раз)
wo (встретился 649 раз)
syntax (встретился 638 раз)
not-clear (встретился 624 раз)
ref (встретился 493 раз)
space (встретился 486 раз)
agrgender (встретился 478 раз)
agrnum (встретился 457 раз)
num (встретился 449 раз)
infl (встретился 369 раз)
morph (встретился 309 раз)
prep (встретился 301 раз)
misspell (встретился 287 раз)
tense (встретился 274 раз)
deriv (встретился 265 раз)
hyphen (встретился 212 раз)
graph (встретился 174 раз)
par (встретился 169 раз)
brev (встретился 141 раз)
idiom (встретился 138 раз)
refl (встретился 134 раз)
transp (встретился 94 раз)
coord (встретился 93 раз)
cs (встретился 89 раз)
aux (встретился 86 раз)
typo (встретился 74 раз)
phon (встретился 66 раз)
punc (в

In [293]:
lst = 'brev	deriv	morph	space	conj	num	hyphen	refl	ortho	gov_w_prep	insert	lex	not-clear	lex_wo_conj	infl	asp	ortho_wo_others	punc	ref	constr	agrnum	syntax	wo	del	agrgender	syntax_wo_others	gov	subst	misspell	transfer	agrcase	graph	par	prep	tense	agrgerund	agrpers	aux	coord	cs	idiom	neg	passive	phon	translit	transp	typo'.split('\t')

for tag in lst:
    print(df[tag].value_counts()[1])

141
265
309
486
713
449
212
134
4277
1682
1104
4075
624
3727
369
783
2364
63
493
2155
457
638
649
1842
478
232
1412
1146
287
874
986
174
169
301
274
11
60
86
93
89
138
39
55
66
54
94
74


# Вычисление матрицы условных вероятностей для тегов по формуле Байеса

In [241]:
tagslist = 'prep	phon	graph	hyphen	space	ortho	translit	misspell	deriv	infl	num	morph	asp	passive	agrnum	agrcase	agrgender	agrpers	agrgerund	transfer	gov	ref	conj	wo	neg	aux	brev	syntax	constr	lex	cs	par	idiom	coord	refl	insert	tense	not-clear	transp	subst	del	punc	typo'.split('\t')

In [242]:
tagslist += new_tags

In [6]:
tagmatrix = df[tagslist]

In [7]:
condmatr = np.zeros(shape=(len(tagslist),len(tagslist)))
for i in range(len(tagslist)):
    for j in range(len(tagslist)):
        try:
            condmatr[i][j] = round(tagmatrix.groupby(tagslist[i])[tagslist[j]].apply(lambda g: g.value_counts()/len(g))[1][1], 3)
        except KeyError:
            condmatr[i][j] = 0

In [39]:
condmatr[0][3]

0.0030000000000000001

In [147]:
a = codecs.open('cond_prib_3.csv', 'w', 'cp1251')
a.write('-----;' + ';'.join(tagslist) + '\r\n')
for i in range(len(tagslist)):
    a.write(tagslist[i] + ';' + ';'.join([str(j).replace('.', ',') for j in condmatr[i]]))
    a.write('\r\n')
a.close()

Вероятность при условии ничего нельзя посчитать, так как ничего не бывает :) Количество тегов каждого вида.

# Выбор столбцов с тегами

In [256]:
labels = ['levenstein', 'lemmaequal', 'grammequal', 'stemequal', 'lenorig', 'lencorr', 'bastard']
poslist = 'A ADVPRO PART CONJ ANUM ADV SPRO S COM V APRO INTJ PR NUM'.split(' ')
pos1 = [i + '1' for i in poslist]
pos2 = [i + '2' for i in poslist]
pos_j = 'PR_j1	ADV_j1	INTJ_j1	PRO_j1	PART_j1	V_j1	A_j1	S_j1	NUM_j1	CONJ_j1	PR_j2	ADV_j2	INTJ_j2	PRO_j2	PART_j2	V_j2	A_j2	S_j2	NUM_j2	CONJ_j2'.split('\t')
new_labels = 'hashyphen	hasgraph	hasneg	hasspace	hasaux	samebrev	samegender	samenum	samepers'.split('\t')
very_new_labels = 'haspassive\thascyrillic'.split('\t')
key_words = codecs.open('words_columns.txt', 'r', 'utf-8').read().split('\t')

labels = labels + pos_j + new_labels + ['levenstein_consonants'] + very_new_labels + key_words
labs = labels[:]

In [240]:
order = 'transfer	conj	syntax	neg	phon	aux	prep	del	constr	subst	idiom	ref	insert	transp	not-clear	passive	lex	misspell	graph	hyphen	space	translit	tense	infl	deriv	refl	typo	agrpers	par	wo	ortho	num	cs	agrgerund	morph	gov	asp	agrgender	agrcase	brev	agrnum	coord	punc'.split('\t')

ord_cut = 'transfer	conj	syntax	del	neg	prep	constr	subst	idiom	ref	transp	not-clear	aux	phon	passive	lex	graph	misspell	translit	infl	refl	deriv	par	hyphen	wo'.split('\t')

new_tags = 'gov_w_prep\tlex_wo_conj\tortho_wo_others\tsyntax_wo_others'.split('\t')
order = order + new_tags

In [259]:
print(len(labs))

1274


# Сравнение эффективности оверсемплинга и ограничения выборки для тега lex

## Оверсемплинг

In [66]:
ham = lex_df[lex_df.lex == 1].drop('lex', axis=1)
spam = lex_df[lex_df.lex == 0].drop('lex', axis=1)
n = int(len(ham) * 0.3)
print('hey there we test on ', n, 'of ', len(ham), 'while spam length is', len(spam))
hams_test = ham.sample(n)
spams_test = spam.sample(n)
ham_train = [i for index, i in ham.iterrows() if index not in hams_test.index.values]
spam_train = [i for index, i in spam.iterrows() if index not in spams_test.index.values]
test = pandas.concat([hams_test, spams_test])
train = ham_train + spam_train
test_answers = [1 for i in range(len(hams_test))] + [0 for i in range(len(spams_test))]
train_answers = [1 for i in range(len(ham_train))] + [0 for i in range(len(spam_train))]
print(len(train), len(train_answers), len(test), len(test_answers))

hey there we test on  1222 of  4074 while spam length is 14372
16002 16002 2444 2444


In [67]:
ros = RandomOverSampler()

data_resampled, answ_resampled = ros.fit_sample(train, train_answers)

In [68]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(data_resampled), np.array(answ_resampled))
print(gs_clf.best_estimator_)
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=-1, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
             precision    recall  f1-score   support

          0       0.74      0.76      0.75      1222
          1       0.76      0.74      0.75      1222

avg / total       0.75      0.75      0.75      2444

[[931 291]
 [321 901]]


## Ограничение выборки

In [14]:
lex_df.index.values

array([    0,     1,     2, ..., 18542, 18543, 18544], dtype=int64)

In [15]:
ham = lex_df[lex_df.lex == 1].drop('lex', axis=1)
spam = lex_df[lex_df.lex == 0].drop('lex', axis=1)
n = int(len(ham) * 0.7)
print('hey there we train on ', n, 'of ', len(ham), 'while spam length is', len(spam))
hams = ham.sample(n)
spams = spam.sample(n)

hey there we train on  2867 of  4096 while spam length is 14449


In [16]:
ham_test = [i for index, i in ham.iterrows() if index not in hams.index.values]
spam_test = [i for index, i in spam.iterrows() if index not in spams.index.values][:len(ham_test)]
print(len(hams), len(spams), len(ham_test), len(spam_test))

2867 2867 1229 1229


In [17]:
train = pandas.concat([hams, spams])
test = ham_test + spam_test
train_answers = [1 for i in range(len(hams))] + [0 for i in range(len(spams))]
test_answers = [1 for i in range(len(ham_test))] + [0 for i in range(len(spam_test))]
print(len(train), len(train_answers), len(test), len(test_answers))

5734 5734 2458 2458


In [18]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [19]:
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.75      0.75      0.75      1229
          1       0.75      0.75      0.75      1229

avg / total       0.75      0.75      0.75      2458

[[925 304]
 [308 921]]


если порезать, результаты чуть хуже, чем при оверсемплинге.

# Функции для разделения данных на тестовую и обучающую выборку разными способами

In [16]:
def one_tag_db_cut(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = df[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    ham = lex_df[lex_df[tagname] == 1].drop(tagname, axis=1)
    spam = lex_df[lex_df[tagname] == 0].drop(tagname, axis=1)
    n = int(len(ham) * 0.7)
    hams = ham.sample(n)
    spams = spam.sample(n)
    ham_test = [i for index, i in ham.iterrows() if index not in hams.index.values]
    spam_test = [i for index, i in spam.iterrows() if index not in spams.index.values]
    train = pandas.concat([hams, spams])
    test = ham_test + spam_test
    train_answers = [1 for i in range(len(hams))] + [0 for i in range(len(spams))]
    test_answers = [1 for i in range(len(ham_test))] + [0 for i in range(len(spam_test))]
    return train, test, train_answers, test_answers

In [17]:
ros = RandomOverSampler()
def balanced_selection_resampled(tagname, features_to_add=[]):
    labs = labels[:]
    labs.append(tagname)
    lex_df = df[labs + features_to_add]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    data_train, test, answ_train, answ_test = train_test_split(lex_df.drop(tagname, axis=1), lex_df[tagname], test_size=0.3)
    data_resampled, answ_resampled = ros.fit_sample(data_train, answ_train)
    return data_resampled, test, answ_resampled, answ_test

In [5]:
ros = RandomOverSampler()
def one_tag_db_resampled(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = df[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    
    ham = lex_df[lex_df[tagname] == 1].drop(tagname, axis=1)
    spam = lex_df[lex_df[tagname] == 0].drop(tagname, axis=1)
    n = int(len(ham) * 0.3)
    hams_test = ham.sample(n)
    spams_test = spam.sample(n)
    ham_train = [i for index, i in ham.iterrows() if index not in hams_test.index.values]
    spam_train = [i for index, i in spam.iterrows() if index not in spams_test.index.values]
    test = pandas.concat([hams_test, spams_test])
    train = ham_train + spam_train
    test_answers = [1 for i in range(len(hams_test))] + [0 for i in range(len(spams_test))]
    train_answers = [1 for i in range(len(ham_train))] + [0 for i in range(len(spam_train))]
    data_resampled, answ_resampled = ros.fit_sample(train, train_answers)
    return data_resampled, test, answ_resampled, test_answers

In [219]:
ros = RandomOverSampler()
labs = labels[:]
lex_df = df[labs + tagslist]
lex_df = lex_df.fillna('')
lex_df = pandas.get_dummies(lex_df)
data_train, data_test, answers_train_all, answers_test_all = train_test_split(lex_df.drop(tagslist, axis=1), 
                                                                              lex_df[tagslist], test_size=0.3)

def resampling_and_adding(tagname, data_train, data_test, answers_train_all, answers_test_all, prev_res_train, prev_res_test):
    answ_train = answers_train_all[tagname]
    answ_test = answers_test_all[tagname]
    for tag in prev_res_test:
        
        data_train[tag] = prev_res_train[tag]
        data_test[tag] = prev_res_test[tag]
    data_resampled, answ_resampled = ros.fit_sample(data_train, answ_train)
    return data_resampled, data_test, answ_resampled, answ_test, data_train

# С добавлением информации о других тегах в качестве источников

In [111]:
prev_res_train = {}
prev_res_test = {}
f_imp = {}
for tag in order:
    print()
    print(tag, max_by_rate[tag])
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    train, test, train_answers, test_answers, not_resampled_train = resampling_and_adding(tag, data_train, data_test, answers_train_all,
                                                                     answers_test_all, prev_res_train, prev_res_test)
    rate = df[tag].value_counts()[0] / df[tag].value_counts()[1]               
    clf = DecisionTreeClassifier(min_samples_split=max_by_rate[tag]['msp'], min_samples_leaf=max_by_rate[tag]['msl'])
    sample_weight = np.array([rate*max_by_rate[tag]['rate'] if i == 0 else 1 for i in train_answers])
    clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
    answ_pred = clf.predict(test)
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()
    prev_res_train[tag] = clf.predict(not_resampled_train)
    prev_res_test[tag] = answ_pred
    f_imp[tag] = clf.feature_importances_


transfer {'p': 0.26315789473684209, 'f': 0.03460207612456747, 'rate': 6, 'msl': 28, 'msp': 2}
________TRANSFER____________ occurs originally 874 times
             precision    recall  f1-score   support

          0       0.96      1.00      0.98      5436
          1       0.32      0.03      0.06       249

avg / total       0.93      0.95      0.94      5685

[[5419   17]
 [ 241    8]]


conj {'p': 0.80666666666666664, 'f': 0.69340974212034379, 'rate': 1, 'msl': 47, 'msp': 2}
________CONJ____________ occurs originally 713 times
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      5478
          1       0.81      0.53      0.64       207

avg / total       0.98      0.98      0.98      5685

[[5453   25]
 [  97  110]]


syntax {'p': 1.0, 'f': 0.018957345971563979, 'rate': 19, 'msl': 67, 'msp': 21}
________SYNTAX____________ occurs originally 638 times
             precision    recall  f1-score   support

          0       0.97      1

# Подбор параметров msl и msp

In [277]:
# prev_res_train = {}
# prev_res_test = {}
               
for tag in 'morph	gov	asp	agrgender	agrcase	brev	agrnum	coord	punc'.split('\t'):
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    max_by_rate[tag]['p'] = 0
    max_by_rate[tag]['f'] = 0
    train, test, train_answers, test_answers, not_resampled_train = resampling_and_adding_rf(tag, data_train, data_test, 
                                                                                            answers_train_all, answers_test_all,
                                                                                                prev_res_train, prev_res_test, 
                                                                                rem_feat[tag][max_by_rate[tag]['border']])
    rate = df[tag].value_counts()[0] / df[tag].value_counts()[1]
    sample_weight = np.array([rate*max_by_rate[tag]['rate'] if i == 0 else 1 for i in train_answers])      
    for msp in range(2, 22):
        for msl in range(1, 100):              
            clf = DecisionTreeClassifier(min_samples_split=msp, min_samples_leaf=msl)
            clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
            answ_pred = clf.predict(test)
            cm = confusion_matrix(test_answers, answ_pred)
            m_precision = cm[1][1]/(cm[1][1] + cm[0][1])
            m_recall = cm[1][1]/(cm[1][1] + cm[1][0])
            m_f_measure = 2*m_precision*m_recall/(m_precision + m_recall)
            if max_by_rate[tag]['p'] < 0.5:
                if m_precision > max_by_rate[tag]['p']:
                    max_by_rate[tag]['p'] = m_precision
                    max_by_rate[tag]['f'] = m_f_measure
                    max_by_rate[tag]['msp'] = msp
                    max_by_rate[tag]['msl'] = msl
            elif m_f_measure > max_by_rate[tag]['f']:
                    max_by_rate[tag]['p'] = m_precision
                    max_by_rate[tag]['f'] = m_f_measure
                    max_by_rate[tag]['msp'] = msp
                    max_by_rate[tag]['msl'] = msl
    
    print(tag, max_by_rate[tag])
    clf = DecisionTreeClassifier(min_samples_split=max_by_rate[tag]['msp'], min_samples_leaf=max_by_rate[tag]['msl'])
    clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
    answ_pred = clf.predict(test)    
    prev_res_train[tag] = clf.predict(not_resampled_train)
    prev_res_test[tag] = answ_pred
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()
    


________MORPH____________ occurs originally 309 times
morph {'border': 0.001, 'msl': 76, 'rate': 2, 'p': 0.66666666666666663, 'f': 0.1176470588235294, 'msp': 21}
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      5592
          1       0.67      0.06      0.12        93

avg / total       0.98      0.98      0.98      5685

[[5589    3]
 [  87    6]]


________GOV____________ occurs originally 1412 times




gov {'border': 0.005, 'msl': 37, 'rate': 19, 'p': 0.5, 'f': 0.009852216748768473, 'msp': 21}
             precision    recall  f1-score   support

          0       0.93      1.00      0.96      5283
          1       0.50      0.00      0.01       402

avg / total       0.90      0.93      0.90      5685

[[5281    2]
 [ 400    2]]


________ASP____________ occurs originally 783 times
asp {'border': -1, 'msl': 1, 'rate': 1, 'p': 0.57017543859649122, 'f': 0.35616438356164382, 'msp': 21}
             precision    recall  f1-score   support

          0       0.97      0.99      0.98      5434
          1       0.55      0.26      0.35       251

avg / total       0.95      0.96      0.95      5685

[[5381   53]
 [ 186   65]]


________AGRGENDER____________ occurs originally 478 times
agrgender {'border': -1, 'msl': 50, 'rate': 1, 'p': 0.5161290322580645, 'f': 0.1807909604519774, 'msp': 21}
             precision    recall  f1-score   support

          0       0.98      1.00      0.99  



brev {'border': 0, 'msl': 89, 'rate': 1, 'p': 0, 'f': 0, 'msp': 2}
             precision    recall  f1-score   support

          0       0.99      1.00      1.00      5643
          1       0.00      0.00      0.00        42

avg / total       0.99      0.99      0.99      5685

[[5641    2]
 [  42    0]]


________AGRNUM____________ occurs originally 457 times
agrnum {'border': 0.0001, 'msl': 45, 'rate': 1, 'p': 1.0, 'f': 0.015037593984962407, 'msp': 2}
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      5553
          1       1.00      0.01      0.02       132

avg / total       0.98      0.98      0.97      5685

[[5553    0]
 [ 131    1]]


________COORD____________ occurs originally 93 times
coord {'border': 0.0005, 'msl': 14, 'rate': 101, 'p': 0, 'f': 0, 'msp': 21}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5661
          1       0.00      0.00      0.00        24

avg /

  'precision', 'predicted', average, warn_for)


punc {'border': 0.01, 'msl': 1, 'rate': 1, 'p': 0.5, 'f': 0.30769230769230765, 'msp': 9}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5667
          1       0.50      0.22      0.31        18

avg / total       1.00      1.00      1.00      5685

[[5663    4]
 [  14    4]]



# Подбор параметров rate и border

In [287]:
prev_res_train = {}
prev_res_test = {}
               
for tag in order:
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    max_by_rate[tag]['p'] = 0
    max_by_rate[tag]['f'] = 0
    rate = df[tag].value_counts()[0] / df[tag].value_counts()[1]
    msl = max_by_rate[tag]['msl']
    msp = max_by_rate[tag]['msp']    
    for rate_multi in range(1, 70):
        for border in rem_feat[tag]: 
            train, test, train_answers, test_answers, not_resampled_train = resampling_and_adding_rf(tag, data_train, data_test, 
                                                                                            answers_train_all, answers_test_all,
                                                                                                prev_res_train, prev_res_test, 
                                                                                rem_feat[tag][border])            
            sample_weight = np.array([rate*rate_multi if i == 0 else 1 for i in train_answers])   
            clf = DecisionTreeClassifier(min_samples_split=msp, min_samples_leaf=msl)
            clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
            answ_pred = clf.predict(test)
            cm = confusion_matrix(test_answers, answ_pred)
            m_precision = cm[1][1]/(cm[1][1] + cm[0][1])
            m_recall = cm[1][1]/(cm[1][1] + cm[1][0])
            m_f_measure = 2*m_precision*m_recall/(m_precision + m_recall)
            if max_by_rate[tag]['p'] < 0.5:
                if m_precision > max_by_rate[tag]['p']:
                    max_by_rate[tag]['p'] = m_precision
                    max_by_rate[tag]['f'] = m_f_measure
                    max_by_rate[tag]['border'] = border
                    max_by_rate[tag]['rate'] = rate_multi
            elif m_f_measure > max_by_rate[tag]['f']:
                    max_by_rate[tag]['p'] = m_precision
                    max_by_rate[tag]['f'] = m_f_measure
                    max_by_rate[tag]['border'] = border
                    max_by_rate[tag]['rate'] = rate_multi
    
    print(tag, max_by_rate[tag])
    train, test, train_answers, test_answers, not_resampled_train = resampling_and_adding_rf(tag, data_train, data_test, 
                                                                                            answers_train_all, answers_test_all,
                                                                                                prev_res_train, prev_res_test, 
                                                                                rem_feat[tag][max_by_rate[tag]['border']])   
    clf = DecisionTreeClassifier(min_samples_split=max_by_rate[tag]['msp'], min_samples_leaf=max_by_rate[tag]['msl'])
    clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
    answ_pred = clf.predict(test)    
    prev_res_train[tag] = clf.predict(not_resampled_train)
    prev_res_test[tag] = answ_pred
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()
    


________TRANSFER____________ occurs originally 874 times




transfer {'border': 0.05, 'msl': 33, 'rate': 69, 'p': 0.23076923076923078, 'f': 0.022140221402214024, 'msp': 21}
             precision    recall  f1-score   support

          0       0.95      1.00      0.98      5427
          1       0.12      0.00      0.01       258

avg / total       0.92      0.95      0.93      5685

[[5420    7]
 [ 257    1]]


________CONJ____________ occurs originally 713 times
conj {'border': 0.05, 'msl': 70, 'rate': 1, 'p': 0.79487179487179482, 'f': 0.5723076923076923, 'msp': 2}
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      5477
          1       0.97      0.15      0.26       208

avg / total       0.97      0.97      0.96      5685

[[5476    1]
 [ 177   31]]


________SYNTAX____________ occurs originally 638 times
syntax {'border': 0.001, 'msl': 28, 'rate': 1, 'p': 0.52941176470588236, 'f': 0.077253218884120164, 'msp': 2}
             precision    recall  f1-score   support

          0       0.96

  'precision', 'predicted', average, warn_for)


prep {'border': 0.005, 'msl': 4, 'rate': 15, 'p': 0.33333333333333331, 'f': 0.024096385542168676, 'msp': 2}
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      5605
          1       0.00      0.00      0.00        80

avg / total       0.97      0.99      0.98      5685

[[5600    5]
 [  80    0]]


________DEL____________ occurs originally 1842 times
del {'border': 0.005, 'msl': 16, 'rate': 1, 'p': 0.51975683890577506, 'f': 0.38169642857142855, 'msp': 21}
             precision    recall  f1-score   support

          0       0.91      0.99      0.95      5118
          1       0.54      0.13      0.21       567

avg / total       0.87      0.90      0.87      5685

[[5056   62]
 [ 493   74]]


________CONSTR____________ occurs originally 2155 times
constr {'border': -1, 'msl': 18, 'rate': 1, 'p': 0.51068376068376065, 'f': 0.41819772528433946, 'msp': 21}
             precision    recall  f1-score   support

          0       0.90     

# Работа классификаторов с заданными параметрами

In [102]:
f_imp = {}
for tag in order:
    train, test, train_answers, test_answers = balanced_selection_resampled(tag)
    rate = df[tag].value_counts()[0] / df[tag].value_counts()[1]               
    clf = DecisionTreeClassifier(min_samples_split=max_by_rate[tag]['msp'], min_samples_leaf=max_by_rate[tag]['msl'])
    sample_weight = np.array([rate*max_by_rate[tag]['rate'] if i == 0 else 1 for i in train_answers])
    clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
    answ_pred = clf.predict(test)
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()
    f_imp[tag] = clf.feature_importances_


________TRANSFER____________ occurs originally 874 times
             precision    recall  f1-score   support

          0       0.96      1.00      0.98      5433
          1       0.00      0.00      0.00       252

avg / total       0.91      0.95      0.93      5685

[[5413   20]
 [ 252    0]]


________CONJ____________ occurs originally 713 times
             precision    recall  f1-score   support

          0       0.98      0.99      0.99      5477
          1       0.80      0.58      0.67       208

avg / total       0.98      0.98      0.98      5685

[[5447   30]
 [  88  120]]


________SYNTAX____________ occurs originally 638 times
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      5486
          1       0.20      0.01      0.01       199

avg / total       0.94      0.96      0.95      5685

[[5482    4]
 [ 198    1]]


________NEG____________ occurs originally 39 times
             precision    recall  f1-score   suppor

  'precision', 'predicted', average, warn_for)


# Рассмотрение вклада различныхз признаков и выбор признаков

In [112]:
fi = codecs.open('feat_imp.csv', 'w', 'cp1251')
fi.write('----;' + ';'.join(labels) + ';' + ';'.join(order) + '\r\n')
for tag in order:
    fi.write(tag + ';' + ';'.join([str(i).replace('.', ',') for i in f_imp[tag]]) + '\r\n')
fi.close()

In [297]:
fi = codecs.open('feat_imp_new.csv', 'w', 'cp1251')
fi.write('----;' + ';'.join(labels) + ';' + ';'.join(order) + '\r\n')
poss_feat = labels + order
for tag in order:
    features_imp = [str(f_imp[tag][i]).replace('.', ',') if poss_feat[i] in rem_feat[tag][max_by_rate[tag]['border']] else '' 
                    for i in range(len(f_imp[tag]))]
    fi.write(tag + ';' + 
             ';'.join(features_imp) + '\r\n')
fi.close()

KeyError: 'gov_w_prep'

KeyError: 0.0005

In [308]:
fi_key = {}
keytags = 'space, conj, hyphen, refl, ortho, insert, lex, not-clear, asp, punc, ref, constr'.split(', ')
for tag in keytags:
    print(tag)
    fi_key[tag] = {}
    i = 0
    for feature in rem_feat[tag][max_by_rate[tag]['border']]:
        fi_key[tag][feature] = f_imp_new[tag][i]
        i += 1

space
conj
hyphen
refl
ortho
insert
lex
not-clear
asp
punc
ref
constr


In [309]:
for tag in fi_key:
    print(tag, max_by_rate[tag])
    for feature in sorted(fi_key[tag], key=fi_key[tag].get, reverse=True):
        print(feature, fi_key[tag][feature])

hyphen {'border': 0.05, 'msl': 82, 'rate': 1, 'p': 0, 'f': 0, 'msp': 2}
hashyphen 0.36175958056
levenstein 0.199870442426
lencorr 0.168854944243
lenorig 0.0757785400114
PR_j1 0.043664730024
ADV_j1 0.0360644810851
PRO_j1 0.0235354401995
ADV_j2 0.0215899938443
A_j1 0.0162179405679
bastard 0.0149227196161
PART_j2 0.0097332166055
INTJ_j1 0.0085133625626
CONJ_j2 0.00710015861219
grammequal 0.00353532037621
govorja_c 0.0023552649165
PRO_j2 0.00210244572689
moemu_e 0.00156719020375
tom_c 0.0014030911036
iz-za_c 0.00140251400805
po-moemu_c 2.86233070939e-05
ostalas'_c 0.0
luchshaja_e 0.0
epstin_e 0.0
vazhny_c 0.0
ipod_c 0.0
V_j2 0.0
den_e 0.0
okazalas'_e 0.0
problemom_e 0.0
po-drugomu_c 0.0
pogovorit'_c 0.0
nikakogo_c 0.0
uchastvuet_c 0.0
gazeta_e 0.0
ochen'_e 0.0
deverja_e 0.0
peterburg_c 0.0
ministra_e 0.0
strogaja_e 0.0
organizm_c 0.0
svjazany_e 0.0
poleznyh_c 0.0
razmenivat'sja_e 0.0
bliz_c 0.0
stemequal 0.0
po-raznomu_c 0.0
prisposablivat'_e 0.0
moskvy_e 0.0
stroilsja_c 0.0
nose_c 0.0
uzb

In [271]:
rem_feat = {}
for tag in order:
    rem_feat[tag] = {}
    for border in [-1, 0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]:
        j = 0
        rem_feat[tag][border] = []
        for i in f_imp_new[tag]:
            if i >= 0.0005 and j < len(labels):
                rem_feat[tag][border].append(labels[j])
            elif i >= 0.0005:
                rem_feat[tag][border].append(order[j - len(labels)])
            j += 1
rem_feat

{'agrcase': {-1: ['levenstein',
   'lemmaequal',
   'grammequal',
   'stemequal',
   'lenorig',
   'lencorr',
   'bastard',
   'PRO_j1',
   'V_j1',
   'A_j1',
   'INTJ_j2',
   'PRO_j2',
   'PART_j2',
   'V_j2',
   'A_j2',
   'CONJ_j2',
   'hascyrillic',
   'frantsii_e',
   "ostat'sja_e",
   'nezadavlenye_e',
   'haposho_e',
   'razmyshlenija_e',
   'odety_e',
   'svjazana_e',
   'mozhet_e',
   'kastrom_e',
   "ostalos'_e",
   'zakryv_e',
   'nenavisti_e',
   "chuvstvitel'nye_e",
   '——_e',
   'pervyh_e',
   'no_e',
   'portlenda_e',
   'analisti_e',
   'udivilsja_e',
   'pozvoljat_e',
   'slozhnyh_e',
   'nichego_c',
   'vposledstvii_c',
   'govorja_c',
   'a_c',
   "ob'jasnjat'_c",
   "volnovat'sja_c",
   'kollektsioniruju_c',
   'ponjatno_c',
   'papoj_c',
   'bliz_c',
   'mozhet_c',
   'ejfelevu_c',
   'grammatika_c',
   "men'she_c",
   'obratnom_c',
   'flagman_c',
   'zakonov_c',
   'prognoz_c',
   'uprjamy_c',
   'suschestvujut_c'],
  0: ['levenstein',
   'lemmaequal',
   'gramme

# С заданными параметрами и только важными фичами, подбор границы

In [290]:
ros = RandomOverSampler()
labs = labels[:]
lex_df = df[labs + tagslist]
lex_df = lex_df.fillna('')
lex_df = pandas.get_dummies(lex_df)
data_train, data_test, answers_train_all, answers_test_all = train_test_split(lex_df.drop(tagslist, axis=1), 
                                                                              lex_df[tagslist], test_size=0.3)

def resampling_and_adding_rf(tagname, data_train, data_test, answers_train_all, answers_test_all, prev_res_train, prev_res_test, 
                          features):
    answ_train = answers_train_all[tagname]
    answ_test = answers_test_all[tagname]
    for tag in prev_res_test:
        data_train[tag] = prev_res_train[tag]
        data_test[tag] = prev_res_test[tag]
    data_train = data_train[features]
    data_test = data_test[features]
    data_resampled, answ_resampled = ros.fit_sample(data_train, answ_train)
    return data_resampled, data_test, answ_resampled, answ_test, data_train

In [141]:
prev_res_train = {}
prev_res_test = {}
f_imp_new = {}
for tag in order:
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    max_by_rate[tag]['p'] = 0
    max_by_rate[tag]['f'] = 0
    for border in rem_feat[tag]:
        train, test, train_answers, test_answers, not_resampled_train = resampling_and_adding_rf(tag, data_train, data_test, 
                                                                                                 answers_train_all, answers_test_all,
                                                                                                 prev_res_train, prev_res_test, 
                                                                                                 rem_feat[tag][border])
        rate = df[tag].value_counts()[0] / df[tag].value_counts()[1]               
        clf = DecisionTreeClassifier(min_samples_split=max_by_rate[tag]['msp'], min_samples_leaf=max_by_rate[tag]['msl'])
        sample_weight = np.array([rate*max_by_rate[tag]['rate'] if i == 0 else 1 for i in train_answers])
        clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
        answ_pred = clf.predict(test)
        cm = confusion_matrix(test_answers, answ_pred)
        m_precision = cm[1][1]/(cm[1][1] + cm[0][1])
        m_recall = cm[1][1]/(cm[1][1] + cm[1][0])
        m_f_measure = 2*m_precision*m_recall/(m_precision + m_recall)
        if max_by_rate[tag]['p'] < 0.5:
            if m_precision >= max_by_rate[tag]['p']:
                        max_by_rate[tag]['p'] = m_precision
                        max_by_rate[tag]['f'] = m_f_measure
                        max_by_rate[tag]['border'] = border
        elif m_f_measure >= max_by_rate[tag]['f']:
                        max_by_rate[tag]['p'] = m_precision
                        max_by_rate[tag]['f'] = m_f_measure
                        max_by_rate[tag]['border'] = border           
    
    print(tag, max_by_rate[tag])
    train, test, train_answers, test_answers, not_resampled_train = resampling_and_adding_rf(tag, data_train, data_test, 
                                                                                                 answers_train_all, answers_test_all,
                                                                                                 prev_res_train, prev_res_test, 
                                                                                                 rem_feat[tag][max_by_rate[tag]['border']])
    rate = df[tag].value_counts()[0] / df[tag].value_counts()[1]               
    clf = DecisionTreeClassifier(min_samples_split=max_by_rate[tag]['msp'], min_samples_leaf=max_by_rate[tag]['msl'])
    sample_weight = np.array([rate*max_by_rate[tag]['rate'] if i == 0 else 1 for i in train_answers])
    clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
    answ_pred = clf.predict(test)
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()
    prev_res_train[tag] = clf.predict(not_resampled_train)
    prev_res_test[tag] = answ_pred
    f_imp_new[tag] = clf.feature_importances_


________TRANSFER____________ occurs originally 874 times
transfer {'border': 0.05, 'msl': 28, 'msp': 2, 'p': 0.2857142857142857, 'f': 0.042553191489361708, 'rate': 6}
             precision    recall  f1-score   support

          0       0.95      1.00      0.98      5424
          1       0.31      0.02      0.03       261

avg / total       0.92      0.95      0.93      5685

[[5415    9]
 [ 257    4]]


________CONJ____________ occurs originally 713 times
conj {'border': 0.01, 'msl': 47, 'msp': 2, 'p': 0.85906040268456374, 'f': 0.68085106382978722, 'rate': 1}
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      5458
          1       0.84      0.57      0.68       227

avg / total       0.98      0.98      0.98      5685

[[5433   25]
 [  98  129]]


________SYNTAX____________ occurs originally 638 times
syntax {'border': 0.0005, 'msl': 67, 'msp': 21, 'p': 0.5, 'f': 0.010362694300518137, 'rate': 19}
             precision    recall 



phon {'border': 0.0005, 'msl': 1, 'msp': 2, 'p': 0.0, 'f': nan, 'rate': 1}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5668
          1       0.00      0.00      0.00        17

avg / total       0.99      1.00      1.00      5685

[[5665    3]
 [  17    0]]


________AUX____________ occurs originally 86 times
aux {'border': 0.001, 'msl': 39, 'msp': 2, 'p': 0.44444444444444442, 'f': 0.24242424242424243, 'rate': 5}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5661
          1       0.40      0.17      0.24        24

avg / total       0.99      1.00      0.99      5685

[[5655    6]
 [  20    4]]


________PREP____________ occurs originally 301 times
prep {'border': 0.001, 'msl': 77, 'msp': 2, 'p': 0.33333333333333331, 'f': 0.0449438202247191, 'rate': 3}
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      5602
          1   

# Обработка и запись выбранных параметров

In [160]:
with open('best_feat.json') as json_file:  
    rem_feat = json.load(json_file)

for tag in rem_feat:
    for border in rem_feat[tag]:
        rem_feat[tag][border] += very_new_labels
for tag in new_tags:
    rem_feat[tag] = {}
    for border in [-1, 0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]:
        rem_feat[tag][border] = labels

In [180]:
for tag in new_tags:
    max_by_rate[tag] = {}
    max_by_rate[tag]['rate'] = 1
    max_by_rate[tag]['msl'] = 1
    max_by_rate[tag]['msp'] = 2
    max_by_rate[tag]['border'] = 0

In [278]:
for tag in max_by_rate:
    if max_by_rate[tag]['border'] == -1:
        max_by_rate[tag]['border'] = 0
    elif max_by_rate[tag]['border'] == '-1':
        max_by_rate[tag]['border'] = '0'

In [279]:
import json
with open('best_param.json', 'w') as outfile:  
    json.dump(max_by_rate, outfile, indent=4)
with open('best_feat.json', 'w') as outfile:  
    json.dump(rem_feat, outfile, indent=4)

In [249]:
with open('best_param.json') as json_file:  
    max_by_rate = json.load(json_file)

In [267]:
with open('best_feat.json') as json_file:  
    rem_feat = json.load(json_file)

In [268]:
for tag in rem_feat:
    for border in rem_feat[tag]:
        rem_feat[tag][border] += key_words

In [243]:
rem_feat['syntax_wo_others'] = rem_feat['syntax']
max_by_rate['syntax_wo_others'] = max_by_rate['syntax']

In [292]:
pre_rec_table = codecs.open('pre_rec2.csv', 'w', 'cp1251')
pre_rec_table.write('tag;precision;recall\r\n')
for tag in pre_rec:
    pre_rec_table.write(tag + ';' + str(pre_rec[tag][0]).replace('.', ',') + ';' + str(pre_rec[tag][1]).replace('.', ',') + '\r\n')
pre_rec_table.close()

# Со всеми заданными параметрами (rate, msl, msp, border)

In [291]:
prev_res_train = {}
prev_res_test = {}
f_imp_new = {}
pre_rec = {}
for tag in order:
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    max_by_rate[tag]['p'] = 0
    max_by_rate[tag]['f'] = 0
    print(tag, max_by_rate[tag])
    train, test, train_answers, test_answers, not_resampled_train = resampling_and_adding_rf(tag, data_train, data_test, 
                                                                                                 answers_train_all, answers_test_all,
                                                                                                 prev_res_train, prev_res_test, 
                                                                                                 rem_feat[tag][max_by_rate[tag]['border']])
    rate = df[tag].value_counts()[0] / df[tag].value_counts()[1]               
    clf = DecisionTreeClassifier(min_samples_split=max_by_rate[tag]['msp'], min_samples_leaf=max_by_rate[tag]['msl'])
    sample_weight = np.array([rate*max_by_rate[tag]['rate'] if i == 0 else 1 for i in train_answers])
    clf.fit(np.array(train), np.array(train_answers), sample_weight=sample_weight)
    answ_pred = clf.predict(test)
    cm = confusion_matrix(test_answers, answ_pred)
    m_precision = cm[1][1]/(cm[1][1] + cm[0][1])
    m_recall = cm[1][1]/(cm[1][1] + cm[1][0])
    pre_rec[tag] = [m_precision, m_recall]
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()
    prev_res_train[tag] = clf.predict(not_resampled_train)
    prev_res_test[tag] = answ_pred
    f_imp_new[tag] = clf.feature_importances_


________TRANSFER____________ occurs originally 874 times
transfer {'border': 0.05, 'msl': 33, 'rate': 69, 'p': 0, 'f': 0, 'msp': 21}
             precision    recall  f1-score   support

          0       0.96      1.00      0.98      5433
          1       0.50      0.00      0.01       252

avg / total       0.94      0.96      0.93      5685

[[5432    1]
 [ 251    1]]


________CONJ____________ occurs originally 713 times
conj {'border': 0.05, 'msl': 70, 'rate': 1, 'p': 0, 'f': 0, 'msp': 2}
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      5463
          1       0.80      0.45      0.58       222

avg / total       0.97      0.97      0.97      5685

[[5438   25]
 [ 121  101]]


________SYNTAX____________ occurs originally 638 times
syntax {'border': 0.001, 'msl': 28, 'rate': 1, 'p': 0, 'f': 0, 'msp': 2}
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      5503
          1       0.

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.99      1.00      1.00      5655
          1       0.00      0.00      0.00        30

avg / total       0.99      0.99      0.99      5685

[[5655    0]
 [  30    0]]


________AUX____________ occurs originally 86 times
aux {'border': 0.001, 'msl': 39, 'rate': 5, 'p': 0, 'f': 0, 'msp': 2}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5657
          1       0.00      0.00      0.00        28

avg / total       0.99      1.00      0.99      5685

[[5657    0]
 [  28    0]]


________PREP____________ occurs originally 301 times
prep {'border': 0.005, 'msl': 4, 'rate': 15, 'p': 0, 'f': 0, 'msp': 2}
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      5602
          1       0.17      0.02      0.04        83

avg / total       0.97      0.98      0.98      5685

[[5592   10]
 [  81    2]]


________DEL____________

# all tags together

In [54]:
def combine_tags(line):
    ct = ''
    for tag in tagslist:
        if line[tag] == 1:
            ct += tag + '+'
    return ct[:-1]

In [68]:
ros = RandomOverSampler()
def train_resampling(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = main_train[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    data_resampled, answ_resampled = ros.fit_sample(lex_df.drop(tagname, axis=1), lex_df[tagname]) 
    return data_resampled, answ_resampled

In [70]:
df['tags'] = [combine_tags(i) for index, i in df.iterrows()]
main_train, main_test, answ_train_main, answ_test_main = train_test_split(df.drop(['tags'], axis=1), df['tags'], test_size=0.3)
main_test = main_test[labels]

In [69]:
clasdict = {}
for tag in tagslist:
    train, train_answers = train_resampling(tag)
    clf = DecisionTreeClassifier(min_samples_split=5)
    gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
    gs_clf.fit(np.array(train), np.array(train_answers))
    clasdict[tag] = gs_clf

In [72]:
predicted_values = {}
for tag in clasdict:
    answ_pred = clasdict[tag].predict(main_test)
    predicted_values[tag] = answ_pred

In [73]:
predict_combined = []
for i in range(len(main_test)):
    tagstr = ''
    for tag in tagslist:
        if predicted_values[tag][i] == 1:
            tagstr += tag + '+'
    predict_combined.append(tagstr[:-1])

In [75]:
print(classification_report(answ_test_main, predict_combined))
print(confusion_matrix(answ_test_main, predict_combined))

                                                                                                              precision    recall  f1-score   support

                                                                                                                   0.00      0.00      0.00         0
                                                                                                     agrcase       0.00      0.00      0.00       217
                                                                                           agrcase+agrgender       0.00      0.00      0.00         5
                                                                                    agrcase+agrgender+constr       0.00      0.00      0.00         1
                                                                  agrcase+agrgender+transfer+gov+conj+syntax       0.00      0.00      0.00         0
                                                                      agrcase+agrpers+conj+syntax+c

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


ерунда какая-то получилась...