In [1]:
import numpy as np
import pandas
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.grid_search import GridSearchCV
from scipy import misc

from imblearn.over_sampling import RandomOverSampler
import codecs



In [2]:
df = pandas.read_csv('.\\rlc_featured_mod6.csv', sep='\t')
df.head()

Unnamed: 0,sentid,sent,start,finish,error,correction,tag,levenstein,lemmaequal,grammequal,...,hasgraph,hasneg,hasspace,gramm1,gramm2,hasaux,samebrev,samegender,samenum,samepers
0,56431,Когда мы являемся на собеседование для хорошо ...,20,20,кандидатоми,кандидатов.,infl,2,0,0,...,0,0,0,"V,сов,пе=ед,пов,2-л","S,муж,од=(вин,мн|род,мн)",0,0,0,0,0
1,295,"У животных нет культура, не смотря на то, что ...",18,18,мира,,lex,4,0,0,...,0,0,0,"S,муж,неод=род,ед",,0,0,0,0,0
2,52827,Но после несколько дней они знали что у них не...,3,3,несколько,нескольких,gov,2,1,0,...,0,0,0,ADV=,"NUM=(|пр|род|вин,од)",0,0,0,0,0
3,541,"Сложно контрольировать стиль, и нам еще надо р...",2,2,контрольировать,контролировать,ortho,1,0,1,...,0,0,0,"V,несов,пе=инф","V,несов,пе=инф",0,0,0,0,0
4,53358,С особым восторгом я так же отношусь к русской...,13,13,классике,к класссике,prep del,3,0,0,...,0,0,0,"S,жен,неод=(пр,ед|дат,ед)","PR= S,жен,неод=(пр,ед|дат,ед)",0,0,1,1,0


# посчитаем условные вероятности для тегов по формуле Байеса

In [3]:
tagslist = 'prep	phon	graph	hyphen	space	ortho	translit	misspell	deriv	infl	num	morph	asp	passive	agrnum	agrcase	agrgender	agrpers	agrgerund	transfer	gov	ref	conj	wo	neg	aux	brev	syntax	constr	lex	cs	par	idiom	coord	refl	not-clear	transp	subst	del	punc	typo'.split('\t')

In [5]:
tagmatrix = df[tagslist]

In [56]:
condmatr = np.zeros(shape=(len(tagslist),len(tagslist)))
for i in range(len(tagslist)):
    for j in range(len(tagslist)):
        try:
            condmatr[i][j] = round(tagmatrix.groupby(tagslist[i])[tagslist[j]].apply(lambda g: g.value_counts()/len(g))[1][1], 3)
        except KeyError:
            condmatr[i][j] = 0

In [60]:
condmatr[0][3]

0.0030000000000000001

In [59]:
a = codecs.open('cond_prib.csv', 'w', 'cp1251')
a.write('-----;' + ';'.join(tagslist) + '\r\n')
for i in range(len(tagslist)):
    a.write(tagslist[i] + ';' + ';'.join([str(j) for j in condmatr[i]]))
    a.write('\r\n')
a.close()

уберем ненужные столбцы

In [4]:
labels = ['levenstein', 'lemmaequal', 'grammequal', 'stemequal', 'lenorig', 'lencorr', 'bastard']
poslist = 'A ADVPRO PART CONJ ANUM ADV SPRO S COM V APRO INTJ PR NUM'.split(' ')
pos1 = [i + '1' for i in poslist]
pos2 = [i + '2' for i in poslist]
new_labels = 'hashyphen	hasgraph	hasneg	hasspace	hasaux	samebrev	samegender	samenum	samepers'.split('\t')
labels = labels + pos1 + pos2 + new_labels
labs = labels[:]

# LEX

In [63]:
labs.append('lex')
lex_df = df[labs]
lex_df.head()

Unnamed: 0,levenstein,lemmaequal,grammequal,stemequal,lenorig,lencorr,bastard,lex
0,1,0,0,1,1,1,1,0
1,4,0,0,0,1,2,0,1
2,3,1,0,1,1,1,1,1
3,1,1,1,0,1,1,1,0
4,6,0,0,0,1,1,1,1


In [64]:
lex_df = lex_df.fillna('')

In [65]:
lex_df = pandas.get_dummies(lex_df)
lex_df.head()

Unnamed: 0,levenstein,lemmaequal,grammequal,stemequal,lenorig,lencorr,bastard,lex
0,1,0,0,1,1,1,1,0
1,4,0,0,0,1,2,0,1
2,3,1,0,1,1,1,1,1
3,1,1,1,0,1,1,1,0
4,6,0,0,0,1,1,1,1


In [7]:
data_train, data_test, answ_train, answ_test = train_test_split(lex_df.drop('lex', axis=1), lex_df['lex'], test_size=0.3)

In [8]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(data_train), np.array(answ_train))
print(gs_clf.best_estimator_)
# for model in gs_clf.grid_scores_:
#     print(model, model[2])
gs_clf.fit(np.array(data_train), np.array(answ_train))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=8, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [9]:
answ_pred = gs_clf.predict(data_test)
print(classification_report(answ_test, answ_pred))

             precision    recall  f1-score   support

          0       0.82      0.99      0.90      4315
          1       0.84      0.27      0.41      1249

avg / total       0.83      0.82      0.79      5564



In [10]:
print(confusion_matrix(answ_test, answ_pred))

[[4252   63]
 [ 915  334]]


модель переобучена на нолики, так как их больше. Нужно или порезать данные, или сделать оверсемплинг?

# оверсемплинг

In [66]:
ham = lex_df[lex_df.lex == 1].drop('lex', axis=1)
spam = lex_df[lex_df.lex == 0].drop('lex', axis=1)
n = int(len(ham) * 0.3)
print('hey there we test on ', n, 'of ', len(ham), 'while spam length is', len(spam))
hams_test = ham.sample(n)
spams_test = spam.sample(n)
ham_train = [i for index, i in ham.iterrows() if index not in hams_test.index.values]
spam_train = [i for index, i in spam.iterrows() if index not in spams_test.index.values]
test = pandas.concat([hams_test, spams_test])
train = ham_train + spam_train
test_answers = [1 for i in range(len(hams_test))] + [0 for i in range(len(spams_test))]
train_answers = [1 for i in range(len(ham_train))] + [0 for i in range(len(spam_train))]
print(len(train), len(train_answers), len(test), len(test_answers))

hey there we test on  1222 of  4074 while spam length is 14372
16002 16002 2444 2444


In [67]:
ros = RandomOverSampler()

data_resampled, answ_resampled = ros.fit_sample(train, train_answers)

In [68]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(data_resampled), np.array(answ_resampled))
print(gs_clf.best_estimator_)
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=-1, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
             precision    recall  f1-score   support

          0       0.74      0.76      0.75      1222
          1       0.76      0.74      0.75      1222

avg / total       0.75      0.75      0.75      2444

[[931 291]
 [321 901]]


# чуть лучше. А если просто безжалостно порезать данные?

In [14]:
lex_df.index.values

array([    0,     1,     2, ..., 18542, 18543, 18544], dtype=int64)

In [15]:
ham = lex_df[lex_df.lex == 1].drop('lex', axis=1)
spam = lex_df[lex_df.lex == 0].drop('lex', axis=1)
n = int(len(ham) * 0.7)
print('hey there we train on ', n, 'of ', len(ham), 'while spam length is', len(spam))
hams = ham.sample(n)
spams = spam.sample(n)

hey there we train on  2867 of  4096 while spam length is 14449


In [16]:
ham_test = [i for index, i in ham.iterrows() if index not in hams.index.values]
spam_test = [i for index, i in spam.iterrows() if index not in spams.index.values][:len(ham_test)]
print(len(hams), len(spams), len(ham_test), len(spam_test))

2867 2867 1229 1229


In [17]:
train = pandas.concat([hams, spams])
test = ham_test + spam_test
train_answers = [1 for i in range(len(hams))] + [0 for i in range(len(spams))]
test_answers = [1 for i in range(len(ham_test))] + [0 for i in range(len(spam_test))]
print(len(train), len(train_answers), len(test), len(test_answers))

5734 5734 2458 2458


In [18]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [19]:
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.75      0.75      0.75      1229
          1       0.75      0.75      0.75      1229

avg / total       0.75      0.75      0.75      2458

[[925 304]
 [308 921]]


если порезать, результаты чуть хуже, чем при оверсемплинге.

# General

In [25]:
def one_tag_db_cut(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = df[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    ham = lex_df[lex_df[tagname] == 1].drop(tagname, axis=1)
    spam = lex_df[lex_df[tagname] == 0].drop(tagname, axis=1)
    n = int(len(ham) * 0.7)
    hams = ham.sample(n)
    spams = spam.sample(n)
    ham_test = [i for index, i in ham.iterrows() if index not in hams.index.values]
    spam_test = [i for index, i in spam.iterrows() if index not in spams.index.values][:len(ham_test)]
    train = pandas.concat([hams, spams])
    test = ham_test + spam_test
    train_answers = [1 for i in range(len(hams))] + [0 for i in range(len(spams))]
    test_answers = [1 for i in range(len(ham_test))] + [0 for i in range(len(spam_test))]
    return train, test, train_answers, test_answers

In [5]:
ros = RandomOverSampler()
def one_tag_db_resampled(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = df[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    
    ham = lex_df[lex_df[tagname] == 1].drop(tagname, axis=1)
    spam = lex_df[lex_df[tagname] == 0].drop(tagname, axis=1)
    n = int(len(ham) * 0.3)
    hams_test = ham.sample(n)
    spams_test = spam.sample(n)
    ham_train = [i for index, i in ham.iterrows() if index not in hams_test.index.values]
    spam_train = [i for index, i in spam.iterrows() if index not in spams_test.index.values]
    test = pandas.concat([hams_test, spams_test])
    train = ham_train + spam_train
    test_answers = [1 for i in range(len(hams_test))] + [0 for i in range(len(spams_test))]
    train_answers = [1 for i in range(len(ham_train))] + [0 for i in range(len(spam_train))]
    data_resampled, answ_resampled = ros.fit_sample(train, train_answers)
    return data_resampled, test, answ_resampled, test_answers

In [70]:
train, test, train_answers, test_answers = one_tag_db_resampled('lex')

In [71]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.75      0.75      0.75      1222
          1       0.75      0.75      0.75      1222

avg / total       0.75      0.75      0.75      2444

[[915 307]
 [310 912]]


# ortho

In [74]:
train, test, train_answers, test_answers = one_tag_db_resampled('ortho')

In [75]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.82      0.77      0.79      1269
          1       0.78      0.83      0.80      1269

avg / total       0.80      0.80      0.80      2538

[[ 981  288]
 [ 221 1048]]


# typo

In [76]:
train, test, train_answers, test_answers = one_tag_db_resampled('typo')

In [77]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.69      0.82      0.75        22
          1       0.78      0.64      0.70        22

avg / total       0.74      0.73      0.73        44

[[18  4]
 [ 8 14]]


# aLL TAGS до разнесения частей речи для сравнения

In [78]:
for tag in tagslist:
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    train, test, train_answers, test_answers = one_tag_db_resampled(tag)
    clf = DecisionTreeClassifier(min_samples_split=5)
    gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
    gs_clf.fit(np.array(train), np.array(train_answers))
    answ_pred = gs_clf.predict(test)
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()


________PREP____________ occurs originally 301 times
             precision    recall  f1-score   support

          0       0.64      0.78      0.70        90
          1       0.71      0.56      0.63        90

avg / total       0.68      0.67      0.66       180

[[70 20]
 [40 50]]


________PHON____________ occurs originally 66 times
             precision    recall  f1-score   support

          0       0.84      0.84      0.84        19
          1       0.84      0.84      0.84        19

avg / total       0.84      0.84      0.84        38

[[16  3]
 [ 3 16]]


________GRAPH____________ occurs originally 174 times
             precision    recall  f1-score   support

          0       0.78      0.67      0.72        52
          1       0.71      0.81      0.76        52

avg / total       0.74      0.74      0.74       104

[[35 17]
 [10 42]]


________HYPHEN____________ occurs originally 212 times
             precision    recall  f1-score   support

          0       0.86 

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      1.00      0.67         3
          1       0.00      0.00      0.00         3

avg / total       0.25      0.50      0.33         6

[[3 0]
 [3 0]]


________TRANSFER____________ occurs originally 870 times
             precision    recall  f1-score   support

          0       0.58      0.79      0.67       261
          1       0.67      0.42      0.51       261

avg / total       0.62      0.61      0.59       522

[[207  54]
 [152 109]]


________GOV____________ occurs originally 1411 times
             precision    recall  f1-score   support

          0       0.72      0.73      0.73       423
          1       0.73      0.72      0.72       423

avg / total       0.72      0.72      0.72       846

[[309 114]
 [119 304]]


________REF____________ occurs originally 493 times
             precision    recall  f1-score   support

          0       0.67      0.76      0.71       147
          1       0

# All tags

In [6]:
for tag in tagslist:
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    train, test, train_answers, test_answers = one_tag_db_resampled(tag)
    clf = DecisionTreeClassifier(min_samples_split=5)
    gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
    gs_clf.fit(np.array(train), np.array(train_answers))
    answ_pred = gs_clf.predict(test)
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()


________PREP____________ occurs originally 301 times
             precision    recall  f1-score   support

          0       0.64      0.99      0.77        90
          1       0.97      0.43      0.60        90

avg / total       0.81      0.71      0.69       180

[[89  1]
 [51 39]]


________PHON____________ occurs originally 66 times
             precision    recall  f1-score   support

          0       0.61      1.00      0.76        19
          1       1.00      0.37      0.54        19

avg / total       0.81      0.68      0.65        38

[[19  0]
 [12  7]]


________GRAPH____________ occurs originally 174 times
             precision    recall  f1-score   support

          0       0.78      0.90      0.84        52
          1       0.89      0.75      0.81        52

avg / total       0.83      0.83      0.83       104

[[47  5]
 [13 39]]


________HYPHEN____________ occurs originally 212 times
             precision    recall  f1-score   support

          0       0.78 

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      1.00      0.67         3
          1       0.00      0.00      0.00         3

avg / total       0.25      0.50      0.33         6

[[3 0]
 [3 0]]


________TRANSFER____________ occurs originally 870 times
             precision    recall  f1-score   support

          0       0.55      0.88      0.67       261
          1       0.69      0.27      0.39       261

avg / total       0.62      0.57      0.53       522

[[229  32]
 [190  71]]


________GOV____________ occurs originally 1411 times
             precision    recall  f1-score   support

          0       0.74      0.87      0.80       423
          1       0.85      0.70      0.77       423

avg / total       0.79      0.79      0.78       846

[[369  54]
 [127 296]]


________REF____________ occurs originally 493 times
             precision    recall  f1-score   support

          0       0.68      0.99      0.80       147
          1       0

# all tags together

In [54]:
def combine_tags(line):
    ct = ''
    for tag in tagslist:
        if line[tag] == 1:
            ct += tag + '+'
    return ct[:-1]

In [68]:
ros = RandomOverSampler()
def train_resampling(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = main_train[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    data_resampled, answ_resampled = ros.fit_sample(lex_df.drop(tagname, axis=1), lex_df[tagname]) 
    return data_resampled, answ_resampled

In [70]:
df['tags'] = [combine_tags(i) for index, i in df.iterrows()]
main_train, main_test, answ_train_main, answ_test_main = train_test_split(df.drop(['tags'], axis=1), df['tags'], test_size=0.3)
main_test = main_test[labels]

In [69]:
clasdict = {}
for tag in tagslist:
    train, train_answers = train_resampling(tag)
    clf = DecisionTreeClassifier(min_samples_split=5)
    gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
    gs_clf.fit(np.array(train), np.array(train_answers))
    clasdict[tag] = gs_clf

In [72]:
predicted_values = {}
for tag in clasdict:
    answ_pred = clasdict[tag].predict(main_test)
    predicted_values[tag] = answ_pred

In [73]:
predict_combined = []
for i in range(len(main_test)):
    tagstr = ''
    for tag in tagslist:
        if predicted_values[tag][i] == 1:
            tagstr += tag + '+'
    predict_combined.append(tagstr[:-1])

In [75]:
print(classification_report(answ_test_main, predict_combined))
print(confusion_matrix(answ_test_main, predict_combined))

                                                                                                              precision    recall  f1-score   support

                                                                                                                   0.00      0.00      0.00         0
                                                                                                     agrcase       0.00      0.00      0.00       217
                                                                                           agrcase+agrgender       0.00      0.00      0.00         5
                                                                                    agrcase+agrgender+constr       0.00      0.00      0.00         1
                                                                  agrcase+agrgender+transfer+gov+conj+syntax       0.00      0.00      0.00         0
                                                                      agrcase+agrpers+conj+syntax+c

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


ерунда какая-то получилась...