In [1]:
import numpy as np
import pandas
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.grid_search import GridSearchCV
from scipy import misc

from imblearn.over_sampling import RandomOverSampler



In [2]:
df = pandas.read_csv('.\\rlc_featured_mod2.csv', sep='\t')
df.head()

Unnamed: 0,sentid,sent,start,finish,error,correction,tag,levenstein,lemmaequal,grammequal,...,par,idiom,coord,refl,not-clear,transp,subst,del,punc,typo
0,61119,Мы сидели пол часа.,3,4,пол часа.,полчаса,space,4,0,0,...,0,0,0,0,0,0,0,0,0,0
1,32,Студенты так же борятся за спасение озера Байк...,2,3,так же,также,orpho,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,107,"(? ). (? ).Мы видим на рынке процесс, по средс...",0,4,,"посредством,",orpho,13,0,0,...,0,0,0,0,0,0,0,0,0,0
3,120,У государства экономические и административные...,3,3,экономические экономические,есть есть экономические,lex,11,0,0,...,0,0,0,0,0,0,0,0,0,0
4,120,У государства экономические и административные...,3,3,экономические экономические,есть есть экономические,lex,11,0,0,...,0,0,0,0,0,0,0,0,0,0


уберем ненужные столбцы

In [3]:
labels = ['levenstein', 'lemmaequal', 'grammequal', 'stemequal', 'lenorig', 'lencorr', 'bastard']
labs = labels[:]

# LEX

In [4]:
labs.append('lex')
lex_df = df[labs]
lex_df.head()

Unnamed: 0,levenstein,lemmaequal,grammequal,stemequal,lenorig,lencorr,bastard,lex
0,4,0,0,0,4,1,0,0
1,1,0,0,0,3,2,0,0
2,13,0,0,0,0,2,0,0
3,11,0,0,0,3,4,0,1
4,11,0,0,0,3,4,0,1


In [5]:
lex_df = lex_df.fillna('')

In [6]:
lex_df = pandas.get_dummies(lex_df)
lex_df.head()

Unnamed: 0,levenstein,lemmaequal,grammequal,stemequal,lenorig,lencorr,bastard,lex
0,4,0,0,0,4,1,0,0
1,1,0,0,0,3,2,0,0
2,13,0,0,0,0,2,0,0
3,11,0,0,0,3,4,0,1
4,11,0,0,0,3,4,0,1


In [7]:
data_train, data_test, answ_train, answ_test = train_test_split(lex_df.drop('lex', axis=1), lex_df['lex'], test_size=0.3)

In [8]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(data_train), np.array(answ_train))
print(gs_clf.best_estimator_)
# for model in gs_clf.grid_scores_:
#     print(model, model[2])
gs_clf.fit(np.array(data_train), np.array(answ_train))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=8, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [9]:
answ_pred = gs_clf.predict(data_test)
print(classification_report(answ_test, answ_pred))

             precision    recall  f1-score   support

          0       0.82      0.99      0.90      4315
          1       0.84      0.27      0.41      1249

avg / total       0.83      0.82      0.79      5564



In [10]:
print(confusion_matrix(answ_test, answ_pred))

[[4252   63]
 [ 915  334]]


модель переобучена на нолики, так как их больше. Нужно или порезать данные, или сделать оверсемплинг?

# оверсемплинг

In [11]:
ros = RandomOverSampler()
data_resampled, answ_resampled = ros.fit_sample(lex_df.drop('lex', axis=1), lex_df['lex'])

In [12]:
data_train, data_test, answ_train, answ_test = train_test_split(data_resampled, answ_resampled, test_size=0.3)

In [13]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(data_train), np.array(answ_train))
print(gs_clf.best_estimator_)
gs_clf.fit(np.array(data_train), np.array(answ_train))
answ_pred = gs_clf.predict(data_test)
print(classification_report(answ_test, answ_pred))
print(confusion_matrix(answ_test, answ_pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=-1, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
             precision    recall  f1-score   support

          0       0.79      0.76      0.78      4288
          1       0.78      0.80      0.79      4382

avg / total       0.78      0.78      0.78      8670

[[3269 1019]
 [ 869 3513]]


# чуть лучше. А если просто безжалостно порезать данные?

In [14]:
lex_df.index.values

array([    0,     1,     2, ..., 18542, 18543, 18544], dtype=int64)

In [15]:
# lex_df['index'] = range(1, len(lex_df) + 1)
ham = lex_df[lex_df.lex == 1].drop('lex', axis=1)
spam = lex_df[lex_df.lex == 0].drop('lex', axis=1)
n = int(len(ham) * 0.7)
print('hey there we train on ', n, 'of ', len(ham), 'while spam length is', len(spam))
hams = ham.sample(n)
spams = spam.sample(n)

hey there we train on  2867 of  4096 while spam length is 14449


In [16]:
ham_test = [i for index, i in ham.iterrows() if index not in hams.index.values]
spam_test = [i for index, i in spam.iterrows() if index not in spams.index.values][:len(ham_test)]
print(len(hams), len(spams), len(ham_test), len(spam_test))

2867 2867 1229 1229


In [17]:
train = pandas.concat([hams, spams])
test = ham_test + spam_test
train_answers = [1 for i in range(len(hams))] + [0 for i in range(len(spams))]
test_answers = [1 for i in range(len(ham_test))] + [0 for i in range(len(spam_test))]
print(len(train), len(train_answers), len(test), len(test_answers))

5734 5734 2458 2458


In [18]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [19]:
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.75      0.75      0.75      1229
          1       0.75      0.75      0.75      1229

avg / total       0.75      0.75      0.75      2458

[[925 304]
 [308 921]]


если порезать, результаты чуть хуже, чем при оверсемплинге.

# General

In [25]:
def one_tag_db_cut(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = df[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    ham = lex_df[lex_df[tagname] == 1].drop(tagname, axis=1)
    spam = lex_df[lex_df[tagname] == 0].drop(tagname, axis=1)
    n = int(len(ham) * 0.7)
    hams = ham.sample(n)
    spams = spam.sample(n)
    ham_test = [i for index, i in ham.iterrows() if index not in hams.index.values]
    spam_test = [i for index, i in spam.iterrows() if index not in spams.index.values][:len(ham_test)]
    train = pandas.concat([hams, spams])
    test = ham_test + spam_test
    train_answers = [1 for i in range(len(hams))] + [0 for i in range(len(spams))]
    test_answers = [1 for i in range(len(ham_test))] + [0 for i in range(len(spam_test))]
    return train, test, train_answers, test_answers

In [28]:
ros = RandomOverSampler()
def one_tag_db_resampled(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = df[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    data_resampled, answ_resampled = ros.fit_sample(lex_df.drop(tagname, axis=1), lex_df[tagname]) 
    data_train, data_test, answ_train, answ_test = train_test_split(data_resampled, answ_resampled, test_size=0.3)
    return data_train, data_test, answ_train, answ_test

In [29]:
train, test, train_answers, test_answers = one_tag_db_resampled('lex')

In [30]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.80      0.77      0.78      4328
          1       0.78      0.81      0.79      4342

avg / total       0.79      0.79      0.79      8670

[[3331  997]
 [ 840 3502]]


# ortho

In [31]:
train, test, train_answers, test_answers = one_tag_db_resampled('ortho')

In [32]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.83      0.79      0.80      4286
          1       0.80      0.83      0.81      4292

avg / total       0.81      0.81      0.81      8578

[[3366  920]
 [ 711 3581]]


# typo

In [33]:
train, test, train_answers, test_answers = one_tag_db_resampled('typo')

In [34]:
clf = DecisionTreeClassifier(min_samples_split=5)
gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
gs_clf.fit(np.array(train), np.array(train_answers))
answ_pred = gs_clf.predict(test)
print(classification_report(test_answers, answ_pred))
print(confusion_matrix(test_answers, answ_pred))

             precision    recall  f1-score   support

          0       0.89      0.67      0.77      5520
          1       0.74      0.92      0.82      5563

avg / total       0.81      0.80      0.79     11083

[[3702 1818]
 [ 454 5109]]


# aLL TAGS

In [50]:
tagslist = 'prep	phon	graph	hyphen	space	ortho	translit	misspell	deriv	infl	num	morph	asp	passive	agrnum	agrcase	agrgender	agrpers	agrgerund	transfer	gov	ref	conj	wo	neg	aux	brev	syntax	constr	lex	cs	par	idiom	coord	refl	not-clear	transp	subst	del	punc	typo'.split('\t')

In [46]:
for tag in tagslist:
    print()
    print('________' + tag.upper() + '____________ occurs originally ' + str(df[tag].value_counts()[1]) + ' times')
    train, test, train_answers, test_answers = one_tag_db_resampled(tag)
    clf = DecisionTreeClassifier(min_samples_split=5)
    gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
    gs_clf.fit(np.array(train), np.array(train_answers))
    answ_pred = gs_clf.predict(test)
    print(classification_report(test_answers, answ_pred))
    print(confusion_matrix(test_answers, answ_pred))
    print()


________PREP____________ occurs originally 301 times
             precision    recall  f1-score   support

          0       0.84      0.78      0.81      5435
          1       0.80      0.85      0.82      5512

avg / total       0.82      0.82      0.82     10947

[[4257 1178]
 [ 837 4675]]


________PHON____________ occurs originally 66 times
             precision    recall  f1-score   support

          0       0.95      0.77      0.85      5578
          1       0.81      0.95      0.87      5510

avg / total       0.88      0.86      0.86     11088

[[4313 1265]
 [ 251 5259]]


________GRAPH____________ occurs originally 174 times
             precision    recall  f1-score   support

          0       0.93      0.70      0.80      5509
          1       0.76      0.95      0.84      5514

avg / total       0.85      0.83      0.82     11023

[[3881 1628]
 [ 293 5221]]


________HYPHEN____________ occurs originally 217 times
             precision    recall  f1-score   support


# all tags together

In [54]:
def combine_tags(line):
    ct = ''
    for tag in tagslist:
        if line[tag] == 1:
            ct += tag + '+'
    return ct[:-1]

In [68]:
ros = RandomOverSampler()
def train_resampling(tagname):
    labs = labels[:]
    labs.append(tagname)
    lex_df = main_train[labs]
    lex_df = lex_df.fillna('')
    lex_df = pandas.get_dummies(lex_df)
    data_resampled, answ_resampled = ros.fit_sample(lex_df.drop(tagname, axis=1), lex_df[tagname]) 
    return data_resampled, answ_resampled

In [70]:
df['tags'] = [combine_tags(i) for index, i in df.iterrows()]
main_train, main_test, answ_train_main, answ_test_main = train_test_split(df.drop(['tags'], axis=1), df['tags'], test_size=0.3)
main_test = main_test[labels]

In [69]:
clasdict = {}
for tag in tagslist:
    train, train_answers = train_resampling(tag)
    clf = DecisionTreeClassifier(min_samples_split=5)
    gs_clf = GridSearchCV(clf, {'max_leaf_nodes': (-5, -4, -3, -2, -1, None, 2, 3, 4, 5, 6, 7, 8)})
    gs_clf.fit(np.array(train), np.array(train_answers))
    clasdict[tag] = gs_clf

In [72]:
predicted_values = {}
for tag in clasdict:
    answ_pred = clasdict[tag].predict(main_test)
    predicted_values[tag] = answ_pred

In [73]:
predict_combined = []
for i in range(len(main_test)):
    tagstr = ''
    for tag in tagslist:
        if predicted_values[tag][i] == 1:
            tagstr += tag + '+'
    predict_combined.append(tagstr[:-1])

In [75]:
print(classification_report(answ_test_main, predict_combined))
print(confusion_matrix(answ_test_main, predict_combined))

                                                                                                              precision    recall  f1-score   support

                                                                                                                   0.00      0.00      0.00         0
                                                                                                     agrcase       0.00      0.00      0.00       217
                                                                                           agrcase+agrgender       0.00      0.00      0.00         5
                                                                                    agrcase+agrgender+constr       0.00      0.00      0.00         1
                                                                  agrcase+agrgender+transfer+gov+conj+syntax       0.00      0.00      0.00         0
                                                                      agrcase+agrpers+conj+syntax+c

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


ерунда какая-то получилась...