In [1]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as et
import pymorphy2
import re

In [2]:
def getdata(filename):
    parsedXML = et.parse(filename)
    dfcols = ['id', 'type', 'text', 'url', 'mood']
    data = pd.DataFrame(columns=dfcols)
    for node in parsedXML.getroot():
        #print (node.tag, node.attrib)
        sentence_id = node.attrib['id']
        speech_type = eval = url = text = None
        text = node[0].text.strip()
        eval = node[1].text.strip()
        url = node[2].text.strip()
        speech_type = node[0].attrib.get('type')
        #print(speech_type)

        data = data.append(
            pd.Series([sentence_id, speech_type, text, url, eval], index = dfcols), ignore_index = True
        )
    data['id'] = pd.to_numeric(data['id'])
    data.index = data['id']
    data = data.drop(['id'], axis=1)

    #убираем амбивалентные новости
    data = data.loc[(data['mood'] == '0') | (data['mood'] == '+') | (data['mood'] == '-')]

    #сохраняем url
    data.loc[:,'url'] = data.loc[:,'url'].str.replace('http://', '')
    data.loc[:,'url'] = data.loc[:,'url'].str.replace('^www.', '')
    data.loc[:,'url'] = data.loc[:,'url'].str.split('/').str.get(0)


    #сохраняем страну, в которой была опубликована новость
    data['country'] = data['url'].str.split('.').str.get(-1)

    #делаем доп. столбцы, соотв. странам
    data['RU'] = 0
    data['UA'] = 0
    data['COM'] = 0
    data['ANOTHER'] = 0

    data.loc[data['country'] == 'ru', 'RU'] = 1
    data.loc[data['country'] == 'ua', 'UA'] = 1
    data.loc[data['country'] == 'com', 'COM'] = 1
    data.loc[(data['country'] != 'ru') & (data['country'] != 'ua') & (data['country'] != 'com') , 'ANOTHER'] = 1

    #1 - прямая речь, 0 - обычная
    data['type'] = data['type'].str.replace('indirect', '0')
    data['type'] = data['type'].str.replace('direct', '1')
    data['type'] = pd.to_numeric(data['type'])

    #хочется, чтобы настроение новости было выражено числом (+ = 1, 0 = 0, - = -1)
    data['mood'] = data['mood'].str.replace('+', '1')
    data['mood'] = data['mood'].str.replace('-', '-1')
    data['mood'] = pd.to_numeric(data['mood'])
    data

    #убираем столбцы, кот. больше не используем
    data = data.drop(['url', 'country'], axis=1)
    return data

data_train = getdata("train/news_eval_train.xml")
data_test = getdata('test/news_eval_test.xml')
data_test

Unnamed: 0_level_0,type,text,mood,RU,UA,COM,ANOTHER
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1001,1,"""Джазбэнд под руководством\nПеппе Сервилло, ко...",0,1,0,0,0
1003,1,"""Посредством этих структур\nдесяткам тысяч изб...",-1,0,0,0,1
1004,1,"""Появилось очень много бедных\nизбирателей. В ...",-1,0,0,0,1
1005,1,"""За теленовостями - главным\nисточником информ...",-1,0,0,0,1
1006,1,"""Такого раньше никогда не было,\nчтобы местные...",-1,0,0,0,1
1007,1,"""Он рассказал, что\nграбители воспользовались ...",-1,1,0,0,0
1008,1,"""Экспедиция уже завершилась.\nОна прошла в пол...",0,1,0,0,0
1009,1,"""Следственным комитетом РФ\nпостановление след...",-1,1,0,0,0
1010,1,"""Преступление, по словам\nБалгабаева, произошл...",-1,1,0,0,0
1011,1,"""Нужно учитывать не только\nрезультаты экзамен...",0,1,0,0,0


In [3]:
morph = pymorphy2.MorphAnalyzer()

In [55]:
def strtonormalform(line, should_delete = {'PREP', 'CONJ', 'PRCL', 'INTJ'}):
    ans = ''
    words = re.findall(r"[\w]+[\w|-]*", line)
    for word in words:
        lexInfo = morph.parse(word)[0]
        lex = lexInfo.normal_form
        if (lexInfo.tag.POS not in should_delete) and (re.match(r"[-+]?\d+$", lex) is None) :#союзы, предлоги и т.п. не сохраняем
            ans = ans + ' ' + lex
    return ans

data_train['text'] = data_train['text'].apply(strtonormalform)
data_test['text'] = data_test['text'].apply(strtonormalform)
data_train.to_pickle('data_train')
data_test.to_pickle('data_test')

# Если не хотим из раза пересчитывать тестовые данные - грузим их из pickle-файла

In [3]:
data_train = pd.read_pickle('data_train')
data_test = pd.read_pickle('data_test')
morph = pymorphy2.MorphAnalyzer()

#смотрим топ-25 слов
allwords = {}
i=0
for line in data_train['text']:
    #print(line)
    i+=1
    words = re.findall(r"[\w]+[\w|-]*", line)
    for w in words:
        lexInfo = morph.parse(w)[0]
        lex = lexInfo.normal_form
        allwords[w] = allwords.get(w, 0) + 1
wordnum = {}
i = 1
for w in sorted(allwords.items(), key = lambda x: -x[1]):
    wordnum[w[0]] = i
    i+=1

In [64]:
#print top 25 words
i=0
for w in sorted(allwords.items(), key = lambda x: -x[1]):
    print(w[0], w[1])
    i+=1
    if i==25: break

In [5]:
#если удаляем местоимения и числительные
SHOULD_I_DELETE_MORE = False

if SHOULD_I_DELETE_MORE:
    data_train['text'] = data_train['text'].apply(strtonormalform, should_delete ={'NUMR', 'NPRO'})
    data_test['text'] = data_test['text'].apply(strtonormalform, should_delete ={'NUMR', 'NPRO'})

In [4]:
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
def trainTestClf(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

def printF2(y_test, y_pred, vType ='tf-idf', cType='Bayess'):
    print("Vectorizing: ", vType, ". Classifier: ", cType, sep='')
    print('F2 micro score = %0.3f' % fbeta_score(y_test, y_pred, beta=2, average = 'micro'))
    print('F2 macro = %0.3f' % fbeta_score(y_test, y_pred, beta=2, average = 'macro'))
    print("Accuracy = %0.3f" % accuracy_score(y_test, y_pred))

In [6]:
#бинарная вектаризация
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary = True)
X_train = vectorizer.fit_transform(data_train.loc[:,'text'])
X_test = vectorizer.transform(data_test.loc[:,'text'])
y_train, y_test = data_train['mood'], data_test['mood']
X_train, X_test

(<3893x14204 sparse matrix of type '<class 'numpy.int64'>'
 	with 89815 stored elements in Compressed Sparse Row format>,
 <4573x14204 sparse matrix of type '<class 'numpy.int64'>'
 	with 103238 stored elements in Compressed Sparse Row format>)

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import fbeta_score

clf = MultinomialNB(alpha=0.1)
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), vType = "Boolean", cType = 'Naieve Bayess, alpha = 1')

Vectorizing: Boolean. Classifier: Naieve Bayess, alpha = 1
F2 micro score = 0.611
F2 macro = 0.587
Accuracy = 0.611


In [8]:
from sklearn import svm

clf = svm.LinearSVC()
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), vType = "Boolean", cType = 'svm')

Vectorizing: Boolean. Classifier: svm
F2 micro score = 0.581
F2 macro = 0.562
Accuracy = 0.581


In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100, random_state = 0)
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), vType = "Boolean", cType = 'RF')

Vectorizing: Boolean. Classifier: RF
F2 micro score = 0.553
F2 macro = 0.481
Accuracy = 0.553


In [10]:
#частоты
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(data_train.loc[:,'text'])
X_test = vectorizer.transform(data_test.loc[:,'text'])
y_train, y_test = data_train['mood'], data_test['mood']
X_train, X_test

(<3893x14204 sparse matrix of type '<class 'numpy.int64'>'
 	with 89815 stored elements in Compressed Sparse Row format>,
 <4573x14204 sparse matrix of type '<class 'numpy.int64'>'
 	with 103238 stored elements in Compressed Sparse Row format>)

In [11]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.3)
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), vType = "Frequency", cType = 'Naieve Bayess, alpha = 0.1')


Vectorizing: Frequency. Classifier: Naieve Bayess, alpha = 0.1
F2 micro score = 0.618
F2 macro = 0.593
Accuracy = 0.618


In [12]:
from sklearn import svm

clf = svm.LinearSVC()
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), vType = "Frequency", cType = 'svm')


Vectorizing: Frequency. Classifier: svm
F2 micro score = 0.574
F2 macro = 0.554
Accuracy = 0.574


In [13]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100, random_state = 0)
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), vType = "Frequency", cType = 'RF')

Vectorizing: Frequency. Classifier: RF
F2 micro score = 0.559
F2 macro = 0.486
Accuracy = 0.559


In [14]:
#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(data_train.loc[:,'text'])
X_test = vectorizer.transform(data_test.loc[:,'text'])
y_train, y_test = data_train['mood'], data_test['mood']
X_train, X_test


(<3893x14204 sparse matrix of type '<class 'numpy.float64'>'
 	with 89815 stored elements in Compressed Sparse Row format>,
 <4573x14204 sparse matrix of type '<class 'numpy.float64'>'
 	with 103238 stored elements in Compressed Sparse Row format>)

In [15]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.06)
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), cType = 'Naieve Bayess, alpha = 1')

Vectorizing: tf-idf. Classifier: Naieve Bayess, alpha = 1
F2 micro score = 0.609
F2 macro = 0.570
Accuracy = 0.609


In [17]:
from sklearn import svm

clf = svm.LinearSVC()
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), cType = 'svm')

Vectorizing: tf-idf. Classifier: svm
F2 micro score = 0.601
F2 macro = 0.568
Accuracy = 0.601


In [18]:
clf = RandomForestClassifier(n_estimators = 100, random_state = 0)
printF2(y_test, trainTestClf(clf, X_train, y_train, X_test), cType = 'RF')

Vectorizing: tf-idf. Classifier: RF
F2 micro score = 0.552
F2 macro = 0.482
Accuracy = 0.552


In [155]:
clf = RandomForestClassifier(n_estimators = 100, random_state = 0)
printF2(data_test['mood'], trainTestClf(clf, data_train.loc[:,['type', 'RU', 'UA', 'COM', 'ANOTHER']], data_train['mood'], data_test.loc[:,['type', 'RU', 'UA', 'COM', 'ANOTHER']]), cType = 'RF')

Vectorizing: tf-idf. Classifier: RF
F2 micro score = 0.260
F2 macro score = 0.413


  'precision', 'predicted', average, warn_for)


In [19]:
rf = RandomForestClassifier(n_estimators = 100, random_state = 0)
mnb = MultinomialNB(alpha=0.1)
svc = svm.LinearSVC()
vote = VotingClassifier(estimators = [('mnb', mnb), ('rf', rf), ('svm', svc)])
printF2(y_test, trainTestClf(vote, X_train, y_train, X_test), cType = 'voting KNN, MNB, RF')

NameError: name 'VotingClassifier' is not defined

# Далее обучаем нейросетку

In [20]:
import keras

Using TensorFlow backend.


In [88]:
from keras.preprocessing.text import Tokenizer
data_train = pd.read_pickle('data_train')
data_test = pd.read_pickle('data_test')
y_train, y_test = data_train['mood'], data_test['mood']
i=0
for line in data_train['text']:
    #print(line)
    i+=1
    words = re.findall(r"[\w]+[\w|-]*", line)
    for w in words:
        lexInfo = morph.parse(w)[0]
        lex = lexInfo.normal_form
        allwords[w] = allwords.get(w, 0) + 1
wordnum = {}
i = 1
for w in sorted(allwords.items(), key = lambda x: -x[1]):
    wordnum[w[0]] = i
    i+=1

In [89]:
#нумеруем все слова
avg_len = 0
NUMCUT = 25 #режем после 25 слова
i=1


X_train = []
X_test = []
for line in data_train['text']:
    tmp = []
    words = re.findall(r"[\w]+[\w|-]*", line)
    avg_len += len(words)
    j = 0
    while j<NUMCUT:
        for w in words:
            if w in wordnum:
                j+=1
                tmp.append(wordnum[w])
            if j == NUMCUT:
                break
    X_train.append(tmp)
    
for line in data_test['text']:
    tmp = []
    words = re.findall(r"[\w]+[\w|-]*", line)
    j = 0
    if (len(words)>0):
        while j<NUMCUT:
            for w in words:
                if w in wordnum:
                    tmp.append(wordnum[w])
                    j+=1
                if j == NUMCUT:
                    break
    else:
        for j in range(25):
            tmp.append(1)
    X_test.append(tmp)
X_test

[[215,
  5,
  14,
  22,
  636,
  2038,
  6070,
  378,
  2,
  10120,
  69,
  393,
  72,
  2429,
  3380,
  4648,
  207,
  60,
  13027,
  593,
  215,
  5,
  14,
  22,
  636],
 [13,
  547,
  920,
  66,
  588,
  710,
  192,
  1274,
  44,
  2510,
  1761,
  178,
  7,
  13,
  547,
  920,
  66,
  588,
  710,
  192,
  1274,
  44,
  2510,
  1761,
  178],
 [681,
  58,
  249,
  8094,
  588,
  1890,
  10743,
  718,
  588,
  353,
  214,
  205,
  2071,
  1375,
  1357,
  1090,
  4,
  62,
  673,
  756,
  681,
  58,
  249,
  8094,
  588],
 [71,
  210,
  235,
  3630,
  1313,
  4524,
  402,
  7,
  375,
  160,
  158,
  1842,
  71,
  210,
  235,
  3630,
  1313,
  4524,
  402,
  7,
  375,
  160,
  158,
  1842,
  71],
 [20,
  424,
  850,
  1,
  313,
  343,
  2069,
  2828,
  19,
  161,
  140,
  199,
  1485,
  3,
  7868,
  957,
  47,
  56,
  30,
  20,
  424,
  850,
  1,
  313,
  343],
 [2,
  60,
  4564,
  2204,
  1658,
  2,
  216,
  1455,
  3768,
  367,
  2501,
  66,
  67,
  7,
  101,
  41,
  2,
  60,
  4564,
  

In [30]:
len(X_train)

3893

In [90]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, Dropout, Embedding, GaussianNoise
y = keras.utils.to_categorical(y_train, num_classes=3)
yt = keras.utils.to_categorical(y_test, num_classes=3)

In [104]:
vocab=len(wordnum)
del model
model = Sequential()
model.add(Embedding(vocab,output_dim=128))
model.add(LSTM(128, dropout=0.6, recurrent_dropout=0.3))
model.add(Dense(3, activation='sigmoid'))

#model.add(Dense(100,input_dim=3893))
#model.add(Dropout(0.4))
#model.add(Dense(3, activation='sigmoid'))


model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
# Generate dummy data
import numpy as np

# Convert labels to categorical one-hot encoding


In [105]:
# Train the model, iterating on the data in batches of 32 samples
model.fit(X_train, y, epochs=50, batch_size=32, validation_split=0.1)

Train on 3503 samples, validate on 390 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x63da5978>

In [103]:
model.evaluate(X_test, yt, batch_size=32)



[2.5500021233695254, 0.57270938125450188]

In [44]:
del model

In [45]:
#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
data_train = pd.read_pickle('data_train')
data_test = pd.read_pickle('data_test')

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(data_train.loc[:,'text'])
X_test = vectorizer.transform(data_test.loc[:,'text'])
y_train, y_test = data_train['mood'], data_test['mood']
X_train, X_test

(<3893x14204 sparse matrix of type '<class 'numpy.float64'>'
 	with 89815 stored elements in Compressed Sparse Row format>,
 <4573x14204 sparse matrix of type '<class 'numpy.float64'>'
 	with 103238 stored elements in Compressed Sparse Row format>)

In [48]:
# Преобразуем данные меток в тензоры
y = keras.utils.to_categorical(y_train, num_classes=3)
yt = keras.utils.to_categorical(y_test, num_classes=3)
print(y.shape, yt.shape)

(3893, 3) (4573, 3)


In [83]:
del model
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.6))
model.add(GaussianNoise(1))
#model.add(Dense(100, activation='relu'))
#model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Generate dummy data

# Convert labels to categorical one-hot encoding


# Train the model, iterating on the data in batches of 32 samples
model.fit(X_train.toarray(), y, epochs=70, batch_size=32, validation_split = 0.1)

Train on 3503 samples, validate on 390 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70


Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x4d7f0748>

In [84]:
model.evaluate(X_test.toarray(), yt, batch_size=32)



[1.0540357939450922, 0.61469494857231277]

In [73]:
del model

NameError: name 'model' is not defined