In [1]:
import re

from bs4 import BeautifulSoup
from html import unescape

from tqdm import tqdm

## Загрузка XML файлов и извлечение предложений

In [2]:
def load_xml_dataset(filename):
    """Read an annotated XML file and return its labelled sentences.

    The file is decoded as windows-1251 and parsed with BeautifulSoup's
    ``lxml`` parser. Each ``<sentence>`` element becomes a dict with the keys
    ``id``, ``speach`` (spelling kept as-is), ``data``, ``eval`` and ``url``.
    Sentences whose evaluation is not one of ``+``, ``-`` or ``0`` are
    dropped from the result.
    """
    with open(filename, mode='r', encoding='windows-1251') as xml_file:
        markup = xml_file.read()

    document = BeautifulSoup(markup, 'lxml')
    allowed_labels = {'+', '-', '0'}

    records = []
    for sentence in document.find_all('sentence'):
        speech = sentence.find('speech')
        # All fields are extracted before filtering, so a malformed sentence
        # fails loudly regardless of its label.
        record = dict(
            id=sentence['id'],
            speach=speech['type'],
            data=speech.text.strip(),
            eval=sentence.find('evaluation').text.strip(),
            url=unescape(sentence.find('url').text.strip()),
        )
        if record['eval'] in allowed_labels:
            records.append(record)
    return records

In [3]:
# Load the train and test splits (in that order) with the same loader.
data_train, data_test = map(load_xml_dataset, (
    'news_sentiment_romip2012/train/news_eval_train.xml',
    'news_sentiment_romip2012/test/news_eval_test.xml',
))

## Морфологическая предобработка предложений

In [4]:
import re

from functools import lru_cache

from pymorphy2 import MorphAnalyzer
from nltk.stem import WordNetLemmatizer

# Shared analyzer instances, created once and reused by morph_process:
# ru_morph handles tokens that contain Cyrillic letters, en_morph lemmatizes
# the remaining (non-Cyrillic, non-numeric) tokens.
ru_morph = MorphAnalyzer()
en_morph = WordNetLemmatizer()

# Matches any Russian Cyrillic letter. "ё"/"Ё" are listed explicitly because
# they lie outside the contiguous а-я / А-Я code-point ranges.
CYRILLIC_PATTERN = re.compile(r'[а-яА-ЯёЁ]')
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
DIGIT_PATTERN = re.compile(r'\d+')

def has_cyrillic(text):
    """Return True if ``text`` contains at least one Cyrillic letter."""
    return bool(CYRILLIC_PATTERN.search(text))

def has_numeric(text):
    """Return True if ``text`` contains at least one digit."""
    return bool(DIGIT_PATTERN.search(text))

@lru_cache(maxsize=1500)
def morph_process(token):
    """Normalise a single token; results are memoised (up to 1500 entries).

    * Tokens containing a Cyrillic letter are replaced by the normal form of
      the first parse returned by ``ru_morph``.
    * Other tokens containing a digit are returned unchanged.
    * Everything else is lemmatized with ``en_morph``.
    """
    if has_cyrillic(token):
        first_parse = ru_morph.parse(token)[0]
        return first_parse.normal_form
    if has_numeric(token):
        return token
    return en_morph.lemmatize(token)
    
def morph_sentence(sentence):
    """Split a sentence into normalised tokens.

    The sentence is lower-cased once. The returned list holds every run of
    digits first, followed by every alphabetic word passed through
    ``morph_process``; the relative order of numbers and words in the
    original sentence is therefore not preserved.
    """
    lowered = sentence.lower()
    # Raw-string patterns: '\d' and '\W' in plain literals are invalid escape
    # sequences and emit warnings on recent Python versions.
    tokens = re.findall(r'\d+', lowered)
    # Words are runs of characters that are neither non-word, digit nor "_".
    words = re.findall(r'[^\W\d_]+', lowered)
    tokens.extend(map(morph_process, words))
    return tokens

In [5]:
from nltk.corpus import stopwords

ru_stopwords, en_stopwords = (
    stopwords.words(language) for language in ('russian', 'english')
)

# NOTE: this rebinds `stopwords` from the imported NLTK corpus reader to the
# merged set of Russian and English stop words.
stopwords = {*ru_stopwords, *en_stopwords}

## Подготовка векторов

In [6]:
import numpy as np

In [7]:
# Each sentence becomes a single space-joined string of normalised tokens,
# which is the input format expected by the vectorizers' fit/transform.
sentences_train, sentences_test = (
    [' '.join(morph_sentence(record['data'])) for record in split]
    for split in (data_train, data_test)
)

In [8]:
def modify_label(s):
    """Map a sentiment mark ('+', '0', '-') to 1, 0 or -1.

    Raises ``KeyError`` for any other value.
    """
    label_to_int = dict(zip('+0-', (1, 0, -1)))
    return label_to_int[s]

# Integer target vectors built from each record's 'eval' mark.
y_train, y_test = (
    np.asarray(list(map(modify_label, (record['eval'] for record in split))))
    for split in (data_train, data_test)
)

In [9]:
# Class distribution of the training labels: {label: count}.
train_labels, train_counts = np.unique(y_train, return_counts=True)
dict(zip(train_labels, train_counts))

{-1: 1864, 0: 914, 1: 1115}

In [10]:
# Class distribution of the test labels: {label: count}.
test_labels, test_counts = np.unique(y_test, return_counts=True)
dict(zip(test_labels, test_counts))

{-1: 1890, 0: 1235, 1: 1448}

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [12]:
# scikit-learn validates `stop_words` as 'english', a list or None, so the
# stop-word collection is converted to a list (sorted to keep it deterministic).
stop_words_list = sorted(stopwords)

# Vectorizers keyed by (weighting scheme, stop-word handling); a second key
# element of None means that no stop words are removed.
vectorizer = {
    ('binary', None):        CountVectorizer(binary=True),
    ('binary', 'stopwords'): CountVectorizer(binary=True, stop_words=stop_words_list),
    ('count', None):         CountVectorizer(binary=False),
    ('count', 'stopwords'):  CountVectorizer(binary=False, stop_words=stop_words_list),
    ('tfidf', None):         TfidfVectorizer(),
    ('tfidf', 'stopwords'):  TfidfVectorizer(stop_words=stop_words_list),
}

# Vocabularies (and IDF weights) are learned from the training sentences only.
for v in vectorizer.values():
    v.fit(sentences_train)

## ML-модели

In [13]:
from sklearn.metrics import f1_score

In [14]:
def print_scores(scores, type_):
    """Print an aligned table of F1 scores for one averaging mode.

    ``scores`` maps 3-element keys ``(vector, stopwords_flag, average)`` to
    floats. Only entries whose third key element equals ``type_`` are
    printed, in the mapping's iteration order. The stop-word column shows the
    result of comparing the second key element with ``'stopwords'``.
    """
    header_template = r"{:<6} | {:>9} | {:>9}"
    row_template = r"{:<6} | {:>9} | {:>10.6f}"

    print(header_template.format('vector', 'stopwords', 'f1-measure'))
    print('-' * 31)

    for key, value in scores.items():
        if key[2] != type_:
            continue
        uses_stopwords = key[1] == 'stopwords'
        print(row_template.format(key[0], uses_stopwords, value))

### Логистическая регрессия

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
from sklearn.preprocessing import normalize

In [17]:
%%time

# F1 scores of one-vs-rest logistic regression for every vectorizer variant,
# keyed by (vector, stopwords_flag, average).
scores = {}

for key, vec in vectorizer.items():
    # Rows are L2-normalised after vectorization, for train and test alike.
    X_train = normalize(vec.transform(sentences_train), norm='l2')
    X_test = normalize(vec.transform(sentences_test), norm='l2')

    cl = LogisticRegression(multi_class='ovr', max_iter=10_000, C=10)
    y_pred = cl.fit(X_train, y_train).predict(X_test)

    scores.update({
        (*key, 'micro'): f1_score(y_test, y_pred, average='micro'),
        (*key, 'macro'): f1_score(y_test, y_pred, average='macro'),
    })

CPU times: user 8.2 s, sys: 351 ms, total: 8.55 s
Wall time: 5.57 s


In [18]:
print_scores(scores, 'micro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.617538
binary |         1 |   0.605729
count  |         0 |   0.610321
count  |         1 |   0.600700
tfidf  |         0 |   0.616882
tfidf  |         1 |   0.608791


In [19]:
print_scores(scores, 'macro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.581217
binary |         1 |   0.571044
count  |         0 |   0.572949
count  |         1 |   0.565213
tfidf  |         0 |   0.577080
tfidf  |         1 |   0.569897


### Naive-Bayes

In [20]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [21]:
%%time

# F1 scores of Naive Bayes classifiers, keyed by (vector, stopwords_flag, average).
scores = {}

for key, vec in vectorizer.items():
    vector_kind = key[0]
    # TF-IDF features are skipped: only binary and count vectors are evaluated.
    if vector_kind == 'tfidf':
        continue

    # Binary features go to the Bernoulli model (binarize=None: the input is
    # used as-is, with no thresholding); count features go to the multinomial one.
    cl = BernoulliNB(binarize=None) if vector_kind == 'binary' else MultinomialNB()

    X_train = vec.transform(sentences_train)
    X_test = vec.transform(sentences_test)

    y_pred = cl.fit(X_train, y_train).predict(X_test)

    for average in ('micro', 'macro'):
        scores[(*key, average)] = f1_score(y_test, y_pred, average=average)

CPU times: user 1.99 s, sys: 31.9 ms, total: 2.02 s
Wall time: 2.03 s


In [22]:
print_scores(scores, 'micro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.600481
binary |         1 |   0.587361
count  |         0 |   0.637656
count  |         1 |   0.624535


In [23]:
print_scores(scores, 'macro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.529937
binary |         1 |   0.517345
count  |         0 |   0.591538
count  |         1 |   0.586111


### SVM

In [24]:
from sklearn.svm import LinearSVC

In [25]:
%%time

# F1 scores of a linear SVM for every vectorizer variant,
# keyed by (vector, stopwords_flag, average).
scores = {}

for k, v in vectorizer.items():
    # random_state is fixed because LinearSVC's solver draws on a pseudo-random
    # number generator; without a seed the scores can differ slightly between
    # runs. The seed value matches the one used for the MLP experiment.
    cl = LinearSVC(max_iter=100_000, C=10, random_state=4444)

    # Rows are L2-normalised after vectorization, for train and test alike.
    X_train = normalize(v.transform(sentences_train), norm='l2')
    X_test  = normalize(v.transform(sentences_test), norm='l2')

    cl.fit(X_train, y_train)
    y_pred = cl.predict(X_test)

    scores[(*k, 'micro')] = f1_score(y_test, y_pred, average='micro')
    scores[(*k, 'macro')] = f1_score(y_test, y_pred, average='macro')

CPU times: user 9.16 s, sys: 71.9 ms, total: 9.23 s
Wall time: 9.29 s


In [26]:
print_scores(scores, 'micro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.590859
binary |         1 |   0.586049
count  |         0 |   0.596764
count  |         1 |   0.582768
tfidf  |         0 |   0.597638
tfidf  |         1 |   0.586923


In [27]:
print_scores(scores, 'macro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.565042
binary |         1 |   0.563389
count  |         0 |   0.571277
count  |         1 |   0.559259
tfidf  |         0 |   0.570190
tfidf  |         1 |   0.560241


### Нейронная сеть

In [28]:
from sklearn.neural_network import MLPClassifier

In [29]:
from IPython.display import clear_output

In [30]:
%%time

# F1 scores of a multi-layer perceptron for every vectorizer variant,
# keyed by (vector, stopwords_flag, average).
scores = {}

for key, vec in vectorizer.items():
    # Rows are L2-normalised after vectorization, for train and test alike.
    X_train = normalize(vec.transform(sentences_train), norm='l2')
    X_test = normalize(vec.transform(sentences_test), norm='l2')

    cl = MLPClassifier(
        hidden_layer_sizes=(1000, 200),
        early_stopping=True,
        max_iter=20,
        random_state=4444,
        verbose=1,
    )
    y_pred = cl.fit(X_train, y_train).predict(X_test)

    for average in ('micro', 'macro'):
        scores[(*key, average)] = f1_score(y_test, y_pred, average=average)

    # Discard the verbose training log once this variant has been scored.
    clear_output()

CPU times: user 7min 52s, sys: 6min 6s, total: 13min 59s
Wall time: 12min 14s


In [31]:
print_scores(scores, 'micro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.636781
binary |         1 |   0.620818
count  |         0 |   0.630658
count  |         1 |   0.620162
tfidf  |         0 |   0.632845
tfidf  |         1 |   0.617975


In [32]:
print_scores(scores, 'macro')

vector | stopwords | f1-measure
-------------------------------
binary |         0 |   0.593978
binary |         1 |   0.574983
count  |         0 |   0.591020
count  |         1 |   0.575492
tfidf  |         0 |   0.595804
tfidf  |         1 |   0.569853
