In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, accuracy_score
from pymystem3 import Mystem
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/a.tsigankov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/a.tsigankov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
mystem = Mystem()
russian_stopwords = stopwords.words("russian")

In [3]:
def read_xml(filename):
    """Чтение XML-файла и получение pandas.DataFrame"""
    tree = ET.parse(filename)
    root = tree.getroot()
    temp = []
    for child in root:
        # child[0] - speech; child[1] - evaluation; child[2] - url
        temp.append([child[0].text.strip(), child[1].text.strip(), child[2].text.strip()])
    df = pd.DataFrame(temp)
    df.columns = ['speech', 'evaluation', 'url']
    df = df[df.evaluation.isin(['0', '+', '-'])]
    df = df.reset_index().drop('index', axis=1)
    return df

In [4]:
def rename(value):
    """Переимненование колонки с оценками в читаемый формат"""
    if value == '0':
        return ['neutral']
    if value == '+':
        return ['positive']
    if value == '-':
        return ['negative']

In [5]:
def lemmatize(sentence):
    """Лемматизирует предложение"""
    return ''.join((mystem.lemmatize(' '.join(tokenizer.tokenize(sentence))))).replace('\n', '')

In [6]:
def remove_stop_words(sentence):
    """Удаляет стоп-слова"""
    return ' '.join([word for word in sentence.split(' ') if word not in russian_stopwords])

In [7]:
train = read_xml('data/train/news_eval_train.xml')
test = read_xml('data/test/news_eval_test.xml')

In [8]:
train['part'] = 'train'
test['part'] = 'test'

In [9]:
combined = pd.concat([train, test]).reset_index().drop('index', axis=1)
combined = pd.concat([combined, pd.get_dummies(combined.evaluation)], axis=1)

combined.columns = ['speech', 'evaluation', 'url', 'part', 'positive', 'negative', 'neutral']

In [10]:
combined['evatuation_new'] = combined.evaluation.apply(rename)

In [11]:
combined['lemmatized_speech'] = combined.speech.apply(lemmatize)
combined['lemmatized_speech'] = combined.lemmatized_speech.apply(remove_stop_words)

In [12]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(combined['evatuation_new'])

y = multilabel_binarizer.transform(combined['evatuation_new'])

In [13]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8)

In [14]:
train_data = combined[combined.part == 'train']

In [15]:
# деление обучающей выборки на тренировочную и валидационную
xtrain, xval, ytrain, yval = train_test_split(combined['lemmatized_speech'], y, test_size=0.2, random_state=9)

In [16]:
tfidf_vectorizer = tfidf_vectorizer.fit(combined['lemmatized_speech'])

In [17]:
xtrain_tfidf = tfidf_vectorizer.transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [18]:
# для обучения использую логистическую регрессию
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [19]:
clf = clf.fit(xtrain_tfidf, ytrain)

In [20]:
y_pred = clf.predict(xval_tfidf)

In [21]:
y_pred_prob = clf.predict_proba(xval_tfidf)

In [22]:
# подбираю коэффициент отсечения для выбора метки класса на валидационном датасете
results = []

for t in range(250, 410):
    y_pred_new = (y_pred_prob >= t / 1000.0).astype(int)
    results.append((t / 1000.0, f1_score(yval, y_pred_new, average="micro")))

In [23]:
thresh = sorted(results, key=lambda x:x[1])[-1][0]

In [24]:
print(f'Полученный порог на валидационном датасете {thresh}')

Полученный порог на валидационном датасете 0.349


In [25]:
test = combined[combined.part == 'test']

In [26]:
test_idf = tfidf_vectorizer.transform(test['lemmatized_speech'])

In [27]:
y_pred_prob = clf.predict_proba(test_idf)

In [28]:
y_pred_new = (y_pred_prob >= thresh).astype(int)

In [29]:
y_true = multilabel_binarizer.transform(test.evatuation_new)

In [30]:
print(f'F1-macro score на тесте: {f1_score(y_true, y_pred_new, average="macro")}')
print(f'F1-micro score на тесте: {f1_score(y_true, y_pred_new, average="micro")}')
print(f'Precision score на тесте: {accuracy_score(y_true, y_pred_new)}')

F1-macro score на тесте: 0.790506478036896
F1-micro score на тесте: 0.8005346493933785
Precision score на тесте: 0.7122239230264596


In [32]:
vocabulary = tfidf_vectorizer.get_feature_names()

ТОП-10 слов, который имеют большой вес для определения позитивной новости

In [33]:
# positive
sorted(list(zip(vocabulary, clf.coef_[2])), key=lambda x: x[1])[::-1][:10]

[('хороший', 4.1414030934023165),
 ('позволять', 3.294954507574431),
 ('надеяться', 2.987278430994704),
 ('наш', 2.978806678010927),
 ('сотрудничество', 2.946343499979294),
 ('важный', 2.742419856354912),
 ('развитие', 2.6967884629904693),
 ('новый', 2.473366844508106),
 ('помогать', 2.4023673800228513),
 ('победа', 2.37215931713807)]

ТОП-10 слов, который имеют большой вес для определения нейтральной новости

In [34]:
# neutral
sorted(list(zip(vocabulary, clf.coef_[1])), key=lambda x: x[1])[::-1][:10]

[('определять', 2.340543677928967),
 ('следующий', 2.2669487589299684),
 ('рассматривать', 2.258227003755718),
 ('эксперт', 1.6787471742562015),
 ('вопрос', 1.6742516330949),
 ('пока', 1.5984329994620132),
 ('бюджет', 1.5823660775005306),
 ('категория', 1.473455051638771),
 ('съезд', 1.4571675864517424),
 ('валюта', 1.4198077599738288)]

ТОП-10 слов, который имеют большой вес для определения негативной новости

In [35]:
# negative
sorted(list(zip(vocabulary, clf.coef_[0])), key=lambda x: x[1])[::-1][:10]

[('сожаление', 2.6382776720956356),
 ('происходить', 2.619912652081225),
 ('нарушение', 2.424416076665523),
 ('преступление', 2.395404797173311),
 ('заявлять', 2.256965354539735),
 ('проблема', 2.20327008155676),
 ('причина', 2.1171192027401196),
 ('убийство', 1.9536922307752773),
 ('ситуация', 1.9509065147436642),
 ('против', 1.944501669884656)]