In [88]:
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from pymystem3 import Mystem
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/a.tsigankov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/a.tsigankov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [89]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
mystem = Mystem()
russian_stopwords = stopwords.words("russian")

In [58]:
def read_xml(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    temp = []
    for child in root:
        # child[0] - speech; child[1] - evaluation; child[2] - url
        temp.append([child[0].text.strip(), child[1].text.strip(), child[2].text.strip()])
    df = pd.DataFrame(temp)
    df.columns = ['speech', 'evaluation', 'url']
    df = df[df.evaluation.isin(['0', '+', '-'])]
    df = df.reset_index().drop('index', axis=1)
    return df

In [120]:
def rename(value):
    if value == '0':
        return ['neutral']
    if value == '+':
        return ['positive']
    if value == '-':
        return ['negative']

In [178]:
def lemmatize(sentence):
    return ''.join((mystem.lemmatize(' '.join(tokenizer.tokenize(sentence))))).replace('\n', '')

In [179]:
def remove_stop_words(sentence):
    return ' '.join([word for word in sentence.split(' ') if word not in russian_stopwords])

In [59]:
train = read_xml('data/train/news_eval_train.xml')
test = read_xml('data/test/news_eval_test.xml')

In [63]:
train['part'] = 'train'
test['part'] = 'test'

In [67]:
combined = pd.concat([train, test]).reset_index().drop('index', axis=1)
combined = pd.concat([combined, pd.get_dummies(combined.evaluation)], axis=1)
combined.columns = ['speech', 'evaluation', 'url', 'part', 'positive', 'negative', 'neutral']

In [121]:
combined['evatuation_new'] = combined.evaluation.apply(rename)

In [97]:
combined['lemmatized_speech'] = combined.speech.apply(lemmatize)
combined['lemmatized_speech'] = combined.lemmatized_speech.apply(remove_stop_words)

In [122]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(combined['evatuation_new'])

y = multilabel_binarizer.transform(combined['evatuation_new'])

In [125]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8)

In [128]:
train_data = combined[combined.part == 'train']

In [130]:
xtrain, xval, ytrain, yval = train_test_split(combined['lemmatized_speech'], y, test_size=0.2, random_state=9)

In [181]:
tfidf_vectorizer = tfidf_vectorizer.fit(combined['lemmatized_speech'])

In [182]:
xtrain_tfidf = tfidf_vectorizer.transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [134]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [188]:
# fit model on train data
clf = clf.fit(xtrain_tfidf, ytrain)

In [189]:
# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

In [190]:
y_pred_prob = clf.predict_proba(xval_tfidf)

In [191]:
# evaluate performance
results = []

for t in range(250, 410):
    y_pred_new = (y_pred_prob >= t / 1000.0).astype(int)
    results.append((t / 1000.0, f1_score(yval, y_pred_new, average="micro")))

In [195]:
sorted(results, key=lambda x:x[1])[-1]

(0.349, 0.6735491071428572)

In [197]:
thresh = 0.349

In [198]:
test = combined[combined.part == 'test']

In [199]:
test_idf = tfidf_vectorizer.transform(test['lemmatized_speech'])

In [200]:
y_pred_prob = clf.predict_proba(test_idf)

In [201]:
y_pred_new = (y_pred_prob >= t).astype(int)

In [203]:
y_true = multilabel_binarizer.transform(test.evatuation_new)

In [204]:
f1_score(y_true, y_pred_new, average="micro")

0.8005346493933785

In [205]:
y_pred_new

array([[0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 0, 1],
       [1, 1, 0],
       [1, 0, 0]])

In [206]:
pd.Series(multilabel_binarizer.inverse_transform(y_pred_new))

0               (positive,)
1               (negative,)
2               (negative,)
3               (negative,)
4               (negative,)
               ...         
4568    (neutral, positive)
4569            (positive,)
4570            (positive,)
4571    (negative, neutral)
4572            (negative,)
Length: 4573, dtype: object

In [207]:
pd.Series(multilabel_binarizer.inverse_transform(y_true))

0        (neutral,)
1       (negative,)
2       (negative,)
3       (negative,)
4       (negative,)
           ...     
4568    (positive,)
4569    (positive,)
4570    (positive,)
4571     (neutral,)
4572    (negative,)
Length: 4573, dtype: object