In [1]:
import xml.etree.cElementTree as ET
from pymystem3 import Mystem
from tqdm import tqdm
from multiprocessing import Pool
import pickle
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier

In [2]:
def preprocess_xml(filename):

    tree = ET.ElementTree(file=filename)
    root = tree.getroot()

    sentences = []
    labels = []

    for sentence in list(root.iter('document'))[0].iter('sentence'):
        speech = sentence.findall('speech')[0].text.strip()
        evaluation = sentence.findall('evaluation')[0].text.strip()

        if evaluation in {'+', '0', '-'}:
            sentences.append(speech)
            labels.append(evaluation)
    return sentences, labels

In [40]:
train_sentences, train_labels = preprocess_xml('data/train/news_eval_train.xml')

test_sentence, test_labels = preprocess_xml('data/test/news_eval_test.xml')

In [42]:
def preprocess_corups(corpus, pool_size=4):
    """
    corpus - list of sentences
    """
    m = Mystem()
    with Pool(pool_size) as p:
        lemmitized = list(tqdm(p.imap(m.lemmatize, corpus), total=len(corpus)))
        
    # docs = [[lemma for lemma in doc if (lemma not in russian_stopwords) and (lemma.isalpha() or lemma.isdigit())] for doc in lemmitized]
    docs = [[lemma for lemma in doc if lemma.isalpha() or lemma.isdigit()] for doc in lemmitized]
    return docs


In [43]:
train_sentences_preprocessed = preprocess_corups(train_sentences, pool_size=8)

100%|██████████| 3893/3893 [23:12<00:00,  2.80it/s]


In [44]:
test_sentences_preprocessed = preprocess_corups(test_sentence, pool_size=8)

100%|██████████| 4573/4573 [10:26:50<00:00,  8.22s/it]


In [45]:
SAVE_DATA = False
if SAVE_DATA:
    with open('preprocessed_data.pkl', 'wb') as output_file:
        pickle.dump((
            train_sentences_preprocessed, train_labels,
            test_sentences_preprocessed, test_labels),
        output_file)

In [3]:
with open('preprocessed_data.pkl', 'rb') as output_file:
    train_sentences_preprocessed, train_labels,\
    test_sentences_preprocessed, test_labels = pickle.load(output_file)

In [77]:
def vectorize_data(vectorizer_type='TF-iDF'):

    if vectorizer_type == 'TF-iDF':
        vectorizer = TfidfVectorizer(
            preprocessor=lambda x: x, tokenizer=lambda x: x,
            ngram_range=(1,2)
        )
    elif vectorizer_type == 'Count':
        vectorizer = CountVectorizer(
            preprocessor=lambda x: x, tokenizer=lambda x: x,
            binary=False, ngram_range=(1,2)
        )
    elif vectorizer_type == 'Binary':
        vectorizer = CountVectorizer(
            preprocessor=lambda x: x, tokenizer=lambda x: x,
            binary=True, ngram_range=(1,2)
        )
    else:
        raise ValueError(f"Wrong vectorizer_type value: {vectorizer_type}")

    X_train = vectorizer.fit_transform(train_sentences_preprocessed)
    X_test = vectorizer.transform(test_sentences_preprocessed)
    return X_train, X_test

In [85]:
def test_model(model):
    model_mlb = OneVsRestClassifier(model)
    model_mlb.fit(X_train, y_train)
    prediction = model_mlb.predict(X_test)
    train_prediction = model_mlb.predict(X_train)

    print("Train results")
    print(f"Accuracy: {accuracy_score(train_prediction, y_train):.4f}  F1-micro: {f1_score(train_prediction, y_train, average='micro'):.4f}   F1-macro: {f1_score(train_prediction, y_train, average='macro'):.4f}")

    print("Test results")
    print(f"Accuracy: {accuracy_score(prediction, y_test):.4f}  F1-micro: {f1_score(prediction, y_test, average='macro'):.4f}   F1-micro: {f1_score(prediction, y_test, average='macro'):.4f}")

## TF-iDF

In [100]:
X_train, X_test = vectorize_data('TF-iDF')

In [101]:
test_model(LinearSVC(C=1, loss='hinge'))

Train results
Accuracy: 0.9699  F1-micro: 0.9837   F1-macro: 0.9797
Test results
Accuracy: 0.4787  F1-micro: 0.4950   F1-micro: 0.4950


In [102]:
test_model(SVC(kernel='poly'))

Train results
Accuracy: 0.9938  F1-micro: 0.9969   F1-macro: 0.9960
Test results
Accuracy: 0.0013  F1-micro: 0.0021   F1-micro: 0.0021


In [103]:
test_model(SVC(kernel='rbf'))  

Train results
Accuracy: 0.9877  F1-micro: 0.9938   F1-macro: 0.9921
Test results
Accuracy: 0.3335  F1-micro: 0.3456   F1-micro: 0.3456


In [104]:
test_model(SVC(kernel='sigmoid'))

Train results
Accuracy: 0.8592  F1-micro: 0.9201   F1-macro: 0.8899
Test results
Accuracy: 0.4638  F1-micro: 0.4777   F1-micro: 0.4777


In [105]:
from sklearn.ensemble import RandomForestClassifier

test_model(RandomForestClassifier(n_estimators=1000))

Train results
Accuracy: 0.9987  F1-micro: 0.9992   F1-macro: 0.9991
Test results
Accuracy: 0.2987  F1-micro: 0.3368   F1-micro: 0.3368


In [106]:
from sklearn.linear_model import LogisticRegression

test_model(LogisticRegression(C=1))

Train results
Accuracy: 0.6047  F1-micro: 0.7499   F1-macro: 0.5823
Test results
Accuracy: 0.3523  F1-micro: 0.3652   F1-micro: 0.3652


In [107]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(base_estimator=LinearSVC(C=10),
                        n_estimators=25, random_state=0,
                        max_features=0.9)
test_model(clf)

Train results
Accuracy: 0.9959  F1-micro: 0.9979   F1-macro: 0.9974
Test results
Accuracy: 0.4811  F1-micro: 0.5035   F1-micro: 0.5035


In [109]:
import lightgbm as lgb

lgb_params = {
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 1000
}

test_model(lgb.LGBMClassifier(**lgb_params))

Train results
Accuracy: 0.8341  F1-micro: 0.8981   F1-macro: 0.8818
Test results
Accuracy: 0.4465  F1-micro: 0.5152   F1-micro: 0.5152


In [108]:
clf = BaggingClassifier(base_estimator=LogisticRegression(),
                        n_estimators=25, random_state=0,
                        max_features=0.9)
test_model(clf)

Train results
Accuracy: 0.5289  F1-micro: 0.6879   F1-macro: 0.4738
Test results
Accuracy: 0.3136  F1-micro: 0.3248   F1-micro: 0.3248


## Проверка, совпадают ли train и test распределения

Можно попытаться объяснить большую разницу в результатах для train и test наборов различием в их распределениях.

Для этого мы попытаемся обучить модель отличать тестовую выборку от тренировочной. В идеале, если обе выборки имеют одно и то же распределение, то модель не сможет обучиться.

## Частотные вектора

In [118]:
X_train, X_test = vectorize_data('Count')
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

print("\n\nLinearSVC")
test_model(LinearSVC(C=1, loss='hinge'))

print("\n\nSVC-poly")
test_model(SVC(kernel='poly'))
print("\n\nSVC-rbf")
test_model(SVC(kernel='rbf')) 
print("\n\nSVC-sigmoid") 
test_model(SVC(kernel='sigmoid'))

print("\n\nRandomForest")
test_model(RandomForestClassifier(n_estimators=1000))

print("\n\nLogisticRegression")
test_model(LogisticRegression(C=1))

print("\n\nLinearSVM-bagging")
clf = BaggingClassifier(base_estimator=LinearSVC(C=10),
                        n_estimators=25, random_state=0,
                        max_features=0.9)
test_model(clf)

print("\n\nLogisticRegression-bagging")
clf = BaggingClassifier(base_estimator=LogisticRegression(),
                        n_estimators=25, random_state=0,
                        max_features=0.9)
test_model(clf)

print("\nLGBM")
lgb_params = {
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 1000
}

test_model(lgb.LGBMClassifier(**lgb_params))



LinearSVC
Train results
Accuracy: 0.9985  F1-micro: 0.9992   F1-macro: 0.9991
Test results
Accuracy: 0.4739  F1-micro: 0.5344   F1-micro: 0.5344


SVC-poly
Train results
Accuracy: 0.4696  F1-micro: 0.6390   F1-macro: 0.5953
Test results
Accuracy: 0.0575  F1-micro: 0.1391   F1-micro: 0.1391


SVC-rbf
Train results
Accuracy: 0.8184  F1-micro: 0.8962   F1-macro: 0.8570
Test results
Accuracy: 0.3477  F1-micro: 0.3694   F1-micro: 0.3694


SVC-sigmoid
Train results
Accuracy: 0.5679  F1-micro: 0.6993   F1-macro: 0.6248
Test results
Accuracy: 0.3536  F1-micro: 0.3917   F1-micro: 0.3917


RandomForest
Train results
Accuracy: 0.9985  F1-micro: 0.9992   F1-macro: 0.9991
Test results
Accuracy: 0.2305  F1-micro: 0.2583   F1-micro: 0.2583


LogisticRegression
Train results
Accuracy: 0.9985  F1-micro: 0.9992   F1-macro: 0.9991
Test results
Accuracy: 0.4605  F1-micro: 0.5063   F1-micro: 0.5063


LinearSVM-bagging
Train results
Accuracy: 0.9920  F1-micro: 0.9957   F1-macro: 0.9948
Test results
Accura

In [119]:
X_train, X_test = vectorize_data('Binary')
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

print("\n\nLinearSVC")
test_model(LinearSVC(C=1, loss='hinge'))

print("\n\nSVC-poly")
test_model(SVC(kernel='poly'))
print("\n\nSVC-rbf")
test_model(SVC(kernel='rbf')) 
print("\n\nSVC-sigmoid") 
test_model(SVC(kernel='sigmoid'))

print("\n\nRandomForest")
test_model(RandomForestClassifier(n_estimators=1000))

print("\n\nLogisticRegression")
test_model(LogisticRegression(C=1))

print("\n\nLinearSVM-bagging")
clf = BaggingClassifier(base_estimator=LinearSVC(C=10),
                        n_estimators=25, random_state=0,
                        max_features=0.9)
test_model(clf)

print("\n\nLogisticRegression-bagging")
clf = BaggingClassifier(base_estimator=LogisticRegression(),
                        n_estimators=25, random_state=0,
                        max_features=0.9)
test_model(clf)

print("\n\LGBM")
lgb_params = {
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 1000
}

test_model(lgb.LGBMClassifier(**lgb_params))



LinearSVC
Train results
Accuracy: 0.9985  F1-micro: 0.9992   F1-macro: 0.9991
Test results
Accuracy: 0.4763  F1-micro: 0.5333   F1-micro: 0.5333


SVC-poly
Train results
Accuracy: 0.5936  F1-micro: 0.7450   F1-macro: 0.6843
Test results
Accuracy: 0.0037  F1-micro: 0.0072   F1-micro: 0.0072


SVC-rbf
Train results
Accuracy: 0.8695  F1-micro: 0.9277   F1-macro: 0.9024
Test results
Accuracy: 0.3260  F1-micro: 0.3613   F1-micro: 0.3613


SVC-sigmoid
Train results
Accuracy: 0.6807  F1-micro: 0.7949   F1-macro: 0.7027
Test results
Accuracy: 0.3842  F1-micro: 0.4078   F1-micro: 0.4078


RandomForest
Train results
Accuracy: 0.9985  F1-micro: 0.9992   F1-macro: 0.9991
Test results
Accuracy: 0.2233  F1-micro: 0.2525   F1-micro: 0.2525


LogisticRegression
Train results
Accuracy: 0.9985  F1-micro: 0.9992   F1-macro: 0.9991
Test results
Accuracy: 0.4581  F1-micro: 0.5080   F1-micro: 0.5080


LinearSVM-bagging
Train results
Accuracy: 0.9938  F1-micro: 0.9969   F1-macro: 0.9961
Test results
Accura

## Булевские вектора

## Результаты

Результаты приведены в следующей таблице:

|                               | TF-iDF | Count  | Binary |
|-------------------------------|--------|--------|--------|
| SVM-linear                    | 0.4787 | 0.4739 | 0.4763 |
| SVM-poly                      | 0.0013 | 0.0575 | 0.0037 |
| SVM-rbf                       | 0.3335 | 0.3477 | 0.3260 |
| SVM-sigmoid                   | 0.4638 | 0.3536 | 0.3842 |
| RandomForest                  | 0.2987 | 0.2305 | 0.2233 |
| LogisticRegression            | 0.3523 | 0.4605 | 0.4581 |
| SVM-linear-bagging-25         | 0.4811 | 0.4520 | 0.4522 |
| LogisticRegression-bagging-25 | 0.3136 | 0.4238 | 0.4207 |
| LGBM                          | 0.4465 | 0.4295 | 0.4317 |

Ни один из классификаторов не побил бейзлайн. Лучше всего себя показал линейный SVM.

Большинство модели получили на тренировочной выборке точность, очень близкую к 1.

### TF-iDF

In [81]:
import scipy.stats as st
from sklearn.model_selection import cross_val_score, cross_validate
import scipy.sparse as sp

X_train, X_test = vectorize_data(vectorizer_type='TF-iDF')

X_full = sp.vstack([X_train, X_test], format='csr')
x_is_train = np.hstack([np.ones(X_train.shape[0]), np.zeros(X_test.shape[0])])

model = LinearSVC()

cross_val_resuls = cross_validate(model, X_full, x_is_train, cv=5, scoring=['accuracy'])

data = cross_val_resuls['test_accuracy']

interval = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data)) 
print(f"Accuracy 95% confidience interval: ({interval[0]:.4f} {interval[1]:.4f})")

print(f"Accuracy score with constant predictor: {accuracy_score(x_is_train, np.zeros_like(x_is_train)):.4f}")

Accuracy 95% confidience interval: (0.7649 0.8486)
Accuracy score with constant predictor: 0.5402


### Частотные вектора

In [82]:
X_train, X_test = vectorize_data(vectorizer_type='Count')

X_full = sp.vstack([X_train, X_test], format='csr')
x_is_train = np.hstack([np.ones(X_train.shape[0]), np.zeros(X_test.shape[0])])

model = LinearSVC()

cross_val_resuls = cross_validate(model, X_full, x_is_train, cv=5, scoring=['accuracy'])

data = cross_val_resuls['test_accuracy']

interval = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data)) 
print(f"Accuracy 95% confidience interval: ({interval[0]:.4f} {interval[1]:.4f})")

print(f"Accuracy score with constant predictor: {accuracy_score(x_is_train, np.zeros_like(x_is_train)):.4f}")

Accuracy 95% confidience interval: (0.6823 0.7590)
Accuracy score with constant predictor: 0.5402


### Булевские вектора

In [83]:
X_train, X_test = vectorize_data(vectorizer_type='Binary')

X_full = sp.vstack([X_train, X_test], format='csr')
x_is_train = np.hstack([np.ones(X_train.shape[0]), np.zeros(X_test.shape[0])])

model = LinearSVC()

cross_val_resuls = cross_validate(model, X_full, x_is_train, cv=5, scoring=['accuracy'])

data = cross_val_resuls['test_accuracy']

interval = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data)) 
print(f"Accuracy 95% confidience interval: ({interval[0]:.4f} {interval[1]:.4f})")

print(f"Accuracy score with constant predictor: {accuracy_score(x_is_train, np.zeros_like(x_is_train)):.4f}")

Accuracy 95% confidience interval: (0.6859 0.7665)
Accuracy score with constant predictor: 0.5402


Линейный SVM смог добиться точности значительно лучше, чем константная модель. Это указывает на то, что train и test наборы значительно отличаются друг от друга.