In [1]:
# 1. Выделить из корпуса по 500 примеров на каждый класс, если классификация бинарная и по 200, если классов несколько.
import logging
import ast
CORPUS_PATH = 'data/corpus.txt'

log = logging.getLogger()

POS_MARK = 1
NEG_MARK = 0

def setup_logger():
    logging.basicConfig(format='%(levelname)s - %(asctime)s - %(message)s')
    log.setLevel(logging.INFO)
    
def read_data(corpus_path):
    with open(corpus_path, 'r') as f:
        log.info("Reading file...")

        reviews = []
        marks = []

        lines = f.readlines()
        lines = [x.strip() for x in lines]

        for line in lines:
            dictionary = ast.literal_eval(line)
            reviews.append(dictionary['description'])

            recom_author_mark = dictionary['recom_author_mark']
            marks.append(parse_label(recom_author_mark))

    return reviews, marks


def parse_label(recom_author_mark):
    if recom_author_mark == 'ДА':
        return POS_MARK
    elif recom_author_mark == '':
        return NEG_MARK
    else:
        raise Exception("Unknown recom_author_mark")


def split_reviews(reviews, marks, n=500):
    pos_reviews = []
    neg_reviews = []

    if len(reviews) < n or n == -1:
        n = len(reviews)

    log.info(f"Split reviews to {n} positive and {n} negative.")

    for index, review in enumerate(reviews):
        if marks[index] == POS_MARK and len(pos_reviews) < n:
            pos_reviews.append(review)
        elif marks[index] == NEG_MARK and len(neg_reviews) < n:
            neg_reviews.append(review)

        if len(pos_reviews) == n and len(neg_reviews) == n:
            break

    return pos_reviews, neg_reviews

setup_logger()

reviews, marks = read_data(CORPUS_PATH)
pos_reviews, neg_reviews = split_reviews(reviews, marks)

y_pos = [1] * len(pos_reviews)
y_neg = [0] * len(neg_reviews)

INFO - 2019-05-14 18:19:13,267 - Reading file...
INFO - 2019-05-14 18:19:33,418 - Split reviews to 500 positive and 500 negative.


In [2]:
# 2. Разбить примеры на обучающую и тестовую выборки в соотношении 80% к 20% соответственно.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(pos_reviews + neg_reviews, y_pos + y_neg, test_size=0.2)

In [3]:
# 3. Реализовать классификатор на основе tf-idf и модели логистической регрессии, вывести метрики качества на тестовом множестве. Максимальное кол-во признаков: 50000, минимальная частота: 5.
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from pymystem3 import Mystem
from nltk.corpus import stopwords

def fit_and_print_score(x_train, y_train, x_test, y_test):
    model = LogisticRegression(solver="lbfgs")
    model.fit(x_train, y_train)
    print_score_model(model, x_test, y_test)
    
def print_score_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    metrics = precision_recall_fscore_support(y_pred=y_pred, y_true=y_test, average='binary', pos_label=POS_MARK)
    log.info(f'Precision: {round(metrics[0], 3)}, recall: {round(metrics[1], 3)}, f-measure: {round(metrics[2], 3)}')
    
def tokenize(document):
    ignore = set(stopwords.words('russian'))
    stem = Mystem()

    tokens = stem.lemmatize(document)

    tokens = [w.lower() for w in tokens if w not in ignore]
    tokens = [w for w in tokens if w not in string.punctuation]
    tokens = [w for w in tokens if w.isalpha()]

    return tokens

vectorizer = TfidfVectorizer(max_features=50000, min_df=5, tokenizer=tokenize)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

fit_and_print_score(X_train_vect.toarray(), y_train, X_test_vect.toarray(), y_test)

INFO - 2019-05-14 18:31:30,393 - Precision: 0.857, recall: 0.824, f-measure: 0.84


In [4]:
# 4. Вывести наиболее значимые признаки (токены), используя один из указанных методов [3, 4]
from sklearn.ensemble import ExtraTreesClassifier

def feature_importances(X, Y):
    model = ExtraTreesClassifier()
    model.fit(X, Y)
    return model.feature_importances_

def tokenize_documents_extend(documents):
    texts = []

    for document in documents:
        w = tokenize(document)
        texts.extend(w)

    return texts

def print_most_valuable_features(tokens, tokens_importances, n=100):
    tokens_importances, tokens = zip(*sorted(zip(tokens_importances, tokens), reverse=True))

    print(tokens[:n])
    print(tokens_importances[:n])

log.info('Features importance:')
token_importances = feature_importances(X_train_vect.toarray(), y_train)
word_arr = tokenize_documents_extend(X_train)
print_most_valuable_features(word_arr, token_importances)

INFO - 2019-05-14 18:31:30,405 - Features importance:


('мат', 'рд', 'бесплатно', 'измерение', 'медперсонал', 'врач', 'нагрубить', 'тихий', 'ребенок', 'особо', 'сказать', 'вежливый', 'туда', 'слезать', 'очень', 'вместе', 'ничто', 'линза', 'вывод', 'душ', 'плюс', 'право', 'рождество', 'предупреждать', 'обход', 'гуляля', 'уходить', 'именно', 'программа', 'внушать', 'шесть', 'недоплачивать', 'паспорт', 'действие', 'анализ', 'договор', 'спрашивать', 'считать', 'проходить', 'тысяча', 'орать', 'нужный', 'дело', 'приходить', 'плохой', 'ждать', 'платить', 'жаль', 'услышать', 'медсестра', 'платить', 'внутренне', 'составлять', 'мучать', 'роддом', 'искать', 'наличие', 'успокаивать', 'воронеж', 'выписка', 'порядок', 'плановый', 'роддом', 'начинать', 'ожидать', 'беременность', 'запись', 'узел', 'этаж', 'лечить', 'человек', 'идти', 'договор', 'сервис', 'печение', 'приносить', 'лететь', 'направление', 'запрещать', 'мед', 'раковина', 'момент', 'замериваться', 'ответ', 'заглядывать', 'ярославовна', 'рожать', 'сожаление', 'бокс', 'род', 'область', 'холл', '

In [5]:
# 5. Реализовать не менее 5-ти собственных признаков (можно больше), улучшающих результаты классификатора, полученные с использованием признака tf-idf. Оценить поочередно качество классификатора, обученного на tf-idf и каждом из признаков [5].

# Features char ngram
import numpy as np

def get_char_ngram_feature(x_train, x_test):
    vectorizer = TfidfVectorizer(max_features=50000, min_df=5, analyzer='char', ngram_range=(3, 3))
    X_train_vect_char_ngram = vectorizer.fit_transform(x_train)
    X_test_vect_char_ngram = vectorizer.transform(x_test)

    return X_train_vect_char_ngram, X_test_vect_char_ngram


def get_word_ngram_feature(x_train, x_test):
    vectorizer = TfidfVectorizer(max_features=50000, min_df=5, analyzer='word', ngram_range=(3, 3))
    X_train_vect_word_ngram = vectorizer.fit_transform(x_train)
    X_test_vect_word_ngram = vectorizer.transform(x_test)

    return X_train_vect_word_ngram, X_test_vect_word_ngram

log.info('Features char ngram')
X_train_vect_char_ngram, X_test_vect_char_ngram = get_char_ngram_feature(X_train, X_test)

train_matrix = np.append(X_train_vect.toarray(), X_train_vect_char_ngram.toarray(), axis=1)
test_matrix = np.append(X_test_vect.toarray(), X_test_vect_char_ngram.toarray(), axis=1)

fit_and_print_score(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-14 18:34:56,852 - Features char ngram
INFO - 2019-05-14 18:34:58,377 - Precision: 0.91, recall: 0.816, f-measure: 0.861


In [6]:
# Features word ngram
log.info('Features word ngram')
X_train_vect_word_ngram, X_test_vect_word_ngram = get_word_ngram_feature(X_train, X_test)

train_matrix = np.append(X_train_vect.toarray(), X_train_vect_word_ngram.toarray(), axis=1)
test_matrix = np.append(X_test_vect.toarray(), X_test_vect_word_ngram.toarray(), axis=1)

fit_and_print_score(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-14 18:35:05,063 - Features word ngram
INFO - 2019-05-14 18:35:07,034 - Precision: 0.896, recall: 0.793, f-measure: 0.841


In [7]:
# Feature word count

def tokenize_documents_append(documents):
    texts = []

    for document in documents:
        w = tokenize(document)
        texts.append(w)

    return texts

def get_documents_tokens(x_train, x_test):
    train_tokens = tokenize_documents_append(x_train)
    test_tokens = tokenize_documents_append(x_test)
    return train_tokens, test_tokens

def get_word_count_feature(train_tokens, test_tokens):
    train_word_counts = [len(tokens) for tokens in train_tokens]
    train_word_counts = np.reshape(train_word_counts, (-1, 1))

    test_word_counts = [len(tokens) for tokens in test_tokens]
    test_word_counts = np.reshape(test_word_counts, (-1, 1))

    return train_word_counts, test_word_counts

log.info('Feature word count')
train_tokens, test_tokens = get_documents_tokens(X_train, X_test)

train_word_counts, test_word_counts = get_word_count_feature(train_tokens, test_tokens)

train_matrix = np.append(X_train_vect.toarray(), train_word_counts, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_word_counts, axis=1)

fit_and_print_score(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-14 18:36:31,899 - Feature word count
INFO - 2019-05-14 18:49:09,872 - Precision: 0.877, recall: 0.816, f-measure: 0.845


In [8]:
# Feature characters count

def get_char_count_feature(X_train, X_test):
    train_char_counts = [len(document) for document in X_train]
    train_char_counts = np.reshape(train_char_counts, (-1, 1))

    test_char_counts = [len(document) for document in X_test]
    test_char_counts = np.reshape(test_char_counts, (-1, 1))

    return train_char_counts, test_char_counts

log.info('Feature characters count')
train_char_counts, test_char_counts = get_char_count_feature(X_train, X_test)

train_matrix = np.append(X_train_vect.toarray(), train_char_counts, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_char_counts, axis=1)

fit_and_print_score(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-14 18:49:10,101 - Feature characters count
INFO - 2019-05-14 18:49:10,165 - Precision: 0.888, recall: 0.816, f-measure: 0.845


In [9]:
# Feature brackets count

def get_bracket_count_feature(X_train, X_test):
    train_bracket_counts = [document.count(')') - document.count('(') for document in X_train]
    train_bracket_counts = np.reshape(train_bracket_counts, (-1, 1))

    test_bracket_counts = [document.count(')') - document.count('(') for document in X_test]
    test_bracket_counts = np.reshape(test_bracket_counts, (-1, 1))

    return train_bracket_counts, test_bracket_counts

log.info('Feature brackets count')
train_bracket_counts, test_bracket_counts = get_bracket_count_feature(X_train, X_test)

train_matrix = np.append(X_train_vect.toarray(), train_bracket_counts, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_bracket_counts, axis=1)

fit_and_print_score(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-14 18:49:10,408 - Feature brackets count
INFO - 2019-05-14 18:49:10,603 - Precision: 0.855, recall: 0.828, f-measure: 0.841


In [10]:
# Feature Sentimental

import codecs

POSITIVE_PATH = 'data/positive.txt'
NEGATIVE_PATH = 'data/negative.txt'


def get_sentimental_feature(train_tokens, test_tokens):
    positive_words = read_words(POSITIVE_PATH)
    negative_words = read_words(NEGATIVE_PATH)

    train_sentimental_values = sentimental_values(train_tokens, positive_words, negative_words)
    train_sentimental_values = np.reshape(train_sentimental_values, (-1, 1))

    test_sentimental_values = sentimental_values(test_tokens, positive_words, negative_words)
    test_sentimental_values = np.reshape(test_sentimental_values, (-1, 1))

    return train_sentimental_values, test_sentimental_values

def read_words(filepath):
    with codecs.open(filepath, 'r', 'utf-8') as f:
        words = f.readlines()
        words = [x.strip() for x in words]
        return set(words)

def sentimental_values(documents, positive_words, negative_words):
    sentimental_values = []

    for document in documents:
        sentimental_value = 0
        for token in document:
            if token in positive_words:
                sentimental_value += 1
            elif token in negative_words:
                sentimental_value -= 1

        sentimental_values.append(sentimental_value)

    return sentimental_values

log.info('Feature Sentimental')
train_sentimental_values, test_sentimental_values = get_sentimental_feature(train_tokens, test_tokens)

train_matrix = np.append(X_train_vect.toarray(), train_sentimental_values, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_sentimental_values, axis=1)

fit_and_print_score(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-14 18:49:10,683 - Feature Sentimental
INFO - 2019-05-14 18:51:11,055 - Precision: 0.815, recall: 0.839, f-measure: 0.825


In [11]:
# 6. Оценить качество классификатора, обученного на tf-idf и всех реализованных признаках.

# All features
log.info('All futures')
train_matrix = np.append(X_train_vect.toarray(), X_train_vect_char_ngram.toarray(), axis=1)
test_matrix = np.append(X_test_vect.toarray(), X_test_vect_char_ngram.toarray(), axis=1)

train_matrix = np.append(train_matrix, X_train_vect_word_ngram.toarray(), axis=1)
test_matrix = np.append(test_matrix, X_test_vect_word_ngram.toarray(), axis=1)

train_matrix = np.append(train_matrix, train_word_counts, axis=1)
test_matrix = np.append(test_matrix, test_word_counts, axis=1)

train_matrix = np.append(train_matrix, train_char_counts, axis=1)
test_matrix = np.append(test_matrix, test_char_counts, axis=1)

train_matrix = np.append(train_matrix, train_bracket_counts, axis=1)
test_matrix = np.append(test_matrix, test_bracket_counts, axis=1)

fit_and_print_score(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-14 18:51:11,070 - All futures
INFO - 2019-05-14 18:52:12,167 - Precision: 0.862, recall: 0.793, f-measure: 0.826


In [12]:
# 7. С помощью GridSearch вывести наиболее значимые признаки [4].
from sklearn.model_selection import GridSearchCV

def feature_importances_gridsearch(X, Y):
    model = GridSearchCV(estimator=ExtraTreesClassifier(), param_grid={})
    model.fit(X, Y)
    return model.best_estimator_.feature_importances_

log.info('GridSearchCV features importance:')
token_importances = feature_importances_gridsearch(train_matrix, y_train)
print_most_valuable_features(word_arr, token_importances)


INFO - 2019-05-14 18:52:12,176 - GridSearchCV features importance:


('приходиться', 'посмотреть', 'новый', 'рд', 'свой', 'нагрубить', 'наступление', 'рекомендовать', 'никакой', 'деньги', 'бесплатно', 'измерение', 'хирург', 'стоматолог', 'отделение', 'обращаться', 'травма', 'условие', 'род', 'широдхар', 'жалоба', 'навыкат', 'сумма', 'мужик', 'камень', 'ночь', 'плюс', 'минус', 'кстати', 'равный', 'врач', 'специальный', 'поддерживать', 'тихий', 'ждать', 'вечно', 'родственник', 'процедура', 'город', 'приезжать', 'смысл', 'кушать', 'полечить', 'увы', 'линза', 'говорить', 'подарить', 'отдел', 'коммерческий', 'хотя', 'говорить', 'ребенок', 'стоматология', 'прерикаться', 'аккуратно', 'сказать', 'возле', 'чуткий', 'больший', 'е', 'тарпан', 'зуб', 'июнь', 'вылечивать', 'грубый', 'решать', 'санитарка', 'бестолковый', 'переставать', 'никак', 'выход', 'процедура', 'весь', 'рядом', 'гинеколог', 'ответ', 'анализ', 'взвешивание', 'намечаться', 'ирина', 'каждый', 'хирург', 'заслуга', 'год', 'смущать', 'время', 'который', 'сильно', 'роды', 'бывать', 'малоприятный', 'иск