In [1]:
import ast
import logging
import string
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from pymystem3 import Mystem
from gensim.models import Word2Vec

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
CORPUS_PATH = 'data/corpus.txt'
log = logging.getLogger()

POS_MARK = 1
NEG_MARK = 0

In [3]:
def read_data(corpus_path):
    with open(corpus_path, 'r') as f:
        log.info("Reading file...")

        reviews = []
        marks = []

        lines = f.readlines()
        lines = [x.strip() for x in lines]

        for line in lines:
            dictionary = ast.literal_eval(line)
            reviews.append(dictionary['description'])

            recom_author_mark = dictionary['recom_author_mark']
            marks.append(parse_label(recom_author_mark))

    return reviews, marks


def parse_label(recom_author_mark):
    if recom_author_mark == 'ДА':
        return POS_MARK
    elif recom_author_mark == '':
        return NEG_MARK
    else:
        raise Exception("Unknown recom_author_mark")


def split_reviews(reviews, marks, n=500):
    pos_reviews = []
    neg_reviews = []

    log.info(f"Split reviews to {n} positive and {n} negative.")

    for index, review in enumerate(reviews):
        if marks[index] == POS_MARK and len(pos_reviews) < n:
            pos_reviews.append(review)
        elif marks[index] == NEG_MARK and len(neg_reviews) < n:
            neg_reviews.append(review)

        if len(pos_reviews) == n and len(neg_reviews) == n:
            break

    return pos_reviews, neg_reviews


def tokenize(document):
    ignore = set(stopwords.words('russian'))
    stem = Mystem()

    tokens = stem.lemmatize(document)

    tokens = [w.lower() for w in tokens if w not in ignore]
    tokens = [w for w in tokens if w not in string.punctuation]
    tokens = [w for w in tokens if w.isalpha()]

    return tokens


def tokenize_documents_extend(documents):
    texts = []

    for document in documents:
        w = tokenize(document)
        texts.extend(w)

    return texts


def tokenize_documents_append(documents):
    texts = []

    for document in documents:
        w = tokenize(document)
        texts.append(w)

    return texts


def print_score_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    metrics = precision_recall_fscore_support(y_pred=y_pred, y_true=y_test, average='binary', pos_label=POS_MARK)
    log.info(f'Precision: {round(metrics[0], 3)}, recall: {round(metrics[1], 3)}, f-measure: {round(metrics[2], 3)}')


def feature_importances(X, Y):
    model = ExtraTreesClassifier()
    model.fit(X, Y)
    return model.feature_importances_


def print_most_valuable_features(tokens, tokens_importances, n=100):
    tokens_importances, tokens = zip(*sorted(zip(tokens_importances, tokens), reverse=True))

    print(tokens[:n])
    print(tokens_importances[:n])


def setup_logger():
    logging.basicConfig(format='%(levelname)s - %(asctime)s - %(message)s')
    log.setLevel(logging.INFO)


def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")

    nwords = 0

    index2word_set = set(model.wv.index2word)

    for word in words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])

    featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0

    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")

    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1

    return reviewFeatureVecs

In [4]:
setup_logger()

In [5]:
reviews, marks = read_data(CORPUS_PATH)
pos_reviews, neg_reviews = split_reviews(reviews, marks)

y_pos = [1] * len(pos_reviews)
y_neg = [0] * len(neg_reviews)

X_train, X_test, y_train, y_test = train_test_split(pos_reviews + neg_reviews, y_pos + y_neg, test_size=0.2)

INFO - 2019-05-08 14:21:45,773 - Reading file...
INFO - 2019-05-08 14:22:01,908 - Split reviews to 500 positive and 500 negative.


In [6]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [7]:
%%notify
vectorizer = TfidfVectorizer(max_features=50000, min_df=5, tokenizer=tokenize)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

model = LogisticRegression(solver="lbfgs")
model.fit(X_train_vect.toarray(), y_train)
print_score_model(model, X_test_vect.toarray(), y_test)

INFO - 2019-05-08 14:44:42,194 - Precision: 0.79, recall: 0.84, f-measure: 0.814


<IPython.core.display.Javascript object>

In [8]:
%%notify
# Test Features char ngram
vectorizer = TfidfVectorizer(max_features=50000, min_df=5, analyzer='char', ngram_range=(3, 3))
X_train_vect_word_ngram = vectorizer.fit_transform(X_train)
X_test_vect_word_ngram = vectorizer.transform(X_test)

train_matrix = np.append(X_train_vect.toarray(), X_train_vect_word_ngram.toarray(), axis=1)
test_matrix = np.append(X_test_vect.toarray(), X_test_vect_word_ngram.toarray(), axis=1)

model = LogisticRegression(solver="lbfgs")
model.fit(train_matrix, y_train)
print_score_model(model, test_matrix, y_test)

INFO - 2019-05-08 14:45:00,126 - Precision: 0.816, recall: 0.851, f-measure: 0.833


<IPython.core.display.Javascript object>

In [9]:
%%notify
# Test Features word ngram
vectorizer = TfidfVectorizer(max_features=50000, min_df=5, tokenizer=tokenize, analyzer='word', ngram_range=(3, 3))
X_train_vect_word_ngram = vectorizer.fit_transform(X_train)
X_test_vect_word_ngram = vectorizer.transform(X_test)

train_matrix = np.append(X_train_vect.toarray(), X_train_vect_word_ngram.toarray(), axis=1)
test_matrix = np.append(X_test_vect.toarray(), X_test_vect_word_ngram.toarray(), axis=1)

model = LogisticRegression(solver="lbfgs")
model.fit(train_matrix, y_train)
print_score_model(model, test_matrix, y_test)

INFO - 2019-05-08 15:08:32,499 - Precision: 0.786, recall: 0.819, f-measure: 0.802


<IPython.core.display.Javascript object>

In [10]:
%%notify
# Test Feature word count
train_sentences = tokenize_documents_append(X_train)
train_word_counts = [len(tokens) for tokens in train_sentences]
train_word_counts = np.reshape(train_word_counts, (-1, 1))

test_sentences = tokenize_documents_append(X_test)
test_word_counts = [len(tokens) for tokens in test_sentences]
test_word_counts = np.reshape(test_word_counts, (-1, 1))

train_matrix = np.append(X_train_vect.toarray(), train_word_counts, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_word_counts, axis=1)

model = LogisticRegression(solver="lbfgs")
model.fit(train_matrix, y_train)
print_score_model(model, test_matrix, y_test)

INFO - 2019-05-08 15:30:26,161 - Precision: 0.784, recall: 0.851, f-measure: 0.816


<IPython.core.display.Javascript object>

In [11]:
%%notify
# Test Feature characters count
train_char_counts = [len(document) for document in X_train]
train_char_counts = np.reshape(train_char_counts, (-1, 1))

test_char_counts = [len(document) for document in X_test]
test_char_counts = np.reshape(test_char_counts, (-1, 1))

train_matrix = np.append(X_train_vect.toarray(), train_char_counts, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_char_counts, axis=1)

model = LogisticRegression(solver="lbfgs")
model.fit(train_matrix, y_train)
print_score_model(model, test_matrix, y_test)

INFO - 2019-05-08 15:30:26,393 - Precision: 0.766, recall: 0.766, f-measure: 0.766


<IPython.core.display.Javascript object>

In [12]:
%%notify
# Test Feature brackets count
train_bracket_counts = [document.count(')') - document.count('(') for document in X_train]
train_bracket_counts = np.reshape(train_bracket_counts, (-1, 1))

test_bracket_counts = [document.count(')') - document.count('(') for document in X_test]
test_bracket_counts = np.reshape(test_bracket_counts, (-1, 1))

train_matrix = np.append(X_train_vect.toarray(), train_bracket_counts, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_bracket_counts, axis=1)

model = LogisticRegression(solver="lbfgs")
model.fit(train_matrix, y_train)
print_score_model(model, test_matrix, y_test)

INFO - 2019-05-08 15:30:26,600 - Precision: 0.806, recall: 0.798, f-measure: 0.802


<IPython.core.display.Javascript object>

In [17]:
%%notify
import codecs

POSITIVE_PATH = 'data/positive.txt'
NEGATIVE_PATH = 'data/negative.txt'

def read_words(filepath):
    with codecs.open(filepath, 'r', 'utf-8') as f:
        words = f.readlines()
        words = [x.strip() for x in words]
        return set(words)
    
def sentimental_values(documents):
    sentimental_values = []

    for document in documents:
        sentimental_value = 0
        for token in document:
            if token in positive_words:
                sentimental_value += 1
            elif token in negative_words:
                sentimental_value -= 1

        sentimental_values.append(sentimental_value)

    return sentimental_values
    
# Test Feature Sentimental
positive_words = read_words(POSITIVE_PATH)
negative_words = read_words(NEGATIVE_PATH)

train_sentimental_values = sentimental_values(train_sentences)
train_sentimental_values = np.reshape(train_sentimental_values, (-1, 1))

test_sentimental_values = sentimental_values(test_sentences)
test_sentimental_values = np.reshape(test_sentimental_values, (-1, 1))

train_matrix = np.append(X_train_vect.toarray(), train_sentimental_values, axis=1)
test_matrix = np.append(X_test_vect.toarray(), test_sentimental_values, axis=1)

model = LogisticRegression(solver="lbfgs")
model.fit(train_matrix, y_train)
print_score_model(model, test_matrix, y_test)

INFO - 2019-05-08 15:32:22,599 - Precision: 0.76, recall: 0.777, f-measure: 0.768


<IPython.core.display.Javascript object>

In [19]:
%%notify

def fit(x_train, y_train, x_test, y_test):
    model = LogisticRegression(solver="lbfgs")
    model.fit(x_train, y_train)
    print_score_model(model, x_test, y_test)

# All futures
train_matrix = np.append(X_train_vect.toarray(), X_train_vect_word_ngram.toarray(), axis=1)
test_matrix = np.append(X_test_vect.toarray(), X_test_vect_word_ngram.toarray(), axis=1)

train_matrix = np.append(train_matrix, X_train_vect_word_ngram.toarray(), axis=1)
test_matrix = np.append(test_matrix, X_test_vect_word_ngram.toarray(), axis=1)

train_matrix = np.append(train_matrix, train_word_counts, axis=1)
test_matrix = np.append(test_matrix, test_word_counts, axis=1)

train_matrix = np.append(train_matrix, train_char_counts, axis=1)
test_matrix = np.append(test_matrix, test_char_counts, axis=1)

train_matrix = np.append(train_matrix, train_bracket_counts, axis=1)
test_matrix = np.append(test_matrix, test_bracket_counts, axis=1)

fit(train_matrix, y_train, test_matrix, y_test)

INFO - 2019-05-08 15:34:52,398 - Precision: 0.8, recall: 0.723, f-measure: 0.76


In [None]:
%%notify
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV

# GridSearchCV
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = svm.SVC(gamma="scale")
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_matrix, y_train)
print(clf.best_estimator_.feature_importance())