In [51]:
import pickle
import pandas as pd
from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer
import re
from tqdm import tqdm
import time
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np


stemmer = SnowballStemmer("russian")

In [52]:
df = pd.read_pickle('./../data/sentiment_texts.pickle')

Так мы обрабатываем слова: обрезаем окончания для работы с падежами, убираем смайлики, символы и цифры

In [53]:
def transform_word(word):
    return stemmer.stem(''.join(ch for ch in word.lower() if ch.isalpha()))

def transform_sentence(sentence: str) -> str:
    words = filter(lambda w: len(w) > 0, re.split('[?!.,:; \n\t]', sentence))
    return ' '.join(transform_word(word) for word in words)

Подготавливаем тексты

In [54]:
transformed_text = []
total_len = 0
for text in tqdm(df.MessageText):
    _words = filter(lambda w: len(w) > 0, re.split('[?!.,:; \n\t]', text))
    transformed_words = filter(lambda w: True if w in words else False, (transform_word(word) for word in _words))
    transformed_text.append(' '.join(transformed_words))
    total_len += len(list(transformed_words))

df['TransormedText'] = transformed_text
total_len

100%|██████████| 9289/9289 [01:11<00:00, 129.74it/s]


0

Индексы для быстрого поиска

In [55]:
search_idx = []

for row in df.itertuples():
    search_idx.append((row.SentimentScore, set(row.TransormedText.split())))


Считаем, как часто слова встречаются в {sentiment} окрашенных предложениях

In [56]:
word_counts = defaultdict(lambda: defaultdict(int))
for word in tqdm(words):
    for score, sett in search_idx:
        if word in sett:
            word_counts[word][score] += 1
    word_counts[word]['word'] = word


  0%|          | 0/18929 [00:00<?, ?it/s]

100%|██████████| 18929/18929 [00:43<00:00, 433.65it/s]


Подгрузим ещё небольшой датасет

In [57]:
extra_dataset = pd.read_csv('./../data/data.tsv', delimiter='\t')

И тоже посчитаем слова. В датасете другая разметка, ничего страшного, преобразуем

In [58]:
extra_words = defaultdict(int)

search_idx_2 = []
def score_to_categories(_score):
    if _score < -0.6:
        return 1
    elif _score < -0.2:
        return 2
    elif _score < 0.2:
        return 3
    elif score < 0.6:
        return 4
    else:
        return 5

for row in extra_dataset.itertuples():
    text = row.title
    transformed_words = []
    for x in text.split():
        transormed = transform_word(x)
        if len(transormed) > 1:
            extra_words[transormed] += 1
            transformed_words.append(transormed)

    search_idx_2.append((score_to_categories(row.score), set(transformed_words)))

In [59]:
for word in tqdm(extra_words):
    for score, sett in search_idx_2:
        if word in sett:
            word_counts[word][score] += 1
    word_counts[word]['word'] = word

100%|██████████| 1526/1526 [00:00<00:00, 10833.37it/s]


На всякий случай сохраняем

In [60]:
pd.DataFrame(word_counts.values()).to_csv('word_counts.csv', index=False)

In [61]:
counts = pd.read_csv('word_counts.csv')

In [62]:
counts.fillna(0, inplace=True)

Основная часть: считаем несложные метрики на подсчёте встречаний

In [63]:
import math

import numpy as np
import typing as tp
import pandas

class WordScore:
    def __init__(self, super_negative_count: float, negative_count: float, neutral_count: float, positive_count: float, super_positive_count: float):
        self.super_negative_count = super_negative_count
        self.negative_count = negative_count
        self.neutral_count = neutral_count
        self.positive_count = positive_count
        self.super_positive_count = super_positive_count 

        # not to return
        self.total_positive_count = self.super_positive_count + self.positive_count
        self.total_negative_count = self.super_negative_count + self.negative_count
        self.total_count = self.total_positive_count + self.total_negative_count
        
        _total_count = max(self.total_count, 1)

        self.positive_proportion = self.total_positive_count / _total_count
        self.negative_proportion = self.total_negative_count / _total_count
        
        # not to return
        self.simple_score = -1 * self.total_negative_count + self.total_positive_count
        self.score = -1 * self.super_negative_count + -0.1 * self.negative_count + 0.1 * self.positive_count + self.super_positive_count
        self.score_extreme_only = -1 * self.super_negative_count + self.super_positive_count
        
        self.simple_score_relative = self.simple_score / max(self.total_negative_count + self.total_positive_count, 1)
        self.score_relative = self.score / _total_count
        self.score_extreme_only_relative = self.score_extreme_only / _total_count

        self.meaningful_proportion = (self.super_positive_count + self.total_negative_count) / max(self.positive_count + self.neutral_count, 1)
        self.extreme_proporion = (self.super_positive_count + self.super_negative_count) / max(self.positive_count + self.neutral_count + self.negative_count, 1)

        self.certanty = (self.positive_proportion if self.score > 0 else (self.negative_proportion if self.score < 0 else 0)) * max(1, self.extreme_proporion * 10) * math.log(_total_count)

    def get_array(self) -> list[float]:
        return [self.super_negative_count,
            self.negative_count,
            self.neutral_count,
            self.positive_count,
            self.super_positive_count,
            # self.total_positive_count,
            # self.total_negative_count,
            # self.total_count,
            self.positive_proportion,
            self.negative_proportion,
            # self.simple_score,
            # self.score,
            # self.score_extreme_only,
            self.simple_score_relative,
            self.score_relative,
            self.score_extreme_only_relative,

            self.meaningful_proportion,
            self.extreme_proporion,
            
            self.certanty]
    
FEAUTURES_LEN = 13

def prepare_word_scores(counts_df: pandas.DataFrame) -> dict[str, list[float]]:
    word_scores = {}
    for _, row in counts_df.iterrows():
        word_scores[row['word']] = WordScore(row['1'], row['2'], row['3'], row['4'], row['5']).get_array()
    return word_scores


In [64]:
def get_text_word_metrics(text: str, word_scores: dict[str, list[float]]) -> list[float]:
    metrics = []

    sentence_word_scores = [word_scores[word] for word in text.split() if word in word_scores]
    if len(sentence_word_scores) == 0:
        sentence_word_scores.append([0] * FEAUTURES_LEN)

    for idx in range(FEAUTURES_LEN):
        for aggr_fn in (min, max, sum, np.mean):
            metrics.append(aggr_fn(np.array([word_score[idx] for word_score in sentence_word_scores])))

    return metrics

In [65]:
word_scores = prepare_word_scores(counts)

Это пойдёт в прод

In [66]:
import json
json = json.dumps(word_scores)
with open('word_scores_full.json', 'w') as f:
    f.write(json)

И построим модель

In [67]:
X_words = np.array([get_text_word_metrics(text, word_scores) for text in df.TransormedText])
y = df.SentimentScore

In [68]:

X = X_words

X_train, X_test, y_train, y_test = train_test_split(np.array(X)[y], y, random_state=42)
# я знаю про файнтюнинг, мне лень
model = CatBoostClassifier(verbose=False)
model.fit(X_train, y_train)

start = time.time()
y_pred = model.predict(X_test)
print(f'random state {i}')
print('model', 'accuracy_score:', accuracy_score(y_test, y_pred))
print('prediction time', time.time() - start, 'len test', len(y_test))
print('always 1', 'accuracy_score:', accuracy_score(y_test, [1] * len(y_test)))
print('always 2', 'accuracy_score:', accuracy_score(y_test, [2] * len(y_test)))
print('always 3', 'accuracy_score:', accuracy_score(y_test, [3] * len(y_test)))
print('always 4', 'accuracy_score:', accuracy_score(y_test, [4] * len(y_test)))
print('always 5', 'accuracy_score:', accuracy_score(y_test, [5] * len(y_test)))

random state 0
model accuracy_score: 0.8196297890658631
prediction time 0.049341440200805664 len test 2323
always 1 accuracy_score: 0.008179078777442962
always 2 accuracy_score: 0.09900990099009901
always 3 accuracy_score: 0.38613861386138615
always 4 accuracy_score: 0.40637107188979765
always 5 accuracy_score: 0.08136030994403788


Это отличный показатель! Обучимся на всём датасете и сохраним модель

In [69]:
model = CatBoostClassifier(verbose=False)
_y = np.array(y)
_X = np.array(X)
_X = _X[_y != 0]
_y = _y[_y != 0]
model.fit(_X, _y)

<catboost.core.CatBoostClassifier at 0x7fd449e62620>

In [70]:
model.save_model('model')

Посмотрим, как она работает

In [71]:
def make_features(text):
    return [get_text_word_metrics(' '.join(filter(lambda w: True if w in word_scores else False, (transform_word(word) for word in filter(lambda w: len(w) > 0, re.split('[?!.,:; \n\t]', text))))), word_scores)]


In [72]:
t = 'газпром пушка ракета рекомендую к покупке не является иир прекрасные результаты обещают выплату дивидендов'
print(model.predict(make_features(t)))

t = 'на рынке сегодня паника'
print(model.predict(make_features(t)))

t = 'создаётся впечатление, что модель очень-очень тупая'
print(model.predict(make_features(t)))

t = 'продавать невыплата падение шорт шортим неудача'
print(model.predict(make_features(t)))

[[3]]
[[3]]
[[4]]
[[3]]


Модель имеет потенциал к улучшению.