В этой программе я разделяю корпус сообщений на отрезки по 30 дней (условные месяцы) и с помощью tf-idf нахожу слова, наиболее значимые для каждого отрезка.

In [1]:
import sqlite3
from datetime import date as d
from datetime import timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter
import pandas



In [2]:
# Open the SQLite database file 'oleg.db' and create a cursor for running queries.
conn = sqlite3.connect('oleg.db')
cur = conn.cursor()

In [3]:
# Fetch one row per date: the date itself plus every `no_stopwords` value
# recorded for that date, joined with single spaces.
texts = cur.execute('''
select date, GROUP_CONCAT(no_stopwords, ' ')
from meta
join texts on texts.id = meta.id
group by date''').fetchall()

In [4]:
# Map each date to its concatenated text. GROUP_CONCAT yields NULL (None in
# Python) when every value in a group is NULL; store '' in that case so the
# string concatenation done on these values later cannot fail on None.
cdd = {stamp: text or '' for stamp, text in texts}

In [5]:
# Inclusive bounds of the analysed period and the time steps used below.
first = d.fromisoformat('2018-02-09')
last = d.fromisoformat('2021-02-21')
day = timedelta(days=1)
week = timedelta(days=7)

# Give every calendar day in [first, last] an entry in `cdd`; days that have
# no text yet get an empty string, existing entries are left untouched.
now = first
while now <= last:
    cdd.setdefault(now.isoformat(), '')
    now += day

Сделаем словарь, ключами в котором будут даты начала 30-дневных отрезков («месяцев»), а значениями — строки с сообщениями за соответствующий отрезок.

In [6]:
# Split [first, last] into consecutive 30-day windows. `da` maps the ISO start
# date of each window to the texts of all its days, each followed by a space.
current = first
da = {}

while current <= last:
    # Every window holds 30 days except the final one, which is cut at `last`.
    span = min(30, (last - current).days + 1)
    # Join once per window instead of growing a string day by day.
    da[current.isoformat()] = ''.join(
        cdd[(current + day * offset).isoformat()] + ' '
        for offset in range(span)
    )
    current += day * 30

Сколько получилось кусочков: 

In [7]:
# Number of entries (chunks) in `da`.
len(da)

37

## Модель

In [8]:
def get_top_tf_idf_words(tfidf_vector, feature_names, top_n):
    """Return the names of the highest-weighted stored entries of one vector.

    Args:
        tfidf_vector: Object exposing ``.data`` (stored weights) and
            ``.indices`` (the feature index of each stored weight) as NumPy
            arrays - presumably a single row of a SciPy CSR matrix.
        feature_names: NumPy array mapping a feature index to its name.
        top_n: Maximum number of names to return.

    Returns:
        Array of feature names ordered from the largest weight downwards;
        shorter than ``top_n`` when the vector stores fewer entries.
    """
    weights = tfidf_vector.data
    # argsort is ascending, so reverse it to rank the largest weight first.
    best_first = np.argsort(weights)[::-1]
    columns = tfidf_vector.indices[best_first[:top_n]]
    return feature_names[columns]

In [9]:
# Words excluded from the vocabulary in addition to the stop words that were
# already removed upstream (the corpus column is named `no_stopwords`).
extra_stop_words = [
    'то', 'ага', 'ладно', 'короче', 'олег',
    'лера', 'наверное', 'кстати', 'что',
]
tfidf = TfidfVectorizer(analyzer="word", stop_words=extra_stop_words)

In [10]:
# Fit TF-IDF on the chunk texts; the documents are passed in the iteration
# order of `da`, so row i of the result corresponds to the i-th key of `da`.
articles_tfidf = tfidf.fit_transform(da.values())

Будем сохранять 7 самых важных слов для каждого кусочка.

In [11]:
# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
_get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = np.array(_get_names())

idf_words = []  # top words of all chunks, flattened (repeats kept for counting)
long = {}       # chunk start date -> its 7 top-weighted words
# Rows of `articles_tfidf` are in the iteration order of `da`, so the position
# of a key in `da` is also the row index of its chunk.
for i, key in enumerate(da):
    words = get_top_tf_idf_words(articles_tfidf[i, :], feature_names, 7)
    idf_words.extend(words)
    long[key] = words

Здесь можно обрезать слова по частотности: оставить только те, которые важны сразу для нескольких кусочков. Для визуализации этих данных в презентации я решила вручную выбрать самые интересные, даже если они встречаются только один раз.

In [12]:
def make_short(idf_words, long, n):
    """Keep only the words that occur at least ``n`` times in ``idf_words``.

    Args:
        idf_words: Iterable of words, with repeats; occurrences are counted.
        long: Mapping from a key to an iterable of words.
        n: Minimum number of occurrences for a word to be kept.

    Returns:
        A tuple ``(short, big_words)``. ``big_words`` lists the kept words
        from most to least frequent. ``short`` has the same keys as ``long``;
        each value is a list of that key's words that are in ``big_words``,
        in their original order.
    """
    big_words = [
        word for word, count in Counter(idf_words).most_common() if count >= n
    ]
    # Set lookup avoids rescanning the whole list for every word of every key.
    frequent = set(big_words)
    short = {
        key: [word for word in words if word in frequent]
        for key, words in long.items()
    }
    return short, big_words

In [13]:
# With n=1 no word is filtered out (every collected word occurs at least
# once), so this is `long` with each value turned into a plain list.
for_pr = make_short(idf_words, long, 1)[0]

## Таблица

Это моя попытка автоматической визуализации, весьма неудачная. Здесь будут только те слова, которые вошли в топ-7 как минимум для трёх отрезков.

In [14]:
# Keep only the words that appear among the top words at least 3 times overall.
short3, big_words = make_short(idf_words, long, 3)
# Keys of `short3` (the same keys as `long`), as a dict view.
dates = short3.keys()

In [15]:
# Zero-filled indicator table, built directly in its final orientation:
# one row per frequent word, one column per chunk date. Constructing it this
# way also works when `big_words` is empty, where assigning a non-empty index
# to an empty frame would raise a length-mismatch error.
df = pandas.DataFrame(0, index=big_words, columns=list(dates))

In [16]:
# Mark with 1 every (word, date) cell where the word is among that date's
# kept words; all other cells stay 0.
for date in dates:
    kept = short3[date]
    for word in big_words:
        if word in kept:
            df.at[word, date] = 1

In [17]:
def color_reds(value):
    """Return the CSS background rule for one table cell.

    Cells equal to 1 are coloured pink, all others white.
    """
    color = 'pink' if value == 1 else 'white'
    return 'background-color: %s' % color

In [18]:
# Colour each cell with `color_reds`. Styler.applymap is deprecated since
# pandas 2.1 in favour of Styler.map, so use `map` when the installed pandas
# provides it and fall back to `applymap` on older versions.
styler = df.style
apply_cellwise = getattr(styler, 'map', None) or styler.applymap
apply_cellwise(color_reds)

Unnamed: 0,2018-02-09,2018-03-11,2018-04-10,2018-05-10,2018-06-09,2018-07-09,2018-08-08,2018-09-07,2018-10-07,2018-11-06,2018-12-06,2019-01-05,2019-02-04,2019-03-06,2019-04-05,2019-05-05,2019-06-04,2019-07-04,2019-08-03,2019-09-02,2019-10-02,2019-11-01,2019-12-01,2019-12-31,2020-01-30,2020-02-29,2020-03-30,2020-04-29,2020-05-29,2020-06-28,2020-07-28,2020-08-27,2020-09-26,2020-10-26,2020-11-25,2020-12-25,2021-01-24
спокойный,0,0,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
завтра,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,1,0,0,0,0
спать,0,0,0,1,1,1,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
решать,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
катя,0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
написать,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
задача,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
ле,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
прекрасный,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
нравиться,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
