In [1]:
import math
import pandas as pd
import datetime
from datetime import date, timedelta
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
import nltk
# Fetch the 'punkt' NLTK data package; presumably this is what sent_tokenize
# (imported above, used in later cells) needs at runtime — TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/mdyuldina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Create dataset

In [2]:
# Load the channel messages from the JSON file into a DataFrame.
df = pd.read_json('data/channel_messages.json')

In [3]:
# Keep only rows that carry message text. The explicit .copy() makes the
# column assignments below write to an independent frame rather than to a
# filtered slice of the loaded one (avoids pandas' SettingWithCopyWarning).
df = df[df['message'].notna()].copy()
# from_id is a nested record; keep only its 'user_id' entry.
df['user_id'] = df['from_id'].apply(lambda x: x['user_id'])
# True when fwd_from is populated.
df['forwarded'] = df['fwd_from'].notna()
# Collapse the forwards value to a flag: present and greater than zero.
df['forwards'] = df['forwards'].apply(lambda x: x is not None and x > 0)
# replies is a nested record when truthy; keep its 'replies' entry, else 0.
df['replies'] = df['replies'].apply(lambda x: x['replies'] if x else 0)
# Normalise pinned to a boolean: True only where the raw value equals 1.0.
df['pinned'] = df['pinned'].apply(lambda x: x == 1.0)

In [4]:
# Order the messages by their id column.
df = df.sort_values(by='id')

In [5]:
# Column names from the message export. NOTE(review): this list is not
# referenced by any other visible cell.
columns = [
    'date', 'message', 'mentioned', 'media_unread', 'post', 'pinned',
    'from_id', 'fwd_from', 'via_bot_id', 'reply_to', 'media',
    'reply_markup', 'replies',
]
# Columns (raw and derived) that the frame is narrowed to in the next cell.
important_columns = [
    'date',
    'message',
    'pinned',
    'user_id',
    'forwarded',
    'forwards',
]

In [6]:
# Narrow the frame to the columns listed in important_columns.
df = df[important_columns]

In [7]:
# Plain list of every message text; the TF-IDF pipeline is fitted on it later.
all_messages = df['message'].to_list()
# Display how many messages there are.
len(all_messages)

3249

In [8]:
# Load the channel users from the JSON file into a DataFrame.
users = pd.read_json('data/channel_users.json')

In [9]:
# Attach each author's username and bot flag. The left join keeps messages
# whose user_id has no match in `users`.
user_info = users[['id', 'user', 'is_bot']]
df = df.merge(user_info, how='left', left_on="user_id", right_on='id', suffixes=['_x', '_y'])
# The join key taken from `users` is not needed after the merge.
df = df.drop(columns='id')

In [10]:
# Comparing against True turns any other value (including a missing one) into False.
df['is_bot'] = df['is_bot'].eq(True)
# Flag messages written by one of the three listed usernames.
df['is_admin'] = df['user'].isin(['pavel_kikin', 'Irinka_Bekker', 'Chihvost'])

# TF-IDF using sklearn

## Make a short summary for every day

In [11]:
# Word counts -> TF-IDF weights. The pipeline is fitted on all_messages, so the
# vocabulary and IDF statistics come from the full message list.
pipe = Pipeline(steps=[
    ('count', CountVectorizer(stop_words=['–ø—Ä–∏–≤–µ—Ç', '—Å–ø–∞—Å–∏–±–æ'])),
    ('tfid', TfidfTransformer()),
])
pipe.fit(all_messages)

In [12]:
# Calendar date of the earliest message.
min_date = df['date'].min().date()

In [13]:
# Calendar date of the latest message.
max_date = df['date'].max().date()

In [14]:
def boost_score(row):
    """Return the row's score, amplified for pinned and admin-authored rows.

    ``row`` must expose ``'score'``, ``'pinned'`` and ``'is_admin'`` by key.
    A truthy ``pinned`` multiplies the score by 3 and a truthy ``is_admin``
    by 2; both factors apply when both flags are set.
    """
    pinned_factor = 3 if row['pinned'] else 1
    admin_factor = 2 if row['is_admin'] else 1
    return row['score'] * pinned_factor * admin_factor

In [15]:
# Walk from min_date to max_date one calendar day at a time and print, for each
# day, the sentences whose boosted TF-IDF score is in the top 5% for that day.
delta = timedelta(days=1)
cur_date = min_date
while cur_date <= max_date:
    print('***** ' + cur_date.strftime("%Y-%m-%d") + ' *****')

    sub_df = df[df['date'].dt.date == cur_date].copy()
    if len(sub_df) == 0:
        cur_date += delta
        continue
    sub_df = sub_df[~sub_df['is_bot']]
    sub_df = sub_df[['message', 'pinned', 'user', 'is_admin']]

    # Split every message into sentences; each sentence inherits the metadata
    # of its message. Columns are selected by name and unpacked from plain
    # tuples, because integer lookups (row[1][0]) on a label-indexed Series are
    # deprecated in pandas.
    sentence_rows = []
    message_fields = sub_df[['message', 'pinned', 'user', 'is_admin']]
    for message, pinned, user, is_admin in message_fields.itertuples(index=False, name=None):
        for line in sent_tokenize(message):
            sentence_rows.append([line, pinned, user, is_admin])

    # Only bot messages or empty texts that day: nothing to score, and the
    # pipeline cannot transform an empty list.
    if not sentence_rows:
        cur_date += delta
        continue

    dataset = pd.DataFrame(sentence_rows, columns=['sentence', 'pinned', 'user', 'is_admin'])

    dataset = dataset.drop_duplicates(subset='sentence').copy()
    dataset['user'] = dataset['user'].fillna(value='unknown')
    sentences = dataset['sentence'].to_list()
    # Sentence score = sum of the TF-IDF weights of its words.
    tdidf = pipe.transform(sentences)
    scores = np.sum(tdidf.toarray(), axis=1)
    dataset['score'] = scores
    dataset['score_updated'] = dataset[['score', 'pinned', 'is_admin']].apply(boost_score, axis=1)
    threshold = dataset['score_updated'].quantile(q=0.95)

    # Emit the selected sentences, with an '@user' header whenever the author
    # differs from the previous sentence's author. 'user' has no missing
    # values here because of the fillna above.
    top_sentences = dataset.loc[dataset['score_updated'] >= threshold, ['sentence', 'user']]
    summary = ''
    prev_user = None
    for sentence, user in top_sentences.itertuples(index=False, name=None):
        if not prev_user or prev_user != user:
            prev_user = user
            summary = summary + '\n' + '@' + str(user) + '\n'

        summary = summary + sentence + '\n'
    print(summary)

    cur_date += delta

***** 2022-02-28 *****

@Chihvost
–ó–æ–≤–∏—Ç–µ –º–µ–Ω—è –≤ —Ç–µ—Ö —Å–ª—É—á–∞—è—Ö, –∫–æ–≥–¥–∞ –≤–æ–∑–Ω–∏–∫–∞—é—Ç –≤–æ–ø—Ä–æ—Å—ã –æ –ø–ª–∞—Ç—Ñ–æ—Ä–º–µ –≤ —Ü–µ–ª–æ–º, —Å–æ–æ–±—â–µ—Å—Ç–≤–∞—Ö, —ç–∫—Å–ø–µ—Ä—Ç–∞—Ö, –ø—Ä–æ–±–ª–µ–º–∞—Ö —Å —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–µ–π –∏ –ø—Ä–æ—á–∏—Ö —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏—Ö –≤–æ–ø—Ä–æ—Å–∞—Ö @Chihvost

‚ú® –ï—â–µ –≤ —á–∞—Ç–µ –µ—Å—Ç—å –≥–ª–∞–≤–Ω—ã–π –æ—Ä–≥–∞–Ω–∏–∑–∞—Ç–æ—Ä –∫—É—Ä—Å–∞ –ü–∞–≤–µ–ª –ö–∏–∫–∏–Ω @pavel_kikin, –ê–ª–µ–∫—Å–µ–π –ö–æ–ª–µ—Å–Ω–∏–∫–æ–≤ @alekseyntk –∏ –ò—Ä–∏–Ω–∞ –ë–µ–∫–∫–µ—Ä.
–¢–∞–º —É –Ω–∞—Å –µ—Å—Ç—å —Å–æ–æ–±—â–µ—Å—Ç–≤–æ –ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ –∏ –ù–µ–π—Ä–æ–Ω–Ω—ã–µ —Å–µ—Ç–∏, –≥–¥–µ –Ω–∞—Å—Ç–æ—è—â–∏–µ —ç–∫—Å–ø–µ—Ä—Ç—ã –æ—Ç–≤–µ—á–∞—é—Ç –Ω–∞ –∑–ª–æ–±–æ–¥–Ω–µ–≤–Ω—ã–µ –≤–æ–ø—Ä–æ—Å—ã –ø—Ä–æ –º–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ –∏ —Å–º–µ–∂–Ω—ã–µ –æ–±–ª–∞—Å—Ç–∏, –∞ —Ç–∞–∫–∂–µ –º—ã –µ–∂–µ–Ω–µ–¥–µ–ª—å–Ω–æ –æ–±–Ω–æ–≤–ª—è–µ–º –¥–∞–π–¥–∂–µ—Å—Ç—ã —Å –Ω–æ–≤–æ—Å—Ç—è–º–∏ —Å It –º–µ—Ä–æ–ø—Ä–∏—è—Ç–∏–π, –∫–æ–Ω–∫—É—Ä—Å–æ–≤ –∏ –≤—ã–∫–ª–∞–¥—ã–≤–∞–µ–º –≤–∞–∫–∞–Ω—Å–∏–


@DENisVali
@marcenavuc –∫–æ—Ä–æ—á, –∫–∞–∫ —è —É—Å–ª—ã—à–∞–ª —Ç–≤–æ–π –ø–æ—Å—ã–ª, –ò–ª–æ–Ω–æ–º –ú–∞—Å–∫–æ–º, –Ω–æ –≤ –†–æ—Å—Å–∏–∏ —Ç—ã —Å—Ç–∞–Ω–æ–≤–∏—Ç—å—Å—è –Ω–µ —Ö–æ—á–µ—à—å, –µ—Å–ª–∏ —Ç—ã —Å–æ–∑–¥–∞—à—å –≥–ª–æ–±–∞–ª—å–Ω—ã–π –ø—Ä–æ–µ–∫—Ç, –∞–¥—Ä–µ—Å –ø—Ä–æ–µ–∫—Ç–∞ –±—É–¥–µ—Ç –Ω–µ –†–æ—Å—Å–∏—è üá∑üá∫ –∞ –°–®–ê, –ö–∞–ª–∏—Ñ–æ—Ä–Ω–∏—è üá∫üá∏?

@pavel_kikin
–ü—Ä–æ–º–µ–∂—É—Ç–æ—á–Ω—ã—Ö –æ—Ç—á–µ—Ç–æ–≤ –Ω–µ –Ω—É–∂–Ω–æ, —Ç–æ–ª—å–∫–æ –æ–±—â–∞—è –∑–∞—â–∏—Ç–∞ –≤ —Ñ–∏–Ω–∞–ª–µ –∫—É—Ä—Å–∞
–ü–æ–π–¥—ë—Ç –∏ –≥–∏—Ç—Ö–∞–±, —Ç–æ–ª—å–∫–æ –ø—Ä–∏–¥–µ—Ç—Å—è —Å–∞–º–æ—Å—Ç–æ—è—Ç–µ–ª—å–Ω–æ —Ä–∞–∑–±–∏—Ä–∞—Ç—å—Å—è –≤ –µ–≥–æ CI/CD.
–ü–æ –∫—É—Ä—Å—É –±—É–¥—É –≤—Å—ë –ø–æ–∫–∞–∑—ã–≤–∞—Ç—å –≤ –≥–∏—Ç–ª–∞–±–µ.
–ú–æ–∂–Ω–æ, –ø—Ä–æ—Å—Ç–æ –Ω–∞–∑–≤–∞–≤ remote —Ä–µ–ø–æ–∑–∏—Ç–æ—Ä–∏–∏ –ø–æ —Ä–∞–∑–Ω–æ–º—É.
–ú–æ–∂–Ω–æ —á–µ—Ä–µ–∑ –∫–æ–Ω—Ñ–∏–≥ –Ω–∞—Å—Ç—Ä–æ–∏—Ç—å —Ç–∞–∫, —á—Ç–æ–±—ã –æ–¥–Ω–æ–π –∫–æ–º–∞–Ω–¥–æ–π –¥–µ–ª–∞—Ç—å push

***** 2022-04-11 *****

@pavel_kikin
–°–µ–≥–æ–¥–Ω—è —Ä–∞—Å—Å–º–æ—Ç—Ä–∏–º Codestyle –≤ Python, —Å—Ç–∞–Ω–¥–∞

# Experimental. Do not use.

In [16]:
# Messages dated on or after 2022-04-01 and before 2022-04-02, ...
sub_df = df[(df['date'] >= '2022-04-01') & (df['date'] < '2022-04-02')].copy()
# ... excluding those flagged as written by a bot.
sub_df = sub_df[~sub_df['is_bot']]

In [17]:
# Print the text of every selected message, one per line, for inspection.
for message in sub_df['message'].tolist():
    print(message)

–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ. @pavel_kikin, –º–æ–∂–Ω–æ –¥–≤–µ —Ç–µ–º—ã –¥–ª—è –¥–æ–∫–ª–∞–¥–∞ —Å–æ–≤–º–µ—Å—Ç–∏—Ç—å, —Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å –ø—Ä–æ tensortt, onnx –∏ –∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é –∑–∞–æ–¥–Ω–æ?
–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ. –ì–æ—Ç–æ–≤. –ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏–µ —É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å –≤ –ª–∏—á–∫—É. –ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ, –ø–æ–∂–∞–ª—É–π—Å—Ç–∞.
–ê –Ω–µ —Ö–æ—á–µ—à—å –æ–±—ä–µ–¥–µ–Ω–∏—Ç—å—Å—è –¥–ª—è —ç—Ç–æ–π —Ç–µ–º—ã? –Ø –±—ã –µ—â—ë triton –¥–æ–±–∞–≤–∏–ª –±—ã —Å—é–¥–∞, –±—ã–ª–æ –±—ã –≤ —Ç–µ–º—É
–¢—Ä–∏—Ç–æ–Ω —ç—Ç–æ —Å–∫–æ—Ä–µ–µ –∫ tf serving
–ö—Å—Ç–∞—Ç–∏, –∞ –∫—Ç–æ —É–∂–µ –ø—Ä–æ–±–æ–≤–∞–ª serving dl –º–æ–¥–µ–ª–µ–π? –ö–∞–∫–æ–µ —É –≤–∞—Å –ø–æ–ª—É—á–∞–ª–æ—Å—å —Å—Ä–µ–¥–Ω–µ–µ –≤—Ä–µ–º—è –Ω–∞ –æ—Ç–≤–µ—Ç?
–¢–∞–∫ –æ—Ç –º–æ–¥–µ–ª–∏, —Å–µ—Ä–≤–µ—Ä–∞, –∂–µ–ª–µ–∑–∞ –≤—Å—ë –∑–∞–≤–∏—Å–∏—Ç
–ó–Ω–∞—é—â–∏–µ –ª—é–¥–∏, –ø–æ–¥—Å–∫–∞–∂–∏—Ç–µ –Ω–∞—á–∏–Ω–∞—é—â–µ–º—É:
1) –ù–∞—Å–∫–æ–ª—å–∫–æ –ø–æ–Ω—è–ª, –¥–ª—è –ø—Ä–æ—Ö–æ–∂–¥–µ–Ω–∏—è –∫—É—Ä—Å–∞ –ø–æ–Ω–∞–¥–æ–±–∏—Ç—Å—è –∫–∞–∫–æ–π-–ª–∏–±–æ —Å—É—â–µ—Å—Ç–≤—É—é—â–∏–π DS –ø—Ä–æ–µ–∫

In [18]:
# Narrow to the four columns that the next cell reads, then display the frame.
sub_df = sub_df[['message', 'pinned', 'user', 'is_admin']]
sub_df

Unnamed: 0,message,pinned,user,is_admin
684,"–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ. @pavel_kikin, –º–æ–∂–Ω–æ –¥–≤–µ —Ç–µ–º—ã –¥–ª—è ...",False,starminalush,False
685,–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ. –ì–æ—Ç–æ–≤. –ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏–µ —É—á–∞—Å—Ç–≤–æ–≤–∞...,False,dizel0110,False
686,–ê –Ω–µ —Ö–æ—á–µ—à—å –æ–±—ä–µ–¥–µ–Ω–∏—Ç—å—Å—è –¥–ª—è —ç—Ç–æ–π —Ç–µ–º—ã? –Ø –±—ã –µ...,False,ivanglebov,False
687,–¢—Ä–∏—Ç–æ–Ω —ç—Ç–æ —Å–∫–æ—Ä–µ–µ –∫ tf serving,False,starminalush,False
688,"–ö—Å—Ç–∞—Ç–∏, –∞ –∫—Ç–æ —É–∂–µ –ø—Ä–æ–±–æ–≤–∞–ª serving dl –º–æ–¥–µ–ª–µ–π?...",False,marcenavuc,False
...,...,...,...,...
889,–£ –Ω–∞—Å —è–≤–Ω–æ —Å–ª–æ–∂–∏–ª–∞—Å—å –≤–∑–∞–∏–º–Ω–∞—è —Å–∏–º–ø–∞—Ç–∏—è,False,YuryFilimonov,False
890,–ê —è –Ω–µ –Ω—É–∂–¥–∞—é—Å—å –≤ –Ω–µ–π –ø–æ–∫–∞ —á—Ç–æ. –ù–æ –µ—Å–ª–∏ –Ω—É–∂–Ω–æ ...,False,redpf,False
891,–Ø —Ç–∞–∫ –∏ –ø–æ–¥—É–º–∞–ª–∞,False,YuryFilimonov,False
892,"–û–π, –ø–æ–¥—É–º–∞–ª. –¢–æ–∂–µ –ø–µ—Ä–µ–ø—É—Ç–∞–ª.",False,YuryFilimonov,False


In [19]:
# Split every message into sentences; each sentence row inherits the metadata
# of its message. Columns are selected by name and unpacked from plain tuples,
# because integer lookups (row[1][0]) on a label-indexed Series are deprecated
# in pandas.
sentence_rows = []
message_fields = sub_df[['message', 'pinned', 'user', 'is_admin']]
for message, pinned, user, is_admin in message_fields.itertuples(index=False, name=None):
    for line in sent_tokenize(message):
        sentence_rows.append([line, pinned, user, is_admin])
dataset = pd.DataFrame(sentence_rows, columns=['sentence', 'pinned', 'user', 'is_admin'])
dataset

Unnamed: 0,sentence,pinned,user,is_admin
0,–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.,False,starminalush,False
1,"@pavel_kikin, –º–æ–∂–Ω–æ –¥–≤–µ —Ç–µ–º—ã –¥–ª—è –¥–æ–∫–ª–∞–¥–∞ —Å–æ–≤–º–µ...",False,starminalush,False
2,–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.,False,dizel0110,False
3,–ì–æ—Ç–æ–≤.,False,dizel0110,False
4,–ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏–µ —É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å –≤ –ª–∏—á–∫—É.,False,dizel0110,False
...,...,...,...,...
393,–ù–æ –µ—Å–ª–∏ –Ω—É–∂–Ω–æ –±—É–¥–µ—Ç —è –¥–æ–±–∞–≤–ª—é—Å—å –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ,False,redpf,False
394,–Ø —Ç–∞–∫ –∏ –ø–æ–¥—É–º–∞–ª–∞,False,YuryFilimonov,False
395,"–û–π, –ø–æ–¥—É–º–∞–ª.",False,YuryFilimonov,False
396,–¢–æ–∂–µ –ø–µ—Ä–µ–ø—É—Ç–∞–ª.,False,YuryFilimonov,False


In [20]:
# Drop rows whose sentence text repeats an earlier row, then display the result.
dataset = dataset.drop_duplicates(subset='sentence').copy()
dataset

Unnamed: 0,sentence,pinned,user,is_admin
0,–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.,False,starminalush,False
1,"@pavel_kikin, –º–æ–∂–Ω–æ –¥–≤–µ —Ç–µ–º—ã –¥–ª—è –¥–æ–∫–ª–∞–¥–∞ —Å–æ–≤–º–µ...",False,starminalush,False
3,–ì–æ—Ç–æ–≤.,False,dizel0110,False
4,–ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏–µ —É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å –≤ –ª–∏—á–∫—É.,False,dizel0110,False
5,"–ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ, –ø–æ–∂–∞–ª—É–π—Å—Ç–∞.",False,dizel0110,False
...,...,...,...,...
393,–ù–æ –µ—Å–ª–∏ –Ω—É–∂–Ω–æ –±—É–¥–µ—Ç —è –¥–æ–±–∞–≤–ª—é—Å—å –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ,False,redpf,False
394,–Ø —Ç–∞–∫ –∏ –ø–æ–¥—É–º–∞–ª–∞,False,YuryFilimonov,False
395,"–û–π, –ø–æ–¥—É–º–∞–ª.",False,YuryFilimonov,False
396,–¢–æ–∂–µ –ø–µ—Ä–µ–ø—É—Ç–∞–ª.,False,YuryFilimonov,False


In [21]:
# Sentences with a missing author get the placeholder name 'unknown'.
dataset['user'] = dataset['user'].fillna(value='unknown')

In [22]:
# Plain list of sentence strings, the input of the hand-written TF-IDF functions below.
sentences = dataset['sentence'].to_list()

In [23]:
# A token is either an http(s) URL or a run of word characters.
# Raw string: the regex escapes (\. \? \S \w) must reach the regex engine
# untouched; in a normal literal they are invalid string escapes that Python
# warns about.
tokenizer = RegexpTokenizer(r'https?://[/\.\?=\S]+|\w+')

def _create_frequency_matrix(sentences):
    """Count word occurrences per sentence.

    Each sentence is split with the module-level ``tokenizer``; tokens are
    lower-cased and the two stop words are skipped.

    :param sentences: iterable of sentence strings
    :return: dict mapping the first 15 characters of a sentence to a
        ``{word: count}`` dict

    NOTE: sentences that share their first 15 characters map to the same key,
    so the table of the later sentence replaces the earlier one.
    """
    stop_words = set(['–ø—Ä–∏–≤–µ—Ç', '—Å–ø–∞—Å–∏–±–æ'])
    matrix = {}

    for sentence in sentences:
        counts = {}
        for token in tokenizer.tokenize(sentence):
            token = token.lower()
            if token not in stop_words:
                counts[token] = counts.get(token, 0) + 1
        matrix[sentence[:15]] = counts

    return matrix

In [24]:
# Per-sentence word counts, keyed by the first 15 characters of each sentence; displayed below.
freq_matrix = _create_frequency_matrix(sentences)
freq_matrix

{'–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.': {'–¥–æ–±—Ä–æ–µ': 1, '—É—Ç—Ä–æ': 1},
 '@pavel_kikin, –º': {'pavel_kikin': 1,
  '–º–æ–∂–Ω–æ': 1,
  '–¥–≤–µ': 1,
  '—Ç–µ–º—ã': 1,
  '–¥–ª—è': 1,
  '–¥–æ–∫–ª–∞–¥–∞': 1,
  '—Å–æ–≤–º–µ—Å—Ç–∏—Ç—å': 1,
  '—Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å': 1,
  '–ø—Ä–æ': 1,
  'tensortt': 1,
  'onnx': 1,
  '–∏': 1,
  '–∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é': 1,
  '–∑–∞–æ–¥–Ω–æ': 1},
 '–ì–æ—Ç–æ–≤.': {'–≥–æ—Ç–æ–≤': 1},
 '–ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏': {'–Ω–∞–ø—Ä–∞–≤–∏–ª': 1,
  '–∂–µ–ª–∞–Ω–∏–µ': 1,
  '—É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å': 1,
  '–≤': 1,
  '–ª–∏—á–∫—É': 1},
 '–ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ, –ø–æ–∂': {'–ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ': 1, '–ø–æ–∂–∞–ª—É–π—Å—Ç–∞': 1, '–ª–∏—á–∫—É': 1},
 '–ê –Ω–µ —Ö–æ—á–µ—à—å –æ–±—ä': {'–∞': 1,
  '–Ω–µ': 1,
  '—Ö–æ—á–µ—à—å': 1,
  '–æ–±—ä–µ–¥–µ–Ω–∏—Ç—å—Å—è': 1,
  '–¥–ª—è': 1,
  '—ç—Ç–æ–π': 1,
  '—Ç–µ–º—ã': 1},
 '–Ø –±—ã –µ—â—ë triton': {'—è': 1,
  '–±—ã': 3,
  '–µ—â—ë': 1,
  'triton': 1,
  '–¥–æ–±–∞–≤–∏–ª': 1,
  '—Å—é–¥–∞': 1,
  '–±—ã–ª–æ': 1,
  '–≤': 1,
  '—Ç–µ–º—É': 1},
 '–¢—Ä–∏—Ç–æ–Ω —ç—Ç–æ —Å–∫–æ—Ä': 

In [25]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [26]:
# Term frequencies derived from freq_matrix; displayed below.
tf_matrix = _create_tf_matrix(freq_matrix)
tf_matrix

{'–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.': {'–¥–æ–±—Ä–æ–µ': 0.5, '—É—Ç—Ä–æ': 0.5},
 '@pavel_kikin, –º': {'pavel_kikin': 0.07142857142857142,
  '–º–æ–∂–Ω–æ': 0.07142857142857142,
  '–¥–≤–µ': 0.07142857142857142,
  '—Ç–µ–º—ã': 0.07142857142857142,
  '–¥–ª—è': 0.07142857142857142,
  '–¥–æ–∫–ª–∞–¥–∞': 0.07142857142857142,
  '—Å–æ–≤–º–µ—Å—Ç–∏—Ç—å': 0.07142857142857142,
  '—Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å': 0.07142857142857142,
  '–ø—Ä–æ': 0.07142857142857142,
  'tensortt': 0.07142857142857142,
  'onnx': 0.07142857142857142,
  '–∏': 0.07142857142857142,
  '–∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é': 0.07142857142857142,
  '–∑–∞–æ–¥–Ω–æ': 0.07142857142857142},
 '–ì–æ—Ç–æ–≤.': {'–≥–æ—Ç–æ–≤': 1.0},
 '–ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏': {'–Ω–∞–ø—Ä–∞–≤–∏–ª': 0.2,
  '–∂–µ–ª–∞–Ω–∏–µ': 0.2,
  '—É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å': 0.2,
  '–≤': 0.2,
  '–ª–∏—á–∫—É': 0.2},
 '–ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ, –ø–æ–∂': {'–ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ': 0.3333333333333333,
  '–ø–æ–∂–∞–ª—É–π—Å—Ç–∞': 0.3333333333333333,
  '–ª–∏—á–∫—É': 0.3333333333333333},
 '–ê –Ω–µ —Ö–æ—á–µ—à—å –æ–±—ä': {'

In [27]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [28]:
# For each word, the number of sentence tables in freq_matrix that contain it; displayed below.
count_doc_per_words = _create_documents_per_words(freq_matrix)
count_doc_per_words

{'–¥–æ–±—Ä–æ–µ': 1,
 '—É—Ç—Ä–æ': 1,
 'pavel_kikin': 3,
 '–º–æ–∂–Ω–æ': 22,
 '–¥–≤–µ': 1,
 '—Ç–µ–º—ã': 2,
 '–¥–ª—è': 29,
 '–¥–æ–∫–ª–∞–¥–∞': 1,
 '—Å–æ–≤–º–µ—Å—Ç–∏—Ç—å': 1,
 '—Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å': 3,
 '–ø—Ä–æ': 13,
 'tensortt': 1,
 'onnx': 1,
 '–∏': 84,
 '–∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é': 1,
 '–∑–∞–æ–¥–Ω–æ': 1,
 '–≥–æ—Ç–æ–≤': 5,
 '–Ω–∞–ø—Ä–∞–≤–∏–ª': 1,
 '–∂–µ–ª–∞–Ω–∏–µ': 2,
 '—É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å': 1,
 '–≤': 83,
 '–ª–∏—á–∫—É': 7,
 '–ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ': 2,
 '–ø–æ–∂–∞–ª—É–π—Å—Ç–∞': 1,
 '–∞': 37,
 '–Ω–µ': 77,
 '—Ö–æ—á–µ—à—å': 1,
 '–æ–±—ä–µ–¥–µ–Ω–∏—Ç—å—Å—è': 1,
 '—ç—Ç–æ–π': 1,
 '—è': 51,
 '–±—ã': 14,
 '–µ—â—ë': 11,
 'triton': 1,
 '–¥–æ–±–∞–≤–∏–ª': 1,
 '—Å—é–¥–∞': 2,
 '–±—ã–ª–æ': 10,
 '—Ç–µ–º—É': 6,
 '—Ç—Ä–∏—Ç–æ–Ω': 1,
 '—ç—Ç–æ': 36,
 '—Å–∫–æ—Ä–µ–µ': 1,
 '–∫': 14,
 'tf': 2,
 'serving': 2,
 '–∫—Å—Ç–∞—Ç–∏': 3,
 '–∫—Ç–æ': 12,
 '—É–∂–µ': 17,
 '–ø—Ä–æ–±–æ–≤–∞–ª': 1,
 'dl': 1,
 '–º–æ–¥–µ–ª–µ–π': 1,
 '–∫–∞–∫–æ–µ': 4,
 '—É': 22,
 '–≤–∞—Å': 6,
 '–ø–æ–ª—É—á–∞–ª–æ—Å—å': 1,
 '—Å—Ä–µ–¥–Ω–µ–µ': 1,
 '–≤—Ä–µ–º—è':

In [29]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [30]:
# IDF per word per sentence, using the number of sentences as the document total; displayed below.
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, len(sentences))
idf_matrix

{'–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.': {'–¥–æ–±—Ä–æ–µ': 2.578639209968072, '—É—Ç—Ä–æ': 2.578639209968072},
 '@pavel_kikin, –º': {'pavel_kikin': 2.10151795524841,
  '–º–æ–∂–Ω–æ': 1.236216529145866,
  '–¥–≤–µ': 2.578639209968072,
  '—Ç–µ–º—ã': 2.2776092143040914,
  '–¥–ª—è': 1.1162412120691163,
  '–¥–æ–∫–ª–∞–¥–∞': 2.578639209968072,
  '—Å–æ–≤–º–µ—Å—Ç–∏—Ç—å': 2.578639209968072,
  '—Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å': 2.10151795524841,
  '–ø—Ä–æ': 1.4646958576612357,
  'tensortt': 2.578639209968072,
  'onnx': 2.578639209968072,
  '–∏': 0.6543599239061907,
  '–∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é': 2.578639209968072,
  '–∑–∞–æ–¥–Ω–æ': 2.578639209968072},
 '–ì–æ—Ç–æ–≤.': {'–≥–æ—Ç–æ–≤': 1.8796692056320534},
 '–ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏': {'–Ω–∞–ø—Ä–∞–≤–∏–ª': 2.578639209968072,
  '–∂–µ–ª–∞–Ω–∏–µ': 2.2776092143040914,
  '—É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å': 2.578639209968072,
  '–≤': 0.6595611175919984,
  '–ª–∏—á–∫—É': 1.7335411699538155},
 '–ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ, –ø–æ–∂': {'–ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ': 2.2776092143040914,
  '–ø–æ–∂–∞–ª—É–π—Å—Ç–∞': 2

In [31]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [32]:
# TF-IDF weight per word per sentence; displayed below.
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
tf_idf_matrix

{'–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.': {'–¥–æ–±—Ä–æ–µ': 1.289319604984036, '—É—Ç—Ä–æ': 1.289319604984036},
 '@pavel_kikin, –º': {'pavel_kikin': 0.15010842537488642,
  '–º–æ–∂–Ω–æ': 0.08830118065327613,
  '–¥–≤–µ': 0.18418851499771943,
  '—Ç–µ–º—ã': 0.16268637245029224,
  '–¥–ª—è': 0.07973151514779402,
  '–¥–æ–∫–ª–∞–¥–∞': 0.18418851499771943,
  '—Å–æ–≤–º–µ—Å—Ç–∏—Ç—å': 0.18418851499771943,
  '—Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å': 0.15010842537488642,
  '–ø—Ä–æ': 0.10462113269008826,
  'tensortt': 0.18418851499771943,
  'onnx': 0.18418851499771943,
  '–∏': 0.0467399945647279,
  '–∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é': 0.18418851499771943,
  '–∑–∞–æ–¥–Ω–æ': 0.18418851499771943},
 '–ì–æ—Ç–æ–≤.': {'–≥–æ—Ç–æ–≤': 1.8796692056320534},
 '–ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏': {'–Ω–∞–ø—Ä–∞–≤–∏–ª': 0.5157278419936144,
  '–∂–µ–ª–∞–Ω–∏–µ': 0.4555218428608183,
  '—É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å': 0.5157278419936144,
  '–≤': 0.13191222351839968,
  '–ª–∏—á–∫—É': 0.3467082339907631},
 '–ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ, –ø–æ–∂': {'–ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ': 0.7592030714346971,

In [33]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        #sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence if count_words_in_sentence > 0 else 0
        sentenceValue[sent] = total_score_per_sentence 
    return sentenceValue


In [34]:
# One score per sentence key (sum of its words' TF-IDF weights); displayed below.
sentenceValue = _score_sentences(tf_idf_matrix)
sentenceValue

{'–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ.': 2.578639209968072,
 '@pavel_kikin, –º': 2.0716166512399874,
 '–ì–æ—Ç–æ–≤.': 1.8796692056320534,
 '–ù–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏': 1.9655979843572098,
 '–ü–æ—Å–º–æ—Ç—Ä–∏—Ç–µ, –ø–æ–∂': 2.1965965314086597,
 '–ê –Ω–µ —Ö–æ—á–µ—à—å –æ–±—ä': 1.8331934324248702,
 '–Ø –±—ã –µ—â—ë triton': 2.0199361114371355,
 '–¢—Ä–∏—Ç–æ–Ω —ç—Ç–æ —Å–∫–æ—Ä': 2.027890788672491,
 '–ö—Å—Ç–∞—Ç–∏, –∞ –∫—Ç–æ —É': 1.9966413172335051,
 '–ö–∞–∫–æ–µ —É –≤–∞—Å –ø–æ–ª': 1.936857506405942,
 '–¢–∞–∫ –æ—Ç –º–æ–¥–µ–ª–∏, ': 2.143873170250768,
 '–ó–Ω–∞—é—â–∏–µ –ª—é–¥–∏, –ø': 2.1507598492355684,
 '–ï—Å–ª–∏ —Å–≤–æ–µ–≥–æ –ø—Ä–æ': 1.7407864354510012,
 '2) –í—ã–ø–æ–ª–Ω–µ–Ω–∏–µ –∑': 2.0793110782469966,
 '–î–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –ª–∏ –±': 2.079749591987221,
 '–ï—Å–ª–∏ –æ–±–ª–∞–∫–æ, —Ç–æ': 1.7946962122755474,
 '–ó–∞—Ä–∞–Ω–µ–µ —Å–ø–∞—Å–∏–±–æ': 1.9482349238877399,
 '>  –ï—Å–ª–∏ —Å–≤–æ–µ–≥–æ ': 1.7407864354510012,
 '–í–æ–∑–º–æ–∂–Ω–æ, —á—Ç–æ-—Ç': 1.8003179812073018,
 '–ù–∞ –∫–µ–≥–ª–µ —â–∞—Å –∞—Ö': 2.1017620986897914,


In [35]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [36]:
# Mean sentence score; a later cell scales it to obtain the summary threshold.
avarage = _find_average_score(sentenceValue)
avarage

1.9058468318871509

In [37]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            
            for word, value in tf_idf_matrix[sentence[:15]].items():
                if value >= 0.07:
                     summary += " " + word
                        
            
            summary += ". " 
            
            sentence_count += 1

    return summary

In [38]:
# Summarise using sentences that score at least 1.05 times the mean, then print the result.
summary = _generate_summary(sentences, sentenceValue, 1.05 * avarage)
print(summary)

 –¥–æ–±—Ä–æ–µ —É—Ç—Ä–æ.  pavel_kikin –º–æ–∂–Ω–æ –¥–≤–µ —Ç–µ–º—ã –¥–ª—è –¥–æ–∫–ª–∞–¥–∞ —Å–æ–≤–º–µ—Å—Ç–∏—Ç—å —Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å –ø—Ä–æ tensortt onnx –∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é –∑–∞–æ–¥–Ω–æ.  –ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ –ø–æ–∂–∞–ª—É–π—Å—Ç–∞ –ª–∏—á–∫—É.  —è –±—ã –µ—â—ë triton –¥–æ–±–∞–≤–∏–ª —Å—é–¥–∞ –±—ã–ª–æ –≤ —Ç–µ–º—É.  —Ç—Ä–∏—Ç–æ–Ω —ç—Ç–æ —Å–∫–æ—Ä–µ–µ –∫ tf serving.  —Ç–∞–∫ –æ—Ç –º–æ–¥–µ–ª–∏ —Å–µ—Ä–≤–µ—Ä–∞ –∂–µ–ª–µ–∑–∞ –≤—Å—ë –∑–∞–≤–∏—Å–∏—Ç.  –∑–Ω–∞—é—â–∏–µ –ª—é–¥–∏ –ø–æ–¥—Å–∫–∞–∂–∏—Ç–µ –Ω–∞—á–∏–Ω–∞—é—â–µ–º—É 1 –Ω–∞—Å–∫–æ–ª—å–∫–æ –ø–æ–Ω—è–ª –ø—Ä–æ—Ö–æ–∂–¥–µ–Ω–∏—è –∫—É—Ä—Å–∞ –ø–æ–Ω–∞–¥–æ–±–∏—Ç—Å—è –∫–∞–∫–æ–π –ª–∏–±–æ —Å—É—â–µ—Å—Ç–≤—É—é—â–∏–π ds –ø—Ä–æ–µ–∫—Ç.  2 –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–µ –∑–∞–¥–∞–Ω–∏–π –ø—Ä–µ–¥–ø–æ–ª–∞–≥–∞–µ—Ç—Å—è –ª–æ–∫–∞–ª—å–Ω–æ –∏–ª–∏ –≤ –æ–±–ª–∞–∫–µ.  –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –ª–∏ –±—É–¥–µ—Ç –¥–ª—è –ª–æ–∫–∞–ª—å–Ω–æ–≥–æ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è –Ω–æ—É—Ç–∞ —Å geforce mx 130.  –Ω–∞ –∫–µ–≥–ª–µ —â–∞—Å –∞—Ö—Ä–∏–Ω–µ–Ω–Ω—ã–π —Å–æ—Ä–µ–≤ —Å –ø—Ç–∏—á–∫–∞–º–∏ –≥–¥–µ —Å–æ –∑–≤—É–∫–æ–º —Ä–∞–±–æ—Ç–∞—Ç—å 

In [39]:
# The whole hand-written TF-IDF summarisation, end to end, in one cell.
total_documents = len(sentences)
# Word counts per sentence.
freq_matrix = _create_frequency_matrix(sentences)
# Term frequencies per sentence.
tf_matrix = _create_tf_matrix(freq_matrix)
# Number of sentences containing each word.
count_doc_per_words = _create_documents_per_words(freq_matrix)
# Inverse document frequencies, then their product with the term frequencies.
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
# One score per sentence.
sentence_scores = _score_sentences(tf_idf_matrix)
# NOTE: `threshold` holds the mean score here; the cut-off actually applied is 0.8 times it.
threshold = _find_average_score(sentence_scores)
summary = _generate_summary(sentences, sentence_scores, 0.8 * threshold)

In [40]:
# Display the summary string produced by the previous cell.
summary

' –¥–æ–±—Ä–æ–µ —É—Ç—Ä–æ.  pavel_kikin –º–æ–∂–Ω–æ –¥–≤–µ —Ç–µ–º—ã –¥–ª—è –¥–æ–∫–ª–∞–¥–∞ —Å–æ–≤–º–µ—Å—Ç–∏—Ç—å —Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å –ø—Ä–æ tensortt onnx –∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏—é –∑–∞–æ–¥–Ω–æ.  –≥–æ—Ç–æ–≤.  –Ω–∞–ø—Ä–∞–≤–∏–ª –∂–µ–ª–∞–Ω–∏–µ —É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å –≤ –ª–∏—á–∫—É.  –ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ –ø–æ–∂–∞–ª—É–π—Å—Ç–∞ –ª–∏—á–∫—É.  –∞ –Ω–µ —Ö–æ—á–µ—à—å –æ–±—ä–µ–¥–µ–Ω–∏—Ç—å—Å—è –¥–ª—è —ç—Ç–æ–π —Ç–µ–º—ã.  —è –±—ã –µ—â—ë triton –¥–æ–±–∞–≤–∏–ª —Å—é–¥–∞ –±—ã–ª–æ –≤ —Ç–µ–º—É.  —Ç—Ä–∏—Ç–æ–Ω —ç—Ç–æ —Å–∫–æ—Ä–µ–µ –∫ tf serving.  –∫—Å—Ç–∞—Ç–∏ –∞ –∫—Ç–æ —É–∂–µ –ø—Ä–æ–±–æ–≤–∞–ª serving dl –º–æ–¥–µ–ª–µ–π.  –∫–∞–∫–æ–µ —É –≤–∞—Å –ø–æ–ª—É—á–∞–ª–æ—Å—å —Å—Ä–µ–¥–Ω–µ–µ –≤—Ä–µ–º—è –Ω–∞ –æ—Ç–≤–µ—Ç.  —Ç–∞–∫ –æ—Ç –º–æ–¥–µ–ª–∏ —Å–µ—Ä–≤–µ—Ä–∞ –∂–µ–ª–µ–∑–∞ –≤—Å—ë –∑–∞–≤–∏—Å–∏—Ç.  –∑–Ω–∞—é—â–∏–µ –ª—é–¥–∏ –ø–æ–¥—Å–∫–∞–∂–∏—Ç–µ –Ω–∞—á–∏–Ω–∞—é—â–µ–º—É 1 –Ω–∞—Å–∫–æ–ª—å–∫–æ –ø–æ–Ω—è–ª –ø—Ä–æ—Ö–æ–∂–¥–µ–Ω–∏—è –∫—É—Ä—Å–∞ –ø–æ–Ω–∞–¥–æ–±–∏—Ç—Å—è –∫–∞–∫–æ–π –ª–∏–±–æ —Å—É—â–µ—Å—Ç–≤—É—é—â–∏–π ds –ø—Ä–æ–µ–∫—Ç.  –µ—Å–ª–∏ —Å–≤–æ–µ–≥–æ –ø