## Import libraries

In [16]:
import pandas as pd
import numpy as np

import ast

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

from sklearn.linear_model import LogisticRegression

import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (16, 12)

In [17]:
import emoji
print(emoji.demojize('Python is üòòüòçüíûüòôüå∏üòö‚ô•Ô∏è'))

Python is :face_blowing_a_kiss::smiling_face_with_heart-eyes::revolving_hearts::kissing_face_with_smiling_eyes::cherry_blossom::kissing_face_with_closed_eyes::heart_suit:


## Raw data reading

In [18]:
df = pd.read_csv('dataset/facebook_yessenov_2.csv')
df = df.loc[df.comments.dropna().index]
df['comments_dict'] = df.comments.apply(ast.literal_eval)
df

Unnamed: 0,id,created_time,message,story,comments,comments_dict
1,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,üìå13 —Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ Yessenov University “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,{'data': [{'created_time': '2021-04-13T18:52:1...,{'data': [{'created_time': '2021-04-13T18:52:1...
2,1849247698620787_2868147476730799,2021-04-13T15:39:30+0000,üìåYessenov University –ë—ñ–ª—ñ–º –±–µ—Ä—É –º–µ–∫—Ç–µ–±—ñ–Ω—ñ“£ “±–π—ã...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,{'data': [{'created_time': '2021-04-13T19:55:3...,{'data': [{'created_time': '2021-04-13T19:55:3...
3,1849247698620787_2867867713425442,2021-04-13T06:38:10+0000,Yessenov University –±–∞—Ä–ª—ã“õ—Ç–∞—Ä—ã“£—ã–∑–¥—ã “õ–∞—Å–∏–µ—Ç—Ç—ñ –†...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,{'data': [{'created_time': '2021-04-13T08:25:5...,{'data': [{'created_time': '2021-04-13T08:25:5...
5,1849247698620787_2867838203428393,2021-04-13T05:52:32+0000,"üìå12-—à—ñ —Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ Yessenov University-–¥–µ ""–¢“±–ª“ì...",Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,{'data': [{'created_time': '2021-04-13T07:48:5...,{'data': [{'created_time': '2021-04-13T07:48:5...
6,1849247698620787_2867459396799607,2021-04-12T19:12:44+0000,üìå2 —Å”ô—É—ñ—Ä –∂”ô–Ω–µ 9 —Å”ô—É—ñ—Ä –∫“Ø–Ω–¥–µ—Ä—ñ Yessenov Univers...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,{'data': [{'created_time': '2021-04-13T19:56:2...,{'data': [{'created_time': '2021-04-13T19:56:2...
...,...,...,...,...,...,...
1674,1849247698620787_1963533393858883,2017-10-20T05:17:40+0000,–®.–ï—Å–µ–Ω–æ–≤ –∞—Ç—ã–Ω–¥–∞“ì—ã –ö–ú–¢–ò–£-–¥—ã“£ “õ“±—Ä–º–µ—Ç—Ç—ñ –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä...,Yessenov University –¥–æ–±–∞–≤–∏–ª(-–∞) 16 –Ω–æ–≤—ã—Ö —Ñ–æ—Ç–æ ...,{'data': [{'created_time': '2017-10-20T14:54:2...,{'data': [{'created_time': '2017-10-20T14:54:2...
1678,1849247698620787_1963498200529069,2017-10-25T10:11:23+0000,–ü—Ä–æ—Ñ–µ—Å—Å–æ—Ä –ë.–ò. –ù“±—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞–Ω—ã“£ –¥”ô—Ä—ñ—Å—ñ\n\n¬´“ö–∞–∑–∞...,Yessenov University –¥–æ–±–∞–≤–∏–ª(-–∞) 6 –Ω–æ–≤—ã—Ö —Ñ–æ—Ç–æ –æ...,{'data': [{'created_time': '2017-10-25T16:21:1...,{'data': [{'created_time': '2017-10-25T16:21:1...
1692,1849247698620787_1954940424718180,2017-10-04T14:36:44+0000,¬´“∞—Å—Ç–∞–∑–¥–∞—Ä- “±–ª–∞“ì–∞—Ç—Ç—ã —Ç“±–ª“ì–∞¬ª –∞—Ç—Ç—ã –º–µ—Ä–µ–∫–µ–ª—ñ–∫ —à–∞—Ä–∞...,,{'data': [{'created_time': '2017-10-07T14:37:0...,{'data': [{'created_time': '2017-10-07T14:37:0...
1698,1849247698620787_1951614865050736,2017-09-26T11:20:46+0000,–£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –æ“õ—ã—Ç—É—à—ã–ª–∞—Ä—ã –º–µ–Ω —Å—Ç—É–¥–µ–Ω—Ç—Ç–µ—Ä—ñ –•–∞–ª—ã“õ–∞...,,{'data': [{'created_time': '2017-09-29T14:17:0...,{'data': [{'created_time': '2017-09-29T14:17:0...


In [19]:
comment_list = []

for post in df.values.tolist():
    for comment in post[5]['data']:
        comment_list.append(post[:4] + [comment['id'], comment['message'], comment['created_time']])

In [20]:
comment_df = pd.DataFrame(comment_list, 
                          columns=['id', 'created_time', 'post_text', 'story', 
                                   'comment_id', 'comment_text', 'comment_created_time'])
comment_df = comment_df[comment_df.comment_text.apply(lambda x: True if len(x.split(' ')) > 1 else False)]

In [21]:
import re

def cleaning_data(text):
#     if text.split(' ')[0][0] == "@":
#         text = ' '.join(text.split(" ")[1:])
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)
    text = str(text).replace('(<br/>)', '')
    text = text.replace('(<a).*(>).*(</a>)', '')
    text = text.replace('(&amp)', '')
    text = text.replace('(&gt)', '')
    text = text.replace('(&lt)', '')
    text = text.replace('(\xa0)', ' ')
    text = text.replace('-', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = re.sub('[^–ê-–Ø,–∞-—è,”ò,–Ü,“¢,“í,“Æ,“∞,“ö,”®,“∫,”ô,—ñ,…ô,“£,“ì,“Ø,“±,“õ,”©,“ª]', ' ', str(text).replace('-', ''))
    text = re.sub('_', '', text)
    text = re.sub('\s+', ' ', text)
    return str(text).lower().strip()

In [22]:
comment_df['post_text'] = comment_df.post_text.astype(str).apply(cleaning_data)
comment_df['comment_text'] = comment_df.comment_text.astype(str).apply(cleaning_data)
comment_df.head()

Unnamed: 0,id,created_time,post_text,story,comment_id,comment_text,comment_created_time
0,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868277350051145,–∂–∞—Ä–∞–π—Å—ã–Ω–¥–∞—Ä,2021-04-13T18:52:15+0000
1,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868155300063350,—Ç–∞“ì—ã–ª—ã–º–¥—ã –¥”ô—Ä—ñ—Å –±–æ–ª–¥—ã,2021-04-13T15:54:57+0000
2,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868620773350136,"—Ä–∞—Ö–º–µ—Ç —É–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—à—ã–ª–∞—Ä–≥–∞, –∫—ã–∑—ã–∫—Ç—ã –∫–µ–∑–¥–µ—Å—É –±–æ–ª–¥—ã",2021-04-14T04:04:54+0000
3,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868183710060509,–æ“£ –±–æ–ª—Å—ã–Ω –±–∞—Å—Ç–∞–º–∞,2021-04-13T16:52:29+0000
4,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868348833377330,–∂–∞“õ—Å—ã –¥”ô—Ä—ñ—Å,2021-04-13T19:55:12+0000


In [23]:
filtired_df = comment_df[comment_df['comment_text'].apply(lambda x: True if len(x.split(' ')) > 1 else False)]
filtired_df

Unnamed: 0,id,created_time,post_text,story,comment_id,comment_text,comment_created_time
1,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868155300063350,—Ç–∞“ì—ã–ª—ã–º–¥—ã –¥”ô—Ä—ñ—Å –±–æ–ª–¥—ã,2021-04-13T15:54:57+0000
2,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868620773350136,"—Ä–∞—Ö–º–µ—Ç —É–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—à—ã–ª–∞—Ä–≥–∞, –∫—ã–∑—ã–∫—Ç—ã –∫–µ–∑–¥–µ—Å—É –±–æ–ª–¥—ã",2021-04-14T04:04:54+0000
3,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868183710060509,–æ“£ –±–æ–ª—Å—ã–Ω –±–∞—Å—Ç–∞–º–∞,2021-04-13T16:52:29+0000
4,1849247698620787_2868153750063505,2021-04-13T15:51:44+0000,—Å”ô—É—ñ—Ä –∫“Ø–Ω—ñ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω –∂–µ–º“õ–æ—Ä–ª—ã“õ“õ–∞ “õ–∞—Ä—Å—ã –º...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868153750063505_2868348833377330,–∂–∞“õ—Å—ã –¥”ô—Ä—ñ—Å,2021-04-13T19:55:12+0000
5,1849247698620787_2868147476730799,2021-04-13T15:39:30+0000,–±—ñ–ª—ñ–º –±–µ—Ä—É –º–µ–∫—Ç–µ–±—ñ–Ω—ñ“£ “±–π—ã–º–¥–∞—Å—Ç—ã—Ä—É—ã–º–µ–Ω —Å”ô—É—ñ—Ä –∫“Ø...,Yessenov University —Å–µ–π—á–∞—Å –∑–¥–µ—Å—å: Yessenov Uni...,2868147476730799_2868349103377303,–∂–∞“õ—Å—ã –±–∞—Å—Ç–∞–º–∞,2021-04-13T19:55:30+0000
...,...,...,...,...,...,...,...
4588,1849247698620787_1963498200529069,2017-10-25T10:11:23+0000,–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –± –∏ –Ω“±—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞–Ω—ã“£ –¥”ô—Ä—ñ—Å—ñ “õ–∞–∑–∞“õ —Ç—ñ–ª...,Yessenov University –¥–æ–±–∞–≤–∏–ª(-–∞) 6 –Ω–æ–≤—ã—Ö —Ñ–æ—Ç–æ –æ...,1963498040529085_1963508637194692,"–±–∏–±–∞–π—à–∞ –∏–ª—å—è—Å–æ–≤–Ω–∞, –º—ã“õ—Ç—ã –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä",2017-10-25T10:41:57+0000
4589,1849247698620787_1954940424718180,2017-10-04T14:36:44+0000,“±—Å—Ç–∞–∑–¥–∞—Ä “±–ª–∞“ì–∞—Ç—Ç—ã —Ç“±–ª“ì–∞ –∞—Ç—Ç—ã –º–µ—Ä–µ–∫–µ–ª—ñ–∫ —à–∞—Ä–∞ ”©—Ç...,,1954940424718180_1956190337926522,—Å–ø–∞—Å–∏–±–æ –ø—Ä–æ—Ñ–∫–æ–º—É –∑–∞ —Ç –ø–ª—ã–µ –ø–æ–∑–¥—Ä–∞–≤–ª–µ–Ω–∏—è –∏ –ø–æ–¥–∞...,2017-10-07T14:37:06+0000
4592,1849247698620787_1954940424718180,2017-10-04T14:36:44+0000,“±—Å—Ç–∞–∑–¥–∞—Ä “±–ª–∞“ì–∞—Ç—Ç—ã —Ç“±–ª“ì–∞ –∞—Ç—Ç—ã –º–µ—Ä–µ–∫–µ–ª—ñ–∫ —à–∞—Ä–∞ ”©—Ç...,,1954940424718180_1956628157882740,—Å–µ–º–±–µ–∫ –∫—Ä–∞—Å–∞–≤—á–∏–∫,2017-10-08T14:10:34+0000
4593,1849247698620787_1951614865050736,2017-09-26T11:20:46+0000,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –æ“õ—ã—Ç—É—à—ã–ª–∞—Ä—ã –º–µ–Ω —Å—Ç—É–¥–µ–Ω—Ç—Ç–µ—Ä—ñ —Ö–∞–ª—ã“õ–∞...,,1951614865050736_1952841451594744,–±–µ—Ä—ñ–∫ –±–∞“õ—ã—Ç–∂–∞–Ω“±–ª—ã –±“±–ª –∂–æ–æ –Ω—ã–Ω –∂–æ“ì–∞—Ä—ã –¥–µ“£–≥–µ–π–≥–µ ...,2017-09-29T14:17:09+0000


## Data reading

In [24]:
filtired_df = pd.read_csv('translated.txt', sep='\t')
filtired_df.columns = columns=['id', 'created_time', 'post_text', 'story', 
                                   'comment_id', 'comment_text', 'comment_created_time']
filtired_df

Unnamed: 0,id,created_time,post_text,story,comment_id,comment_text,comment_created_time
1,1849247698620787_2868153750063505,2021-04-13T15: 51: 44 + 0000,–ì-–Ω –ï—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω-–≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,–£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –ï—Å–µ–Ω–æ–≤–∞ —Ç–µ–ø–µ—Ä—å –∑–¥–µ—Å—å: –£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç ...,2868153750063505_2868155300063350,–±—ã–ª–∞ –ø–æ—É—á–∏—Ç–µ–ª—å–Ω–æ–π –ª–µ–∫—Ü–∏–µ–π,2021-04-13T15: 54: 57 + 0000
2,1849247698620787_2868153750063505,2021-04-13T15: 51: 44 + 0000,–ì-–Ω –ï—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω-–≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,–£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –ï—Å–µ–Ω–æ–≤–∞ —Ç–µ–ø–µ—Ä—å –∑–¥–µ—Å—å: –£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç ...,2868153750063505_2868620773350136,"–°–ø–∞—Å–∏–±–æ –æ—Ä–≥–∞–Ω–∏–∑–∞—Ç–æ—Ä–∞–º, —ç—Ç–æ –±—ã–ª–∞ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–∞—è –≤—Å—Ç...",2021-04-14T04: 04: 54 + 0000
3,1849247698620787_2868153750063505,2021-04-13T15: 51: 44 + 0000,–ì-–Ω –ï—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω-–≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,–£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –ï—Å–µ–Ω–æ–≤–∞ —Ç–µ–ø–µ—Ä—å –∑–¥–µ—Å—å: –£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç ...,2868153750063505_2868183710060509,–ü—É—Å—Ç—å –∏–Ω–∏—Ü–∏–∞—Ç–∏–≤–∞ –±—É–¥–µ—Ç –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–π,2021-04-13T16: 52: 29 + 0000
4,1849247698620787_2868153750063505,2021-04-13T15: 51: 44 + 0000,–ì-–Ω –ï—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω-–≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,–£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –ï—Å–µ–Ω–æ–≤–∞ —Ç–µ–ø–µ—Ä—å –∑–¥–µ—Å—å: –£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç ...,2868153750063505_2868348833377330,—Ö–æ—Ä–æ—à–∞—è –ª–µ–∫—Ü–∏—è,2021-04-13T19: 55: 12 + 0000
5,1849247698620787_2868147476730799,2021-04-13T15: 39: 30 + 0000,–ö—Ä—É–≥–ª—ã–π —Å—Ç–æ–ª –Ω–∞ —Ç–µ–º—É ¬´–ú–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –∏...,–£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –ï—Å–µ–Ω–æ–≤–∞ —Ç–µ–ø–µ—Ä—å –∑–¥–µ—Å—å: –£–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç ...,2868147476730799_2868349103377303,—Ö–æ—Ä–æ—à–µ–µ –Ω–∞—á–∞–ª–æ,2021-04-13T19: 55: 30 + 0000
...,...,...,...,...,...,...,...
4588,1849247698620787_1963498200529069,2017-10-25T10: 11: 23 + 0000,"–ü—Ä–æ—Ñ–µ—Å—Å–æ—Ä –ë. –ù—É—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞, –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –∫–∞—Ñ–µ–¥—Ä—ã –∫...",–ï—Å–µ–Ω–æ–≤—Å–∫–∏–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –¥–æ–±–∞–≤–∏–ª 6 –Ω–æ–≤—ã—Ö —Ñ–æ—Ç–æ–≥—Ä–∞...,1963498040529085_1963508637194692,"–ë–∏–±–∞–π—à–∞ –ò–ª—å—è—Å–æ–≤–Ω–∞, —Å–∏–ª—å–Ω—ã–π –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä",2017-10-25T10: 41: 57 + 0000
4589,1849247698620787_1954940424718180,2017-10-04T14: 36: 44 + 0000,–í –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ –®. –ï...,NotAddedInfo,1954940424718180_1956190337926522,—Å–ø–∞—Å–∏–±–æ –ø—Ä–æ—Ñ—Å–æ—é–∑—É –∑–∞ —Ç—Ä–µ–ø—ã–µ –ø–æ–∑–¥—Ä–∞–≤–ª–µ–Ω–∏—è –∏ –ø–æ–¥...,2017-10-07T14: 37: 06 + 0000
4592,1849247698620787_1954940424718180,2017-10-04T14: 36: 44 + 0000,–í –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ –®. –ï...,NotAddedInfo,1954940424718180_1956628157882740,–°–µ–º–±–µ–∫ –∫—Ä–∞—Å–∏–≤,2017-10-08T14: 10:34 + 0000
4593,1849247698620787_1951614865050736,2017-09-26T11: 20: 46 + 0000,–†–µ–∫—Ç–æ—Ä—ã –∏ —Å—Ç—É–¥–µ–Ω—Ç—ã —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–∞ –ø—Ä–∏–Ω—è–ª–∏ —É—á–∞—Å—Ç–∏...,NotAddedInfo,1951614865050736_1952841451594744,"–ü–æ–Ω—è—Ç–Ω–æ, —á—Ç–æ –ë–µ—Ä–∏–∫ –ë–∞–∫—ã—Ç–∂–∞–Ω–æ–≤–∏—á –ø–æ–¥–Ω–∏–º–µ—Ç —ç—Ç–æ—Ç ...",2017-09-29T14: 17: 09 + 0000


## Sentimental model classifier

In [10]:
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)



In [11]:
results = pd.DataFrame(model.predict(filtired_df.comment_text.astype(str)))
results['post_id'] = filtired_df['id'].values
results['post_text'] = filtired_df['post_text'].astype(str).apply(cleaning_data).values
results['comment_text'] = filtired_df['comment_text'].astype(str).apply(cleaning_data).values
results['comment_id'] = filtired_df['comment_id'].values
results = results[['post_id', 'post_text', 'comment_text',
                   'comment_id', 'positive', 'negative', 'neutral', 'skip', 'speech', ]]
results['negative_prob'] = (results.negative / results.loc[:, ['negative', 'positive',
                                                             'skip', 'neutral', 'speech',]].sum(axis=1)) * 100
results['positive_prob'] = (results.positive / results.loc[:, ['negative', 'positive',
                                                             'skip', 'neutral', 'speech',]].sum(axis=1)) * 100
results['negative_boolean'] = results.negative_prob > results.positive_prob
results

Unnamed: 0,post_id,post_text,comment_text,comment_id,positive,negative,neutral,skip,speech,negative_prob,positive_prob,negative_boolean
0,1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,–±—ã–ª–∞ –ø–æ—É—á–∏—Ç–µ–ª—å–Ω–æ–π –ª–µ–∫—Ü–∏–µ–π,2868153750063505_2868155300063350,0.160276,0.015435,0.901931,0.003955,0.000789,1.425975,14.807658,False
1,1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,"—Å–ø–∞—Å–∏–±–æ –æ—Ä–≥–∞–Ω–∏–∑–∞—Ç–æ—Ä–∞–º, —ç—Ç–æ –±—ã–ª–∞ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–∞—è –≤—Å—Ç...",2868153750063505_2868620773350136,0.206904,0.008857,0.026769,0.014967,0.075868,2.656946,62.065143,False
2,1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,–ø—É—Å—Ç—å –∏–Ω–∏—Ü–∏–∞—Ç–∏–≤–∞ –±—É–¥–µ—Ç –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–π,2868153750063505_2868183710060509,0.019134,0.008587,0.966924,0.005070,0.009718,0.850723,1.895521,False
3,1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,—Ö–æ—Ä–æ—à–∞—è –ª–µ–∫—Ü–∏—è,2868153750063505_2868348833377330,0.960371,0.001711,0.015916,0.000617,0.000010,0.174809,98.134711,False
4,1849247698620787_2868147476730799,–∫—Ä—É–≥–ª—ã–π —Å—Ç–æ–ª –Ω–∞ —Ç–µ–º—É –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –∏ ...,—Ö–æ—Ä–æ—à–µ–µ –Ω–∞—á–∞–ª–æ,2868147476730799_2868349103377303,0.992664,0.003955,0.002811,0.006703,0.000010,0.393126,98.660312,False
...,...,...,...,...,...,...,...,...,...,...,...,...
3950,1849247698620787_1963498200529069,"–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –± –Ω—É—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞, –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –∫–∞—Ñ–µ–¥—Ä—ã –∫–∞...","–±–∏–±–∞–π—à–∞ –∏–ª—å—è—Å–æ–≤–Ω–∞, —Å–∏–ª—å–Ω—ã–π –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä",1963498040529085_1963508637194692,0.036230,0.031154,0.500010,0.164526,0.000921,4.251101,4.943778,False
3951,1849247698620787_1954940424718180,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ —à –µ—Å...,—Å–ø–∞—Å–∏–±–æ –ø—Ä–æ—Ñ—Å–æ—é–∑—É –∑–∞ —Ç—Ä–µ–ø—ã–µ –ø–æ–∑–¥—Ä–∞–≤–ª–µ–Ω–∏—è –∏ –ø–æ–¥...,1954940424718180_1956190337926522,0.362979,0.024433,0.007356,0.042098,0.798197,1.978288,29.389538,False
3952,1849247698620787_1954940424718180,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ —à –µ—Å...,—Å–µ–º–±–µ–∫ –∫—Ä–∞—Å–∏–≤,1954940424718180_1956628157882740,0.991163,0.000010,0.006300,0.003183,0.000010,0.000999,99.050373,False
3953,1849247698620787_1951614865050736,—Ä–µ–∫—Ç–æ—Ä—ã –∏ —Å—Ç—É–¥–µ–Ω—Ç—ã —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–∞ –ø—Ä–∏–Ω—è–ª–∏ —É—á–∞—Å—Ç–∏...,"–ø–æ–Ω—è—Ç–Ω–æ, —á—Ç–æ –±–µ—Ä–∏–∫ –±–∞–∫—ã—Ç–∂–∞–Ω–æ–≤–∏—á –ø–æ–¥–Ω–∏–º–µ—Ç —ç—Ç–æ—Ç ...",1951614865050736_1952841451594744,0.080367,0.268951,0.577505,0.106701,0.002561,25.958414,7.756835,True


In [None]:
from datetime import datetime
filtired_df['post_datetime'] = filtired_df.media_created_at.apply(lambda x: datetime.utcfromtimestamp(x).strftime('%d.%m.%Y %H:%M:%S'))
filtired_df['comment_datetime'] = filtired_df.comment_created_at.apply(lambda x: datetime.utcfromtimestamp(x).strftime('%d.%m.%Y %H:%M:%S'))
filtired_df

In [13]:
results.to_excel('excel/facebook_comments.xlsx')

## Top10 words for posts

In [14]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("russian")

import stanza
nlp = stanza.Pipeline(lang='ru', processors = "tokenize,lemma", tokenize_batch_size=16)

def stemming(sentences):
    return ' '.join([stemmer.stem(word) for word in sentences.split()])

def lemmatizing(text):
    return ' '.join([j.lemma for i in nlp(text).sentences for j in i.words])

2021-04-22 13:07:08 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| lemma     | syntagrus |

2021-04-22 13:07:08 INFO: Use device: gpu
2021-04-22 13:07:08 INFO: Loading: tokenize
2021-04-22 13:07:10 INFO: Loading: lemma
2021-04-22 13:07:10 INFO: Done loading processors!


In [15]:
lemmatized = results.drop_duplicates('post_id').post_text.apply(lemmatizing)

In [18]:
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=[lemmatizing(word) for word in stopwords.words('russian')])
X_tfidf = tfidf.fit_transform(lemmatized).toarray()
vocab = tfidf.vocabulary_
reverse_vocab = {v:k for k,v in vocab.items()}

feature_names = tfidf.get_feature_names()
idx = (-X_tfidf).argsort(axis=1)

tfidf_max10 = idx[:, :20]
tfidf_weight = -np.sort(-X_tfidf, axis=1)[:, :20]

df_tfidf = pd.DataFrame([[reverse_vocab.get(item) for item in row] for row in tfidf_max10])
cl_names = ['top_' + str(i+1) for i in range(20)]
df_tfidf.columns = cl_names
df_tfidf['post_id'] = results.drop_duplicates('post_id')['post_id'].values
df_tfidf = df_tfidf.set_index('post_id')
df_tfidf = df_tfidf[cl_names]
df_tfidf[['weight_' + str(i+1) for i in range(20)]] = -np.sort(-X_tfidf, axis=1)[:, :20]
df_tfidf = df_tfidf[np.array([[df_tfidf.columns[i], df_tfidf.columns[i+20]] for i in range(20)]).reshape(20*2)]
df_tfidf

Unnamed: 0_level_0,top_1,weight_1,top_2,weight_2,top_3,weight_3,top_4,weight_4,top_5,weight_5,...,top_16,weight_16,top_17,weight_17,top_18,weight_18,top_19,weight_19,top_20,weight_20
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1849247698620787_2868153750063505,–≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏–π,0.571383,–∞–Ω—Ç–∏–∫–æ—Ä—Ä—É–ø—Ü–∏–æ–Ω–Ω—ã–π,0.332566,–ø–æ–ª–µ–∑–Ω—ã–π,0.234988,–ø–æ–ª—É—á–∏—Ç—å,0.228441,–æ—Ç–≤–µ—Ç,0.216275,...,–∏—Å—á–µ—Ä–ø—ã–≤–∞—é—â–∏–π,0.121701,—É–∑–Ω–∞—Ç—å,0.121701,—Ö–∏–º–∏—á–µ—Å–∫–∏–π,0.113937,–ø—Ä–∏—Å—É—Ç—Å—Ç–≤–æ–≤–∞—Ç—å,0.113937,–ø–æ–ª–Ω—ã–π,0.108137
1849247698620787_2868147476730799,–∫–æ–¥,0.256901,–≥–∏–º–Ω–∞–∑–∏—è,0.240073,—Å—Å—ã–ª–∫–∞,0.228134,–¥–æ—Å—Ç—É–ø,0.228134,–ø—Ä–∏—Å–æ–µ–¥–∏–Ω–∏—Ç—å—Å—è,0.228134,...,–æ–Ω–ª–∞–π–Ω,0.145920,–º–∞–≥—Ç—Ä–∞–Ω—Ç—Ä–∞–Ω—å,0.128450,–≥–∞–∑–±–∏–ª—å–Ω—ã–π,0.128450,–∞—è—Ç–∂–∞–Ω–Ω—ã–π,0.128450,–∞—Ö–º–µ—Ç–∂–∞–Ω–æ–≤–∏—á—å,0.128450
1849247698620787_2867867713425442,–ø—É—Å—Ç—å,0.547798,–≤–∞—à,0.455083,—Å–≤–µ—Ç,0.401405,–ª–∏—Ü–æ,0.258352,—Å–µ—Ä–¥—Ü–µ,0.237778,...,–º–∏—Ä,0.088344,–ø–æ–∑–¥—Ä–∞–≤–ª—è—Ç—å,0.078057,–ø—Ä–∏–Ω—è—Ç—å,0.069926,–∫–∞–∑–æ–π—Å—Ç–∞–Ω–æ,0.067958,–ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ,0.000000
1849247698620787_2867838203428393,—Ä–æ–ª—å,0.328982,–∞–ª–ª–∞–±–µ—Ä–≥–µ–Ω–Ω—ã–π,0.246977,–∫–æ–Ω–∞—Ä–±–∞–∏,0.246977,–ø—Ä–µ–∑–∏–¥–∏—É–º,0.230799,—Å–µ–º–µ–π–Ω—ã–π,0.203144,...,–ø–æ–ª—É—á–∏—Ç—å,0.131656,–≤–æ–ø—Ä–æ—Å,0.126467,–≤—Ç–æ—Ä–∂–µ–Ω–∏–µ,0.123488,—à—Ç–∞–±,0.123488,–≤—Å—Ç—Ä–µ—á–∞,0.119355
1849247698620787_2863965843815629,–≤–µ—Ç–µ—Ä–∞–Ω,0.503161,–Ω–µ—Ñ—Ç—è–Ω–∏–∫,0.243136,–≤—Å—Ç—Ä–µ—á–∞,0.195832,—Ä–µ–≥–∏–æ–Ω,0.182360,—Ç—Ä—É–¥,0.163995,...,–ª—é–±–∏–º—ã–π,0.133323,–ø—Ä–æ—Ñ–µ—Å—Å–∏—è,0.129287,—Å–ª—É—á–∞–π,0.122706,–æ—Ç–≤–µ—Ç,0.122706,–æ—Ç–∫—Ä—ã—Ç—ã–π,0.117452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849247698620787_1963011817244374,—Å–æ—Ä–µ–≤–Ω–æ–≤–∞–Ω–∏–µ,0.576152,–∫–æ–º–∞–Ω–¥–∞,0.238988,–º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç,0.202276,–ª—É—á—à–∏–π,0.187117,–º–µ—Å—Ç–æ,0.167022,...,—Ñ–∞–∫—É–ª—å—Ç–µ—Ç,0.115696,–º–µ–Ω–¥–∞–ª–∏–µ,0.108725,—Ñ–∏–∑–∫—É–ª—å—Ç—É—Ä–∞,0.108725,–Ω–∞—É—Ä—ã–∑–±—ã–π,0.108725,—Å—É–¥—å—è,0.101604
1849247698620787_1963533393858883,–ø–æ—ç—Ç,0.488105,—ç—Å–µ–Ω–≥–∞–ª—å–Ω—ã–π,0.306257,–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä,0.259042,–ª–∏—Ç–µ—Ä–∞—Ç—É—Ä–∞,0.247884,—á–∏—Ç–∞—Ç—å,0.201523,...,–∑–∞—Ç–µ–º,0.104369,–ø–∏—Å–∞—Ç—å,0.104369,–ª–∞—É—Ä–µ–∞—Ç,0.100761,—Å—Ç—É–¥–µ–Ω—Ç,0.096064,–∫–≥–æ—Ç,0.095068
1849247698620787_1963498200529069,–∂—ã—Ä–∞—Ç—å,0.282157,–Ω–∞—É–∫–∞,0.227470,—Ä–µ—Å–ø—É–±–ª–∏–∫–∞,0.196450,—è–∑—ã–∫,0.194238,–≤–µ–∫,0.192313,...,–±–∏–±–æ–π,0.144235,–ª–µ–∫—Ü–∏—è,0.128160,–ø—Ä–∞–≤–æ,0.118356,—Ç–µ—Ä–º–∏–Ω–æ–ª–æ–≥–∏—è,0.112863,–º–æ–∑–≥–æ–≤–æ–π,0.112863
1849247698620787_1954940424718180,—É—á–∏—Ç–µ–ª—å,0.408871,–º–µ—Ä–æ–ø—Ä–∏—è—Ç–∏–µ,0.247692,–ø–æ–¥–∞—Ä–æ–∫,0.212041,–ø—Ä–∞–∑–¥–Ω–∏—á–Ω—ã–π,0.197984,–¥–µ–∫–∞–Ω,0.194144,...,—Å—Ç—É–¥–µ–Ω—Ç,0.128557,–ø–æ—Å–≤—è—Ç–∏—Ç—å,0.127754,—Å–º—ã—Å–ª,0.127667,–ø—Ä–æ—Ñ—Å–æ—é–∑,0.127667,—Ü–µ–Ω–Ω—ã–π,0.127667


In [20]:
results.groupby('post_id').mean().iloc[:,:7]

Unnamed: 0_level_0,positive,negative,neutral,skip,speech,negative_prob,positive_prob
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1849247698620787_1085067278520596,0.013876,0.167305,0.849451,0.010492,0.000010,15.692532,1.354609
1849247698620787_1951614865050736,0.041315,0.167283,0.492424,0.077784,0.266890,16.088009,3.985583
1849247698620787_1954940424718180,0.677071,0.012222,0.006828,0.022640,0.399103,0.989644,64.219956
1849247698620787_1963011817244374,0.065615,0.010338,0.004209,0.006108,0.968866,0.979794,6.218617
1849247698620787_1963498200529069,0.064464,0.111548,0.345976,0.175835,0.006681,16.318981,9.326533
...,...,...,...,...,...,...,...
2031298790415676_2031370527075169,0.000010,0.000010,1.000010,0.000010,0.000010,0.001000,0.001000
2037357129809842_2037370836475138,0.000010,0.000010,1.000010,0.000010,0.000010,0.001000,0.001000
2041496359395919_2042129615999260,0.000010,0.000010,1.000010,0.000010,0.000010,0.001000,0.001000
2045791372299751_2046020808943474,0.000010,0.000010,1.000010,0.000010,0.000010,0.001000,0.001000


## Clustering

In [22]:
posts_dataframe = results.drop_duplicates('post_id')[['post_id', 'post_text']]
posts_dataframe = posts_dataframe.set_index('post_id')
posts_dataframe

Unnamed: 0_level_0,post_text
post_id,Unnamed: 1_level_1
1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...
1849247698620787_2868147476730799,–∫—Ä—É–≥–ª—ã–π —Å—Ç–æ–ª –Ω–∞ —Ç–µ–º—É –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –∏ ...
1849247698620787_2867867713425442,–ø–æ–∑–¥—Ä–∞–≤–ª—è–µ—Ç –≤—Å–µ—Ö –≤–∞—Å —Å –Ω–∞—Å—Ç—É–ø–ª–µ–Ω–∏–µ–º —Å–≤—è—â–µ–Ω–Ω–æ–≥–æ...
1849247698620787_2867838203428393,–∞–ø—Ä–µ–ª—è –≤ –µ—Å–µ–Ω–æ–≤—Å–∫–æ–º —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –ø—Ä–æ—à–ª–∞ –≤—Å—Ç—Ä–µ—á...
1849247698620787_2863965843815629,–≤ –∞–ø—Ä–µ–ª–µ –≤ —Ñ–æ—Ä–º–∞—Ç–µ –æ—Ç–∫—Ä—ã—Ç–æ–≥–æ –¥–∏–∞–ª–æ–≥–∞ –ø—Ä–æ—à–ª–∞ –≤—Å...
...,...
1849247698620787_1963011817244374,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –∫–∞—Ñ–µ–¥—Ä–æ–π –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç–∞ —ç–∫–æ–Ω...
1849247698620787_1963533393858883,–≤—Å—Ç—Ä–µ—á–∞ —Å –ø–æ—á–µ—Ç–Ω—ã–º –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–æ–º –∫—ã—Ä–≥—ã–∑—Å–∫–æ–≥–æ –≥–æ—Å...
1849247698620787_1963498200529069,"–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –± –Ω—É—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞, –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –∫–∞—Ñ–µ–¥—Ä—ã –∫–∞..."
1849247698620787_1954940424718180,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ —à –µ—Å...


In [23]:
train_df = pd.concat([posts_dataframe, results.groupby('post_id').mean().loc[:,['negative', 'positive',
                                                             'skip', 'neutral', 'speech',]]], axis=1)
train_df

Unnamed: 0,post_text,negative,positive,skip,neutral,speech
1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,0.008648,0.336671,0.006152,0.477885,0.021597
1849247698620787_2868147476730799,–∫—Ä—É–≥–ª—ã–π —Å—Ç–æ–ª –Ω–∞ —Ç–µ–º—É –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –∏ ...,0.014572,0.503589,0.004993,0.498545,0.000010
1849247698620787_2867867713425442,–ø–æ–∑–¥—Ä–∞–≤–ª—è–µ—Ç –≤—Å–µ—Ö –≤–∞—Å —Å –Ω–∞—Å—Ç—É–ø–ª–µ–Ω–∏–µ–º —Å–≤—è—â–µ–Ω–Ω–æ–≥–æ...,0.007664,0.010051,0.014149,0.935019,0.001518
1849247698620787_2867838203428393,–∞–ø—Ä–µ–ª—è –≤ –µ—Å–µ–Ω–æ–≤—Å–∫–æ–º —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –ø—Ä–æ—à–ª–∞ –≤—Å—Ç—Ä–µ—á...,0.011697,0.000468,0.056662,0.012831,0.938134
1849247698620787_2863965843815629,–≤ –∞–ø—Ä–µ–ª–µ –≤ —Ñ–æ—Ä–º–∞—Ç–µ –æ—Ç–∫—Ä—ã—Ç–æ–≥–æ –¥–∏–∞–ª–æ–≥–∞ –ø—Ä–æ—à–ª–∞ –≤—Å...,0.056662,0.196836,0.008072,0.500010,0.082707
...,...,...,...,...,...,...
1849247698620787_1963011817244374,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –∫–∞—Ñ–µ–¥—Ä–æ–π –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç–∞ —ç–∫–æ–Ω...,0.010338,0.065615,0.006108,0.004209,0.968866
1849247698620787_1963533393858883,–≤—Å—Ç—Ä–µ—á–∞ —Å –ø–æ—á–µ—Ç–Ω—ã–º –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–æ–º –∫—ã—Ä–≥—ã–∑—Å–∫–æ–≥–æ –≥–æ—Å...,0.233716,0.067557,0.206904,0.281416,0.008587
1849247698620787_1963498200529069,"–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –± –Ω—É—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞, –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –∫–∞—Ñ–µ–¥—Ä—ã –∫–∞...",0.111548,0.064464,0.175835,0.345976,0.006681
1849247698620787_1954940424718180,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ —à –µ—Å...,0.012222,0.677071,0.022640,0.006828,0.399103


In [24]:
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_clus = TfidfVectorizer(stop_words=[lemmatizing(word) for word in stopwords.words('russian')], max_df=0.1, min_df=0.001, )
X_tfidf_clus = tfidf_clus.fit_transform(results.drop_duplicates('post_id').post_text.apply(lemmatizing)).toarray()
print(X_tfidf_clus.shape)

(358, 4731)


In [25]:
from sklearn.cluster import KMeans
from collections import Counter
cluster_count = 17
model = KMeans(n_clusters=cluster_count, random_state=0).fit(np.hstack([X_tfidf_clus, train_df.loc[:, ['negative', 'positive',
                                                             'skip', 'neutral', 'speech',]].values]))
model_prediction = model.predict(np.hstack([X_tfidf_clus, train_df.loc[:, ['negative', 'positive',
                                                             'skip', 'neutral', 'speech',]].values]))
print(Counter(model_prediction).most_common())

[(2, 48), (10, 34), (15, 31), (4, 26), (9, 26), (14, 20), (11, 20), (0, 19), (12, 19), (1, 18), (5, 16), (7, 16), (16, 16), (8, 13), (3, 13), (13, 12), (6, 11)]


In [15]:
train_df['cluster_id'] = model_prediction
# train_df.to_excel('cluster_posts.xlsx')
train_df

NameError: name 'model_prediction' is not defined

In [27]:
cluster_text = {}
for i in range(cluster_count):
    cluster_text[i] = ' '.join(train_df[train_df.cluster_id == i].post_text.values)

In [28]:
cluster_text.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])

In [29]:
tfidf_clus_top = TfidfVectorizer(stop_words=[lemmatizing(word) for word in stopwords.words('russian')])
tfidf_clus_top_arr = tfidf_clus_top.fit_transform([lemmatizing(clusses) for clusses in list(cluster_text.values())]).toarray()
vocab = tfidf_clus_top.vocabulary_
reverse_vocab = {v:k for k,v in vocab.items()}

feature_names = tfidf_clus_top.get_feature_names()

In [30]:
(-np.sort(-tfidf_clus_top_arr, axis=1)[:, :100]).shape

(17, 100)

In [34]:
pd.DataFrame([[reverse_vocab.get(item) for item in row] for row in tfidf_max10])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,–Ω–∞—É–∫–∞,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–ª—É—á—à–∏–π,–ø–µ–¥–∞–≥–æ–≥–∏–∫–∞,–∫–∞—Ñ–µ–¥—Ä–∞,–ø–æ–∑–¥—Ä–∞–≤–ª—è—Ç—å,–ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—å,–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ,–∂–µ–ª–∞—Ç—å,–∞–ª–º–∞–∂—ã–π,...,–∏–∑–æ–±—Ä–µ—Ç–∞—Ç–µ–ª—å–Ω—ã–π,—Ç–∞—É–±–∞–µ–≤–∏—á,–ø–æ–∑–¥—Ä–∞–≤–ª—è—Ç—ã–π,–∏–º–∞–Ω–±–∞–π,—Ä–∞–¥–∏–æ—Ç–µ—Ö–Ω–∏–∫,–∂–æ–ª–∞–º–∞–Ω,–º–µ—Ç–∞–ª–ª—É—Ä–≥–∏—á–µ—Å–∫–∏–π,–±–∞—Ç—ã—Ä,—Ä–∞–∑—Ä—è–¥,—ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∞
1,–ø–æ—ç—Ç,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–∞–π–±–µ—Ä–≥–µ–Ω–æ–≤,–≤–µ—á–µ—Ä,–Ω–∞—É–∫–∞,—Ç–∞–ª–∞–Ω—Ç,–∫–∞–∑–æ–π—Å—Ç–∞–Ω–æ,—Å—Ç—É–¥–µ–Ω—Ç,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,–ª–∞—É—Ä–µ–∞—Ç,...,—Å—Ç–∞—Ç—å,–æ–±–ª–∞—Å—Ç—å,—á–∞—Å,–æ—Ä–≥–∞–Ω–∏–∑–∞—Ç–æ—Ä,–≤–Ω–µ—Å—Ç–∏,–∫–æ–Ω–µ—Ü,–ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏–π,—Ä–∞–±–æ—Ç–∞,–º–æ–ª–æ–¥–æ–π,–±–∏–∂–∞—Ç—ã–π
2,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,–∫–æ–ª–ª–µ–¥–∂,—Å—Ç—É–¥–µ–Ω—Ç,–º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç,–∫–∞—Ñ–µ–¥—Ä–∞,–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ,–æ–±—Ä–∞–∑–æ–≤–∞—Ç–µ–ª—å–Ω—ã–π,–∞–∫—Ç–æ–π,–º–µ—Å—Ç–æ,...,—è–∑—ã–∫,–º–æ–ª–æ–¥–µ–∂—å,–ø—Ä–∏–µ–º–Ω–∞—è,—Ñ–∏–ª—å–º,–ø–∞—Ä—Ç–∏—è,–Ω–∞–∑–∞—Ä–±–∞–µ–≤–æ,—Å—Ç–∞—Ç—å—è,—Ä–∞–±–æ—Ç–∞—Ç—å,–æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è,–º–æ—Ä—Å–∫–æ–π
3,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–º–∞–Ω–≥–∞–≤—ã—Å—Ç–∞–≤—Å–∫–∏–π,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,—Å—Ç—É–¥–µ–Ω—Ç,—É—á–∞—Å—Ç–∏–µ,–Ω–µ–ø—Ä–µ—Ä—ã–≤–Ω—ã–π,–≤—Å—Ç—Ä–µ—á–∞,–º–µ—Ä–æ–ø—Ä–∏—è—Ç–∏–µ,–∫–∞–∑–æ–π—Å—Ç–∞–Ω–æ,–ø–∞—Ä—Ç–∏—è,...,—Ç–æ–∫–∞–π,–∫—Ä–∞–µ–≤–µ–¥,–º–∏—Ä–∂–∞–∫—ã–π,–∑–Ω–∞–∫–æ–º—ã–π,–ø–æ–¥–¥–µ—Ä–∂–∞—Ç—å,–±–ª—é–¥–æ,–∂–æ–º–∞—Ä—Ç—ã–π,–∞–π–≥—É–ª—å,–∞—Å—Å–∞–º–±–ª–µ—è,–ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å—Å—Ç–≤–æ
4,–ø–æ–∑–¥—Ä–∞–≤–ª—è—Ç—å,—Ç–≤–æ—Ä—á–µ—Å–∫–∏–π,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–∫–≤–µ—Å—Ç–æ,–∑–¥–æ—Ä–æ–≤—å–µ,–∫—Ä–µ–ø–∫–∏–π,–∂–µ–ª–∞—Ç—å,–Ω–∞—É–∫–∞,–¥–æ–ª–≥–æ–ª–µ—Ç–∏–µ,–º–æ–ª–æ–¥–æ–π,...,—ç–∫—Å–∫—É—Ä—Å–∏—è,–∂–∞–Ω—Ç–æ—Ä,–∞–ª–ª–∞–±–µ—Ä–≥–µ–Ω–Ω—ã–π,–∂–∞–Ω—Ç–æ—Ä–∏–Ω—ã–π,–µ—Ä–∂–∞–Ω–æ–≤,–∞–∫—Ç–µ—Ä,–∫—É–∑–µ–º–±–∞–µ–≤–Ω—ã–π,—Ä–∏—Å–∫,–∫–æ–Ω–∞—Ä–±–∞–∏,–ø–∞–∂–∞—Ä–±–µ–∫–æ–≤–∏—á
5,–ø–æ—ç—Ç,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,—Å—Ç—É–¥–µ–Ω—Ç,–º—É–∫–∞–≥–∞—Ç—å,–º–∞–∫–∞—Ç—å–µ–≤,–∫–æ–Ω–∫—É—Ä—Å,–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä,–ª–∏—Ç–µ—Ä–∞—Ç—É—Ä–∞,–≤—Å—Ç—Ä–µ—á–∞,—Ä–µ—Å–ø—É–±–ª–∏–∫–∞,...,–ø–µ—Ä–≤–æ–∫—É—Ä—Å–Ω–∏–∫,–ø—Ä–∏–Ω–∞–¥–ª–µ–∂–Ω–æ—Å—Ç—å,—Å–≤–æ–±–æ–¥–Ω–æ,–≤—Ç–æ—Ä–æ–π,—Ç–∞–∫–∂–µ,–ø—Ä–µ–º–∏—è,–¥–µ–ª–∞—Ç—å,–Ω–∞–≥—Ä–∞–¥–∏—Ç—å,–∫—É—Ä—Å,–±–∏–∑–Ω–µ—Å
6,–∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏—è,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–Ω–∞—É—á–Ω—ã–π,–ø—Ä–∏–∫–∞—Å–ø–∏–π—Å–∫–∏–π,–∫–æ–Ω–¥—ã–±—ã–π,–º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–π,–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,—á–æ—É–¥—Ö—É—Ä–∏–π,–¥–∞–±–∞–±—Ä–∞—Ç—ã–π,...,–º–∞–¥–∏–π,–¥–≤–æ–π–Ω–æ–π,–∫–æ–º–º—É–Ω–∏—Å—Ç–∏—á–µ—Å–∫–∏–π,—á–∂–∞–Ω,–∂—É–∑–±–µ—Ä–≥–µ–Ω,–æ–∫–µ–∞–Ω–æ–ª–æ–≥–∏—è,—ç—Ä–∫–µ–±—É–ª–∞–Ω–Ω—ã–π,–∞—É—ç–∑,–ø—Ä–æ—Ç–µ–∑,–∫–∞—Ä–∞–Ω–æ–≥–∞–π
7,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–ø—Ä–æ–≥—Ä–∞–º–º–∞,—Ñ—Ä–∞–Ω—Ü–∏–π,—Ä–µ—Å–ø—É–±–ª–∏–∫–∞,–æ–±–ª–∞—Å—Ç–Ω–æ–π,–∏—Ä–∞–Ω,–∏—Å–ª–∞–º—Å–∫–∏–π,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,—É—á–µ–±–Ω—ã–π,–º–∞–Ω–≥–∏—Å—Ç–∞—É—Å–∫–∏–π,...,–∂—É–º–∞,–∞–Ω–Ω–∞–≥–µ–ª—å–¥–∏–µ–≤–∏—á,–æ—Ä–∏–Ω–±–∞–µ–≤–∏—á,—Å–∞–ø–∞—Ä–º—É—Ä–∞—Ç,—Ñ–∏–ª–∏–ø–ø—ã–π,—Ä–æ—Å—Ç,–±—Ä–∏–¥–∂–∏–π,–ø–æ–∫–∞–∑,—É–Ω–∏–≤–µ—Ä—Å–∞–ª,–æ–≥—É–ª–≥–æ–∑–µ–ª—å
8,–Ω–µ–¥–µ–ª—è,–∫–æ–Ω–¥—ã–±—ã–π,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–∫—É–ª—å—Ç—É—Ä–∞,–º–∏—Ñ–æ–ª–æ–≥–∏—è,–ø—Ä–æ–≥—Ä–∞–º–º–∞,–±—É–¥—É—â–∏–π,—é—Ä–∏—Å—Ç,–æ–±–ª–∞—Å—Ç—å,–∞–∑–µ—Ä–±–∞–π–¥–∂–∞–Ω–Ω—ã–π,...,–≤–æ–∑—Ä–æ–∂–¥–µ–Ω–∏–µ,—Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç,—Ä–∞–∑–ª–∏—á–Ω—ã–π,–∞–ø—Ä–µ–ª—å,–¥–µ–Ω—å,–∫–æ—Ç–æ—Ä—ã–π,–∞–Ω–∞–ª–∏–∑,–Ω–∞–ª–æ–≥–æ–≤—ã–π,—É–≤–µ–ª–∏—á–∏—Ç—å,–∞–∑–µ—Ä–±–∞–π–¥–∂–∞–Ω—Å–∫–∏–π
9,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,–º–µ–º–æ—Ä–∞–Ω–¥—É–º,—Å–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤–æ,—Å—Ç—É–¥–µ–Ω—Ç,–ø–æ–¥–ø–∏—Å–∞—Ç—å,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,–ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω—ã–π,–∂—É—Ä–Ω–∞–ª,—Ä–∞–∑–≤–∏—Ç–∏–µ,—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è,...,—Ä–µ–∫—Ç–æ—Ä,–ø—Ä–æ–≥—Ä–∞–º–º–Ω—ã–π,–ø—Ä–æ–¥–≤–∏–∂–µ–Ω–∏–µ,—à–æ–ª–ø–∞–Ω,–∫–æ–Ω–∫—É—Ä–µ–Ω—Ç–æ—Å–ø–æ—Å–æ–±–Ω–æ—Å—Ç—å,–≤–∏–∑–∏—Ç,–±–∏–∑–Ω–µ—Å,—É—á–µ–±–Ω—ã–π,–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä,—Ä–µ—Å–ø—É–±–ª–∏–∫–∞


In [36]:
idx = (-tfidf_clus_top_arr).argsort(axis=1)

tfidf_max10 = idx[:, :100]
tfidf_weight = -np.sort(-tfidf_clus_top_arr, axis=1)[:, :100]

df_tfidf_c = pd.DataFrame([[reverse_vocab.get(item) for item in row] for row in tfidf_max10])
df_tfidf_c = df_tfidf_c.T
df_tfidf_c.index = ['top_' + str(i+1) for i in range(100)]
df_tfidf_c.columns = ['cluster_' + str(i+1) for i in range(cluster_count)]
df_tfidf_c = df_tfidf_c.T
df_tfidf_c[['weight_' + str(i+1) for i in range(100)]] = tfidf_weight
df_tfidf_c = df_tfidf_c[np.array([[df_tfidf_c.columns[i], df_tfidf_c.columns[i+100]] for i in range(100)]).reshape(100*2)]
df_tfidf_c

Unnamed: 0,top_1,weight_1,top_2,weight_2,top_3,weight_3,top_4,weight_4,top_5,weight_5,...,top_96,weight_96,top_97,weight_97,top_98,weight_98,top_99,weight_99,top_100,weight_100
cluster_1,–Ω–∞—É–∫–∞,0.217851,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.19844,–ª—É—á—à–∏–π,0.170866,–ø–µ–¥–∞–≥–æ–≥–∏–∫–∞,0.163248,–∫–∞—Ñ–µ–¥—Ä–∞,0.162094,...,–∂–æ–ª–∞–º–∞–Ω,0.048804,–º–µ—Ç–∞–ª–ª—É—Ä–≥–∏—á–µ—Å–∫–∏–π,0.048804,–±–∞—Ç—ã—Ä,0.048804,—Ä–∞–∑—Ä—è–¥,0.048804,—ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∞,0.048804
cluster_2,–ø–æ—ç—Ç,0.284566,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.215723,–∞–π–±–µ—Ä–≥–µ–Ω–æ–≤,0.197062,–≤–µ—á–µ—Ä,0.154644,–Ω–∞—É–∫–∞,0.143348,...,–∫–æ–Ω–µ—Ü,0.041743,–ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏–π,0.041743,—Ä–∞–±–æ—Ç–∞,0.041337,–º–æ–ª–æ–¥–æ–π,0.040846,–±–∏–∂–∞—Ç—ã–π,0.039412
cluster_3,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.352678,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,0.156746,–∫–æ–ª–ª–µ–¥–∂,0.145245,—Å—Ç—É–¥–µ–Ω—Ç,0.132254,–º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç,0.130804,...,–Ω–∞–∑–∞—Ä–±–∞–µ–≤–æ,0.044691,—Å—Ç–∞—Ç—å—è,0.044352,—Ä–∞–±–æ—Ç–∞—Ç—å,0.043864,–æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è,0.043802,–º–æ—Ä—Å–∫–æ–π,0.041468
cluster_4,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.366392,–º–∞–Ω–≥–∞–≤—ã—Å—Ç–∞–≤—Å–∫–∏–π,0.150184,–µ—Å–µ–Ω–æ–≤—Å–∫–∏–π,0.14092,—Å—Ç—É–¥–µ–Ω—Ç,0.122131,—É—á–∞—Å—Ç–∏–µ,0.122131,...,–±–ª—é–¥–æ,0.052455,–∂–æ–º–∞—Ä—Ç—ã–π,0.052455,–∞–π–≥—É–ª—å,0.052455,–∞—Å—Å–∞–º–±–ª–µ—è,0.051039,–ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å—Å—Ç–≤–æ,0.051039
cluster_5,–ø–æ–∑–¥—Ä–∞–≤–ª—è—Ç—å,0.227659,—Ç–≤–æ—Ä—á–µ—Å–∫–∏–π,0.218179,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.192375,–∫–≤–µ—Å—Ç–æ,0.175733,–∑–¥–æ—Ä–æ–≤—å–µ,0.174493,...,–∞–∫—Ç–µ—Ä,0.043933,–∫—É–∑–µ–º–±–∞–µ–≤–Ω—ã–π,0.043933,—Ä–∏—Å–∫,0.043933,–∫–æ–Ω–∞—Ä–±–∞–∏,0.043933,–ø–∞–∂–∞—Ä–±–µ–∫–æ–≤–∏—á,0.043933
cluster_6,–ø–æ—ç—Ç,0.28132,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.275548,—Å—Ç—É–¥–µ–Ω—Ç,0.223417,–º—É–∫–∞–≥–∞—Ç—å,0.145536,–º–∞–∫–∞—Ç—å–µ–≤,0.142863,...,–ø—Ä–µ–º–∏—è,0.046887,–¥–µ–ª–∞—Ç—å,0.046887,–Ω–∞–≥—Ä–∞–¥–∏—Ç—å,0.046594,–∫—É—Ä—Å,0.046594,–±–∏–∑–Ω–µ—Å,0.046594
cluster_7,–∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏—è,0.407242,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.25306,–Ω–∞—É—á–Ω—ã–π,0.170918,–ø—Ä–∏–∫–∞—Å–ø–∏–π—Å–∫–∏–π,0.157323,–∫–æ–Ω–¥—ã–±—ã–π,0.156996,...,–æ–∫–µ–∞–Ω–æ–ª–æ–≥–∏—è,0.044949,—ç—Ä–∫–µ–±—É–ª–∞–Ω–Ω—ã–π,0.044949,–∞—É—ç–∑,0.044949,–ø—Ä–æ—Ç–µ–∑,0.044949,–∫–∞—Ä–∞–Ω–æ–≥–∞–π,0.044949
cluster_8,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.269076,–ø—Ä–æ–≥—Ä–∞–º–º–∞,0.189638,—Ñ—Ä–∞–Ω—Ü–∏–π,0.16728,—Ä–µ—Å–ø—É–±–ª–∏–∫–∞,0.156961,–æ–±–ª–∞—Å—Ç–Ω–æ–π,0.156174,...,—Ä–æ—Å—Ç,0.047794,–±—Ä–∏–¥–∂–∏–π,0.047794,–ø–æ–∫–∞–∑,0.047794,—É–Ω–∏–≤–µ—Ä—Å–∞–ª,0.047794,–æ–≥—É–ª–≥–æ–∑–µ–ª—å,0.047794
cluster_9,–Ω–µ–¥–µ–ª—è,0.213623,–∫–æ–Ω–¥—ã–±—ã–π,0.211702,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.189578,–∫—É–ª—å—Ç—É—Ä–∞,0.186512,–º–∏—Ñ–æ–ª–æ–≥–∏—è,0.181837,...,–∫–æ—Ç–æ—Ä—ã–π,0.050103,–∞–Ω–∞–ª–∏–∑,0.047472,–Ω–∞–ª–æ–≥–æ–≤—ã–π,0.047472,—É–≤–µ–ª–∏—á–∏—Ç—å,0.047472,–∞–∑–µ—Ä–±–∞–π–¥–∂–∞–Ω—Å–∫–∏–π,0.047472
cluster_10,—É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç,0.393698,–º–µ–º–æ—Ä–∞–Ω–¥—É–º,0.176475,—Å–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤–æ,0.161304,—Å—Ç—É–¥–µ–Ω—Ç,0.15592,–ø–æ–¥–ø–∏—Å–∞—Ç—å,0.155298,...,–≤–∏–∑–∏—Ç,0.039044,–±–∏–∑–Ω–µ—Å,0.039021,—É—á–µ–±–Ω—ã–π,0.039021,–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä,0.039021,—Ä–µ—Å–ø—É–±–ª–∏–∫–∞,0.03898


In [99]:
df_tfidf_c.to_excel('excel/facebook_cluster_top100weights.xlsx')

In [13]:
last_otchet = results.drop_duplicates('post_id')[['post_id', 'post_text', 'negative_prob', 'positive_prob']]
last_otchet = last_otchet.set_index('post_id')
last_otchet

Unnamed: 0_level_0,post_text,negative_prob,positive_prob
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,1.425975,14.807658
1849247698620787_2868147476730799,–∫—Ä—É–≥–ª—ã–π —Å—Ç–æ–ª –Ω–∞ —Ç–µ–º—É –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –∏ ...,0.393126,98.660312
1849247698620787_2867867713425442,–ø–æ–∑–¥—Ä–∞–≤–ª—è–µ—Ç –≤—Å–µ—Ö –≤–∞—Å —Å –Ω–∞—Å—Ç—É–ø–ª–µ–Ω–∏–µ–º —Å–≤—è—â–µ–Ω–Ω–æ–≥–æ...,1.947411,1.118466
1849247698620787_2867838203428393,–∞–ø—Ä–µ–ª—è –≤ –µ—Å–µ–Ω–æ–≤—Å–∫–æ–º —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –ø—Ä–æ—à–ª–∞ –≤—Å—Ç—Ä–µ—á...,1.147022,0.045922
1849247698620787_2863965843815629,–≤ –∞–ø—Ä–µ–ª–µ –≤ —Ñ–æ—Ä–º–∞—Ç–µ –æ—Ç–∫—Ä—ã—Ç–æ–≥–æ –¥–∏–∞–ª–æ–≥–∞ –ø—Ä–æ—à–ª–∞ –≤—Å...,6.711268,23.313871
...,...,...,...
1849247698620787_1963011817244374,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –∫–∞—Ñ–µ–¥—Ä–æ–π –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç–∞ —ç–∫–æ–Ω...,0.979794,6.218617
1849247698620787_1963533393858883,–≤—Å—Ç—Ä–µ—á–∞ —Å –ø–æ—á–µ—Ç–Ω—ã–º –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–æ–º –∫—ã—Ä–≥—ã–∑—Å–∫–æ–≥–æ –≥–æ—Å...,29.281153,8.463839
1849247698620787_1963498200529069,"–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –± –Ω—É—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞, –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –∫–∞—Ñ–µ–¥—Ä—ã –∫–∞...",28.386861,13.709288
1849247698620787_1954940424718180,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ —à –µ—Å...,1.978288,29.389538


In [14]:
last_otchet['cluster_label'] = train_df.cluster_id
last_otchet['top20keywords'] = df_tfidf.T.apply(lambda x: ', '.join(x))
last_otchet['cluster_top100keywords'] = last_otchet.cluster_label.apply(lambda x: df_tfidf_c.T.apply(lambda x: ', '.join(x))[x])
last_otchet

NameError: name 'train_df' is not defined

In [31]:
pd.read_excel('excel/facebook_dates_likes_comments.xlsx')

Unnamed: 0,post_id,datetime,likes,comments,shares
0,1849247698620787_1085067278520596,,0,0,
1,1849247698620787_1951614865050736,26.09.2017,16,2,
2,1849247698620787_1954940424718180,04.10.2017,34,4,4
3,1849247698620787_1963011817244374,,0,0,
4,1849247698620787_1963498200529069,,0,0,
...,...,...,...,...,...
353,2031298790415676_2031370527075169,,0,0,
354,2037357129809842_2037370836475138,,0,0,
355,2041496359395919_2042129615999260,,0,0,
356,2045791372299751_2046020808943474,,0,0,


In [39]:
last_otchet = pd.read_excel('excel/facebook_media.xlsx')
last_otchet['Datetime'] = pd.read_excel('excel/facebook_dates_likes_comments.xlsx').datetime
last_otchet['likes'] = pd.read_excel('excel/facebook_dates_likes_comments.xlsx').likes
last_otchet['comments'] = pd.read_excel('excel/facebook_dates_likes_comments.xlsx').comments
last_otchet['shares'] = pd.read_excel('excel/facebook_dates_likes_comments.xlsx').shares
last_otchet

Unnamed: 0.1,Unnamed: 0,post_id,post_text,negative_prob,positive_prob,cluster_label,top20keywords,cluster_top100keywords,Datetime,likes,comments,shares
0,0,1849247698620787_2868153750063505,–≥ –Ω –µ—Å–µ–Ω–æ–≤ –ø—Ä–æ–≤–µ–ª –æ–Ω–ª–∞–π–Ω –≤–∏–¥–µ–æ–ª–µ–∫—Ü–∏—é –ø–æ –∞–Ω—Ç–∏–∫–æ...,1.425975,14.807658,8,"–ø–æ–ª–Ω—ã–π, —Ö–∏–º–∏—á–µ—Å–∫–∏–π, –ø—Ä–∏—Å—É—Ç—Å—Ç–≤–æ–≤–∞—Ç—å, —É–∑–Ω–∞—Ç—å, –∏—Å...","—É–≤–µ–ª–∏—á–∏—Ç—å, –ø–µ—Ä—Å–∏–¥—Å–∫–∏–π, –∏—Å–ø–æ–ª–Ω–µ–Ω–∏–µ, –Ω–∞–ª–æ–≥–æ–≤—ã–π, ...",,0,0,
1,1,1849247698620787_2868147476730799,–∫—Ä—É–≥–ª—ã–π —Å—Ç–æ–ª –Ω–∞ —Ç–µ–º—É –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –∏ ...,0.393126,98.660312,14,"–∞—Ö–º–µ—Ç–∂–∞–Ω–æ–≤–∏—á, –∞—Ö–º–µ—Ç–∂–∞–Ω–æ–≤–∏—á—å, –∞—è—Ç–∂–∞–Ω–Ω—ã–π, –∞—è—Ç–∂–∞–Ω...","—è–≤–ª—è—é—Ç—å—Å—è, –¥–µ–π—Å—Ç–≤–µ–Ω–Ω—ã–π, –±–µ—Ä–∫—É—Ç, —Å–∞–º–æ–æ—Ü–µ–Ω–∫–∏–π, –æ...",26.09.2017,16,2,
2,2,1849247698620787_2867867713425442,–ø–æ–∑–¥—Ä–∞–≤–ª—è–µ—Ç –≤—Å–µ—Ö –≤–∞—Å —Å –Ω–∞—Å—Ç—É–ø–ª–µ–Ω–∏–µ–º —Å–≤—è—â–µ–Ω–Ω–æ–≥–æ...,1.947411,1.118466,2,"–∫–∞—Ä–∞–¥–∂–∞–Ω–±–∞, –∫–∞–∑–æ–π—Å—Ç–∞–Ω–æ, –ø—Ä–∏–Ω—è—Ç—å, –ø–æ–∑–¥—Ä–∞–≤–ª—è—Ç—å, ...","–∑–∞–Ω—è—Ç—å, –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è, —Ä–∞–±–æ—Ç–∞—Ç—å, —Å—Ç–∞—Ç—å—è, –ø—Ä–∏–µ–º–Ω–∞...",04.10.2017,34,4,4
3,3,1849247698620787_2867838203428393,–∞–ø—Ä–µ–ª—è –≤ –µ—Å–µ–Ω–æ–≤—Å–∫–æ–º —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –ø—Ä–æ—à–ª–∞ –≤—Å—Ç—Ä–µ—á...,1.147022,0.045922,4,"–≤—Å—Ç—Ä–µ—á–∞, —à—Ç–∞–±, –≤—Ç–æ—Ä–∂–µ–Ω–∏–µ, –≤–æ–ø—Ä–æ—Å, –ø–æ–ª—É—á–∏—Ç—å, –æ—Ä...","–∫–æ–Ω–∞—Ä–±–∞–∏, –∫—É–∑–µ–º–±–∞–µ–≤–Ω—ã–π, –∞–ª–ª–∞–±–µ—Ä–≥–µ–Ω–Ω—ã–π, —Å–æ–ª–Ω–µ—á–Ω...",,0,0,
4,4,1849247698620787_2863965843815629,–≤ –∞–ø—Ä–µ–ª–µ –≤ —Ñ–æ—Ä–º–∞—Ç–µ –æ—Ç–∫—Ä—ã—Ç–æ–≥–æ –¥–∏–∞–ª–æ–≥–∞ –ø—Ä–æ—à–ª–∞ –≤—Å...,6.711268,23.313871,1,"–æ—Ç–∫—Ä—ã—Ç—ã–π, –æ—Ç–≤–µ—Ç, —Å–ª—É—á–∞–π, –ø—Ä–æ—Ñ–µ—Å—Å–∏—è, –æ—Ä–≥–∞–Ω–∏–∑–∞—Ç–æ...","–ø–µ—Ä–µ–≤–æ–¥, –º–æ–ª–æ–¥–æ–π, —Ä–∞–±–æ—Ç–∞, –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏–π, –∫–æ–Ω–µ—Ü,...",,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
353,353,1849247698620787_1963011817244374,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –∫–∞—Ñ–µ–¥—Ä–æ–π –º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç–∞ —ç–∫–æ–Ω...,0.979794,6.218617,11,"—é–ø, –º–µ–Ω–¥–∞–ª–∏–µ, –Ω–∞—É—Ä—ã–∑–±—ã–π, —Ñ–∏–∑–∫—É–ª—å—Ç—É—Ä–∞, —Ñ–∞–∫—É–ª—å—Ç–µ...","–∑–≤–∞–Ω–∏–µ, –∂—É—Ä–Ω–∞–ª, –ø—Ä–æ–π—Ç–∏, —Å—Ç—Ä–∞–Ω–∞, –≥–æ–¥, –º–∞—Ä—Ç, –∫–∞–∑...",,0,0,
354,354,1849247698620787_1963533393858883,–≤—Å—Ç—Ä–µ—á–∞ —Å –ø–æ—á–µ—Ç–Ω—ã–º –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–æ–º –∫—ã—Ä–≥—ã–∑—Å–∫–æ–≥–æ –≥–æ—Å...,29.281153,8.463839,5,"–∫–≥–æ—Ç, —Å—Ç—É–¥–µ–Ω—Ç, –ª–∞—É—Ä–µ–∞—Ç, –∑–∞—Ç–µ–º, –ø–∏—Å–∞—Ç—å, —Å–æ–≤—Ä–µ–º–µ...","–∫—É—Ä—Å, –Ω–∞–≥—Ä–∞–¥–∏—Ç—å, –±–∏–∑–Ω–µ—Å, –¥–µ–ª–∞—Ç—å, –ø—Ä–µ–º–∏—è, —Ç–∞–∫–∂–µ...",,0,0,
355,355,1849247698620787_1963498200529069,"–ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –± –Ω—É—Ä–¥–∞—É–ª–µ—Ç–æ–≤–∞, –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä –∫–∞—Ñ–µ–¥—Ä—ã –∫–∞...",28.386861,13.709288,5,"–º–æ–∑–≥–æ–≤–æ–π, –∫–æ–Ω—Ü–µ–ø—Ç—É–∞–ª—å–Ω—ã–π, –ø—Ä–∞–≤–æ, –ª–µ–∫—Ü–∏—è, –±–∏–±–æ–π...","–∫—É—Ä—Å, –Ω–∞–≥—Ä–∞–¥–∏—Ç—å, –±–∏–∑–Ω–µ—Å, –¥–µ–ª–∞—Ç—å, –ø—Ä–µ–º–∏—è, —Ç–∞–∫–∂–µ...",,0,0,
356,356,1849247698620787_1954940424718180,–≤ –æ–∫—Ç—è–±—Ä–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ –≤ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–µ –∏–º–µ–Ω–∏ —à –µ—Å...,1.978288,29.389538,3,"–¥–æ–±–∏–≤—à–∏–π—Å—è, –ø—Ä–æ—Ñ—Å–æ—é–∑, —Å–º—ã—Å–ª, –ø–æ—Å–≤—è—Ç–∏—Ç—å, —Å—Ç—É–¥–µ–Ω...","–∫–ª—É–±, –ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å—Å—Ç–≤–æ, –∞–π–≥—É–ª—å, –∫–æ—Ä–æ—Ç–∫–∏–π, –º...",,0,0,


In [40]:
last_otchet.to_excel('excel/facebook_media.xlsx')