In [1]:
import itertools
import sys
import json
import warnings
warnings.filterwarnings("ignore")

from util import *
from scipy.sparse import vstack, csr_matrix, coo_matrix
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
from sklearn.utils import class_weight
from keras.layers import *
from keras.models import Model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard, CSVLogger

Using TensorFlow backend.


# Corpora for training the model

In [2]:
# Raw text sources per category; the keys are later used as the category names.
# Integer entries are handed to ParseClass.getallwall as a negated ``owner_id`` by the
# download cell below; string entries are handed to ParseClass.get_all_links and the
# links it returns are fetched over HTTP.
# NOTE(review): 98643656 is listed under both 'education' and 'innovations', and
# 'charity' has no sources, so the download cell creates an empty file for it
# when that file does not exist yet — confirm both are intended.
sources = {'art': [60114472, 19191317], 
           'politics': [29534144, 23482909], 
           'finances': [62438886, 81354264],
           'strateg_management': ["http://www.stplan.ru/", "http://www.30n.ru/2/1.html"], 
           'law': [65974416, 79084019],
           'elaboration': ["https://vk.com/"], 
           'industry': [67183197, 66233468], 
           'education': [30558759, 98643656],
#            'social_business': [23509868, 56821139],
           'charity': [],
           'public_health': [78860407, 61490488],
           'agriculture': ["http://www.nsh.ru/", "http://россельхоз.рф/"],
           'government_management': ["http://be5.biz/upravlenie/gosudarstvennoe_upravlenie.html", 97296142],
           'smm': [74686342, 79925455],
           'innovations': [98643656, 63337812],
           'safety': [37959220, 10933209],
           'military': ["http://www.soldiering.ru", "https://voennoe-delo.com"],
           'corporative_management': ["http://www.cfin.ru/management/"],
           'social_safety': [49582956, 72388807],
           'building': [30713157, 26978036],
           'entrepreneurship': [69560028, 73537456],
           'sport': [29809500, 128350290],
           'investitions': [37876217, 3800580]
          }

In [3]:
# getting data for corpora
# For every category, build assets/corpora/<tag>.txt once; an existing file is left untouched.
for tag, ids_ in sources.items():
    path = f"assets/corpora/{tag}.txt"
    s = set()  # lines already written for this category, used for de-duplication
    if not os.path.exists(path):
        with open(path, "w") as f:
            for id_ in ids_:
                if isinstance(id_, int):
                    # Integer source: pull posts through ParseClass.getallwall.
                    wall = ParseClass.getallwall({"owner_id": -id_}, 1000)
                    for post in tqdm.tqdm(wall):
                        if len(post) and post not in s:
                            s.add(post)
                            _ = f.write(f"{post}\n")
                elif isinstance(id_, str):
                    # String source: crawl up to 1000 distinct links found from the URL.
                    all_links = list(ParseClass.get_all_links(id_))
                    if not all_links:
                        # np.random.choice raises on an empty population.
                        continue
                    # Sample without replacement so no page is downloaded twice.
                    links = np.random.choice(
                        all_links, min(1000, len(all_links)), replace=False)
                    for link in tqdm.tqdm(links):
                        try:
                            # A timeout keeps one unresponsive host from stalling the crawl.
                            page = requests.get(link, timeout=30).text
                            soup = BeautifulSoup(page, "lxml")
                            for text in soup.text.strip().split("\n"):
                                if len(text) and text not in s:
                                    s.add(text)
                                    _ = f.write(f"{text}\n")
                        except Exception as e:
                            # Best-effort crawl: report and skip the page, but do not
                            # swallow KeyboardInterrupt/SystemExit like a bare except.
                            print(f"Skipped {link}: {e!r}")

Manual processing

In [4]:
categories = list(sources.keys())

In [5]:
# Human-readable (Russian) label for every category, in the same order as `categories`.
labels = [
    "Искусство",
    "Политика",
    "Финансы",
    "Стратегическое управление",
    "Юриспруденция",
    "Исследования и разработки",
    "Промышленность",
    "Образование",
    "Благотворительность",
    "Здравоохранение",
    "Сельское хозяйство",
    "Государственное управление",
    "Реклама и маркетинг",
    "Инновации и модернизация",
    "Безопасность",
    "Военное дело",
    "Корпоративное управление",
    "Социальная защита",
    "Строительство",
    "Предпринимательство",
    "Спорт",
    "Инвестиции",
]
# category key -> display label
norm_names = {category: label for category, label in zip(categories, labels)}
# display label -> category key
norm_names_reversed = {label: category for category, label in norm_names.items()}

In [6]:
# df = pd.read_msgpack("assets/df.msg")
# competencies = pd.read_csv("assets/competencies.csv")
# competencies = competencies.dropna()[['Id', 'Интересы']]
# social = pd.read_excel("assets/social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# accepted = social.fb.dropna().index | social.vk.dropna().index
# social = social.loc[accepted, ['name', 'id', 'vk', 'fb']]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("id=")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("?ref=")[0])
# social.fb = social.fb.replace('nan', np.nan)
# social = social.merge(df[['vk', 'vk_id']], how='outer', on='vk')
# social.vk = social.vk_id.fillna(0)
# social = social.replace(0, np.nan).drop(labels=['vk_id'], axis=1)
# social = social.set_index('id').merge(how='inner', right=competencies.set_index('Id'), left_index=True, right_index=True)
# to_exclude = pd.read_csv("assets/known_users.csv")['Leader-ID'].dropna().astype('int').values
# social = social.loc[~social.index.isin(to_exclude), :]
# social.to_msgpack("assets/social.msg")

# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# The screen_names have to be mapped to their vk_id, so they must be processed one at a time.

# Load the cached user table (presumably the frame the commented-out pipeline above
# saved with social.to_msgpack — verify).
# NOTE(review): pandas removed read_msgpack in 1.0, so this cell needs an older pandas.
social = pd.read_msgpack("assets/social.msg")

In [7]:
# Encoder that maps the display labels to integer codes.
label_encoder = LabelEncoder()
# labels_trunc = ["Искусство", "Политика", "Финансы", "Стратегическое управление", "Юриспруденция", "Исследования и разработки",
#           "Промышленность", "Образование", "Здравоохранение", "Сельское хозяйство", 
#           "Государственное управление", "Реклама и маркетинг", "Инновации и модернизация", "Безопасность", 
#           "Военное дело", "Корпоративное управление", "Социальная защита", "Строительство", "Предпринимательство",
#           "Инвестиции"]
# The full label list is used; the shorter list above is kept only as a disabled alternative.
# NOTE(review): `labels` is rebound to a pickle payload in a later cell, so re-running
# this cell after that one fits the encoder on different values.
labels_trunc = labels
# int_labels[j] is the integer code assigned to labels_trunc[j].
int_labels = label_encoder.fit_transform(labels_trunc)

def f(x):
    """Split a comma-separated interests string into at most six normalised labels.

    Each item is stripped of surrounding whitespace. A few labels are renamed to
    the category they are merged into, three labels are dropped entirely, and
    everything else is kept unchanged. Only the first six surviving items are
    returned.
    """
    merged_into = {
        "Частный бизнес": "Предпринимательство",
        "Социальное предпринимательство": "Предпринимательство",
        "Дошкольное образование/детский отдых": "Образование",
    }
    ignored = {'Журналистика', 'Управление персоналом', 'Управление рисками'}
    stripped = (piece.strip() for piece in x.split(","))
    kept = [merged_into.get(name, name) for name in stripped if name not in ignored]
    return kept[:6]
social.Интересы = social.Интересы.apply(f)
def g(x):
    """Return a 0/1 integer vector marking which entries of `int_labels` occur in `x`.

    `x` is a list of label strings. An empty or falsy `x` gives an all-zero vector
    without calling the encoder.
    """
    encoded = label_encoder.transform(x) if x else []
    return np.in1d(int_labels, encoded).astype('int')
social['y'] = social.Интересы.apply(g)

### Corpora from created texts

In [8]:
corpora_class = CorporaClass()

# Feed each category's text file into the corpus builder, tagged with the category key.
# The handle is not called `f`, so the helper function `f` defined earlier is not overwritten.
for filename in categories:
    with open(f"assets/corpora/{filename}.txt") as corpus_file:
        corpora_class.add_to_corpora(corpus_file, filename)

# Cache the processed corpora; `with` guarantees each file is flushed and closed
# before it is read back below.
with open("assets/corpora.p", "wb") as fh:
    pickle.dump(corpora_class.corpora, fh)
with open("assets/labels.p", "wb") as fh:
    pickle.dump(corpora_class.labels, fh)

with open("assets/corpora.p", "rb") as fh:
    corpora = pickle.load(fh)
# NOTE: this rebinds `labels`, which earlier held the list of display labels.
with open("assets/labels.p", "rb") as fh:
    labels = pickle.load(fh)

### Corpora from user-generated content

In [9]:
# Restore the previously parsed user-generated corpus from its pickle cache.
# Context managers close the files instead of leaving the handles to the garbage collector.
corpora_user_gen = CorporaClass()
with open("assets/labels_user_gen.p", "rb") as fh:
    corpora_user_gen.labels = pickle.load(fh)
with open("assets/corpora_user_gen.p", "rb") as fh:
    corpora_user_gen.corpora = pickle.load(fh)

# r = ResultClass()
# r.publics_dict = json.load(open("assets/publics_dict.json"))
# delimiter = 0
# for i, row in tqdm.tqdm(list(social.iterrows())[delimiter:], total=len(social) - delimiter):
#     user_vk, user_fb = row.vk, row.fb
#     if str(user_vk) != "nan":
#         try:
#             r.parse_vk(user_vk, parse=True)
#         except: pass
#     if str(user_fb) != "nan":
#         try:
#             r.parse_fb(user_fb)
#         except KeyError:
#             pass
#     corpora_user_gen.add_to_corpora(r.texts, i)
#     r.texts = []

# pickle.dump(corpora_user_gen.corpora, open("assets/corpora_user_gen.p", "wb"))
# pickle.dump(corpora_user_gen.labels, open("assets/labels_user_gen.p", "wb"))

In [11]:
# def filter_corpora(corpora, labels=corpora_user_gen.labels, DELIM=200, k=0.8):
#     indexes = np.array(list(range(len(corpora))))
#     all_indexes = set()
#     for col in norm_names.values():
#         t = social.Интересы.apply(lambda s: col in s)
#         col_labels = list(set(t[t == True].index).intersection(labels))[:50]
#         all_indexes = all_indexes.union(indexes[np.in1d(labels, col_labels)])
#     all_indexes = list(all_indexes)
#     new_corpora = list(np.array(corpora)[all_indexes])
#     new_labels = list(np.array(labels)[all_indexes])
#     new_corpora = [a[:DELIM] for a in new_corpora]
#     new_corpora, test_corpora, new_labels, test_labels = train_test_split(new_corpora, new_labels, train_size=k)
#     return new_corpora, test_corpora, new_labels, test_labels

# new_corpora, test_corpora, new_labels, test_labels = filter_corpora(corpora_user_gen.corpora, corpora_user_gen.labels)

# d = {}
# for col in norm_names.values():
#     t = social.Интересы.apply(lambda s: col in s[:3])
#     col_labels = list(set(t[t == True].index).intersection(new_labels))
#     d[col] = [label for label in col_labels if label not in list(itertools.chain(*(v for v in d.values())))][:10]
# d = {a: [int(c) for c in b] for (a, b) in d.items()}
# json.dump(d, open("assets/d.json", "w"))
    
# new_corpora = np.array(new_corpora)
# for col, labels in d.items():
#     with open(f"assets/corpora/{norm_names_reversed[col]}.txt", "a") as f:
#         for line in list(itertools.chain(*(item for item in new_corpora[np.in1d(new_labels, d[col])]))):
#             _ = f.write(line)

# These are the labels that were attributed to the corpora and must be accounted for in y
with open("assets/d.json") as fh:
    d = json.load(fh)

In [12]:
# TF-IDF over unigrams, tokenised by the project's own text pipeline.
# min_df / max_df are integers, i.e. absolute document counts (scikit-learn semantics).
vectorizer = TfidfVectorizer(tokenizer=corpora_class.full_process,
                             max_df=200,
                             min_df=5,
                             sublinear_tf=True,
                             ngram_range=(1, 1))
# `corpora` is a list of document lists; flatten it so every document becomes one row.
docterm_matrix = vectorizer.fit_transform(list(itertools.chain.from_iterable(corpora)))

# Cache both artefacts; `with` closes (and flushes) the files deterministically.
with open("assets/docterm_matrix.p", "wb") as fh:
    pickle.dump(docterm_matrix, fh)
with open("assets/vectorizer.p", "wb") as fh:
    pickle.dump(vectorizer, fh)

In [13]:
# Project the flattened user-generated documents into the fitted TF-IDF space.
# NOTE(review): `new_corpora` is only assigned inside commented-out code in the cell
# above, so this raises NameError on a fresh kernel unless that code is restored.
docterm_matrix_user_gen = vectorizer.transform(list(itertools.chain(*(doc for doc in new_corpora))))

In [14]:
# Reload the cached document-term matrix and fitted vectoriser, closing each file after reading.
with open("assets/docterm_matrix.p", "rb") as fh:
    docterm_matrix = pickle.load(fh)
with open("assets/vectorizer.p", "rb") as fh:
    vectorizer = pickle.load(fh)

In [15]:
# NOTE(review): DELIM does not appear to be used by any active code visible here.
DELIM = 1300
# One output unit per category. Derived from `categories` (22 entries at present)
# so it cannot drift if a source category is added or removed.
NUM_OF_CLASSES = len(categories)
# Width of a TF-IDF row, i.e. the vocabulary size.
vector_size = docterm_matrix.shape[1]

In [16]:
# Number of documents in each entry of `corpora`, in the same order.
corpora_lengths = [len(documents) for documents in corpora]

#### y for created texts

In [17]:
# Build one target row per document. Every document of category `i` gets the same
# row: a small soft component (0.016 times the summed interest vectors of the users
# listed in d for that category) with position `i` forced to 1.
y = []
for i, (item, cat) in enumerate(zip(corpora_lengths, categories)):
    if item == 0:
        # No documents for this category: nothing to emit, so skip the lookup too.
        continue
    # The row depends only on the category, so compute it once instead of
    # repeating the pandas lookup for every document.
    t = np.zeros(NUM_OF_CLASSES)
    for arr in (social.loc[d[norm_names[cat]], 'y'] * 0.016):
        t = t + arr
    y_ = np.zeros(NUM_OF_CLASSES) + t
    y_[i] = 1
    y.extend([y_] * item)
y = np.array(y)

#### y for user-generated content

In [52]:
# Expand the per-user target vectors to one row per user-generated document.
# NOTE(review): `new_labels` and `corpora_lengths_user_gen` are not assigned by any
# active code visible in this notebook — they must exist in the kernel already.
# Zero-initialised so rows not covered by any segment are well-defined (np.empty left garbage).
y_user_gen = np.zeros((docterm_matrix_user_gen.shape[0], NUM_OF_CLASSES))
y_ = np.array([np.array(a) for a in social.loc[new_labels, 'y']])
# ind_arr[k] is the exclusive end row of user k's documents; its start is the previous end.
ind_arr = np.cumsum(np.array(corpora_lengths_user_gen))
starts = np.concatenate(([0], ind_arr[:-1]))
# Iterate over every segment, including the last one, which was previously never filled.
for i, (int_a, int_b) in enumerate(zip(starts, ind_arr)):
    y_user_gen[int_a:int_b] = y_[i]

In [25]:
X = docterm_matrix.toarray()

In [26]:
# Hold out 20% of the documents for validation. The fixed seed keeps the split,
# and therefore the reported metrics, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Feed-forward classifier over TF-IDF vectors:
# Dense(128, relu) -> Dense(64, relu) -> Dropout(0.2) -> Dense(96, sigmoid) -> Dense(NUM_OF_CLASSES, sigmoid).
input_vec = Input(shape=(vector_size,))
l1 = Dense(128, activation='relu')(input_vec)
l2 = Dense(64, activation='relu')(l1)
l3 = Dropout(0.2)(l2)
l4 = Dense(96, activation='sigmoid')(l3)
# Each output unit has its own sigmoid, so the per-category scores are not forced to sum to 1.
output = Dense(NUM_OF_CLASSES, activation='sigmoid')(l4)

# Disabled alternative architecture with one extra hidden layer:
# input_vec = Input(shape=(vector_size,))
# l1 = Dense(128, activation='relu')(input_vec)
# l2 = Dense(64, activation='relu')(l1)
# l3 = Dropout(0.3)(l2)
# l4 = Dense(32, activation='relu')(l3)
# l5 = Dense(96, activation='sigmoid')(l4)
# output = Dense(NUM_OF_CLASSES, activation='sigmoid')(l5)

classifier = Model(input_vec, output)
# NOTE(review): sigmoid outputs are paired with categorical_crossentropy while the
# target rows built above can hold several non-zero entries; binary_crossentropy is
# the usual pairing for such targets — confirm this choice is intentional.
classifier.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [35]:
# callbacks = [EarlyStopping(monitor='val_acc',
#                            patience=8,
#                            verbose=1,
#                            min_delta=1e-4,
#                            mode='max'),
#              ReduceLROnPlateau(monitor='val_acc',
#                                factor=0.1,
#                                patience=4,
#                                verbose=1,
#                                epsilon=1e-4,
#                                mode='max'),
#              ModelCheckpoint(monitor='val_acc',
#                              filepath='assets/vk_texts_classifier.h5',
#                              save_best_only=True,
#                              save_weights_only=False,
#                              mode='max'),
#              # TensorBoard(log_dir='logs'),
#              CSVLogger('assets/last_training_log.csv', separator=',', append=False)
#             ]

# classifier.load_weights('vk_texts_classifier.h5')

# for using sparse vectors
def nn_batch_generator(X_data, y_data, batch_size):
    """Endlessly yield dense ``(X_batch, y_batch)`` mini-batches from a sparse matrix.

    Parameters
    ----------
    X_data : sparse matrix supporting row indexing and ``.todense()``.
    y_data : array indexable by an integer index array; its first dimension
        sets the range of row indices that are served.
    batch_size : int, number of rows per batch (the last batch of a pass may be smaller).

    Yields
    ------
    tuple of (numpy.ndarray, same type as ``y_data[...]``)
        Consecutive row slices; after the last slice the generator wraps
        around to the first row again.
    """
    samples_per_epoch = X_data.shape[0]
    # Ceiling division: a trailing partial batch counts as a batch, and an exact
    # multiple of batch_size does not produce an extra, empty batch.
    number_of_batches = -(-samples_per_epoch // batch_size)
    index = np.arange(np.shape(y_data)[0])
    counter = 0
    while 1:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X_data[index_batch, :].todense()
        y_batch = y_data[index_batch]
        counter += 1
        yield np.array(X_batch), y_batch
        if counter >= number_of_batches:
            # All rows served; start the next pass from the beginning.
            counter = 0

# Train on the dense TF-IDF matrix and validate on the held-out split.
# No class_weight is passed: Keras expects a dict mapping class index -> weight, and
# the string 'balanced' is a scikit-learn convention that Keras does not interpret.
classifier.fit(X_train,
               y_train,
               validation_data=(X_test, y_test),
               batch_size=196,
               epochs=60,
#                callbacks=callbacks
              )

Train on 33870 samples, validate on 8468 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60


Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x32cb60828>

In [36]:
classifier.save("assets/vk_texts_classifier.h5")

In [9]:
classifier = load_model("assets/vk_texts_classifier.h5")

# Result

In [10]:
norm_categories = np.array(list(norm_names.values()))

In [39]:
# Summed prediction vector of every evaluated user; averaged per category in a later cell.
dict_for_mean = []

labels_ = np.array(test_labels)
corpora_ = np.array(test_corpora)

# For each category: the share of held-out users with that interest for whom the
# category appears among the model's top-8 predictions.
acc_d = {}
for col in norm_names.values():
    has_interest = social.Интересы.apply(lambda s: col in s)
    interested_ids = set(has_interest[has_interest == True].index)
    col_labels = np.array(list(interested_ids.intersection(labels_)))
    # hits[0], hits[1], hits[2] count users whose top-2 / top-5 / top-8 contains `col`.
    cutoffs = (2, 5, 8)
    hits = [0, 0, 0]
    for item in corpora_[np.in1d(labels_, col_labels)]:
        # Sum the per-document scores into one score vector for the user.
        t = np.sum(classifier.predict(vectorizer.transform(item).toarray()), axis=0)
        dict_for_mean.append(t)
        pred_categories = norm_categories[t.argsort()[::-1][:8]]
        for slot, top_k in enumerate(cutoffs):
            if col in pred_categories[:top_k]:
                hits[slot] += 1
    # Guard against division by zero when no held-out user has this interest.
    l = len(col_labels) or 1
    col, l, hits[0] / l, hits[1] / l, hits[2] / l
    acc_d[col] = hits[2] / l
    
    
#     sorted(np.sum(classifier.predict(vectorizer.transform(corpora_user_gen.corpora[5]).toarray()), axis=0), reverse=True)[:3]

('Искусство', 9, 0.2222222222222222, 0.2222222222222222, 0.3333333333333333)

('Политика', 17, 0.23529411764705882, 0.23529411764705882, 0.29411764705882354)

('Финансы', 6, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333)

('Стратегическое управление',
 27,
 0.25925925925925924,
 0.6666666666666666,
 0.8888888888888888)

('Юриспруденция',
 6,
 0.3333333333333333,
 0.3333333333333333,
 0.3333333333333333)

('Исследования и разработки',
 35,
 0.11428571428571428,
 0.45714285714285713,
 0.8)

('Промышленность', 17, 0.0, 0.11764705882352941, 0.23529411764705882)

('Образование', 33, 0.3333333333333333, 0.7272727272727273, 0.9090909090909091)

('Благотворительность', 1, 0.0, 0.0, 0.0)

('Здравоохранение', 10, 0.4, 0.4, 0.4)

('Сельское хозяйство', 4, 0.5, 0.5, 0.5)

('Государственное управление',
 23,
 0.21739130434782608,
 0.30434782608695654,
 0.6086956521739131)

('Реклама и маркетинг',
 9,
 0.2222222222222222,
 0.3333333333333333,
 0.3333333333333333)

('Инновации и модернизация',
 29,
 0.10344827586206896,
 0.3448275862068966,
 0.9310344827586207)

('Безопасность', 9, 0.3333333333333333, 0.4444444444444444, 0.4444444444444444)

('Военное дело', 4, 0.0, 0.0, 0.25)

('Корпоративное управление',
 17,
 0.11764705882352941,
 0.4117647058823529,
 0.6470588235294118)

('Социальная защита',
 3,
 0.3333333333333333,
 0.3333333333333333,
 0.3333333333333333)

('Строительство', 8, 0.125, 0.125, 0.125)

('Предпринимательство', 36, 0.25, 0.6666666666666666, 0.9722222222222222)

('Спорт', 1, 0.0, 0.0, 0.0)

('Инвестиции',
 21,
 0.19047619047619047,
 0.19047619047619047,
 0.3333333333333333)

In [40]:
np.mean(list(acc_d.values()))

0.45481123580974059

In [41]:
list(zip(categories, [np.mean(a) for a in np.array([np.array(b) for b in dict_for_mean]).T]))

[('art', 0.11978178),
 ('politics', 0.2571713),
 ('finances', 0.15092145),
 ('strateg_management', 0.27437153),
 ('law', 0.14367713),
 ('elaboration', 0.2649323),
 ('industry', 0.14514737),
 ('education', 0.29252267),
 ('charity', 0.11408015),
 ('public_health', 0.16153345),
 ('agriculture', 0.21364743),
 ('government_management', 0.26083517),
 ('smm', 0.23419189),
 ('innovations', 0.23632175),
 ('safety', 0.15871054),
 ('military', 0.059539381),
 ('corporative_management', 0.25076938),
 ('social_safety', 0.12933674),
 ('building', 0.14101389),
 ('entrepreneurship', 0.26689848),
 ('sport', 0.21294023),
 ('investitions', 0.18971834)]

In [11]:
# social = pd.read_excel("social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# social.set_index('id', inplace=True)
# social = social[social.vk.notnull()]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# df = social[social.vk.notnull()]
# df = df[['name', 'vk']]

# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# # Нужно замапить screen_name с их vk_id, поэтому придется обрабатывать по одному

# df['vk_id'] = df.vk.progress_apply(get_id)
# df.dropna(inplace=True)
# df.vk_id = df.vk_id.astype('int')
# df.to_msgpack("df.msg")
# Load the cached table that carries a `vk_id` column (presumably the frame the
# commented-out code above saved with df.to_msgpack — verify).
df = pd.read_msgpack("assets/df.msg")
known = pd.read_csv("assets/known_users.csv")
# Left-join vk_id onto the known users by matching known['Leader-ID'] against df's index;
# users without a match keep a missing vk_id.
known = known.merge(df[['vk_id']], left_on='Leader-ID', right_index=True, how='left')

In [12]:
result_class = ResultClass()

In [13]:
# Score every known user and store one column per category in `known`.
# `offset` allows resuming from a given row position.
offset = 0
for index, row in tqdm.tqdm(known.iloc[offset:, :].iterrows(), total=len(known) - offset):
    # Missing ids arrive as NaN; the scorer is given None instead.
    user_vk = row['vk_id']
    if str(user_vk) == "nan":
        user_vk = None
    user_fb = row['FB']
    if str(user_fb) == "nan":
        user_fb = None
    try:
        verdict = result_class.get_result(user_vk, user_fb)
        for cat, value in verdict:
            known.loc[index, cat] = value
    except (ValueError, IndexError):
        # No usable result for this user: record a zero score for every category.
        for cat in categories:
            known.loc[index, cat] = 0
    finally:
        # Reset the text buffer on failure as well as on success, so texts from a
        # failed user cannot carry over into the next one (presumably get_result
        # accumulates into result_class.texts — verify).
        result_class.texts = []

  0%|          | 0/44 [00:00<?, ?it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


  7%|▋         | 3/44 [00:01<00:26,  1.57it/s]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.
Added to corpora
FB Parse completed.
Added to corpora


  9%|▉         | 4/44 [00:02<00:27,  1.45it/s]

Transformed corpora.


 11%|█▏        | 5/44 [00:03<00:30,  1.30it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


 14%|█▎        | 6/44 [00:03<00:23,  1.61it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


 16%|█▌        | 7/44 [00:03<00:18,  1.98it/s]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.

 18%|█▊        | 8/44 [00:04<00:25,  1.42it/s]


Added to corpora
Transformed corpora.


 20%|██        | 9/44 [00:05<00:19,  1.77it/s]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.

 23%|██▎       | 10/44 [00:05<00:17,  1.93it/s]


Added to corpora
Transformed corpora.


 25%|██▌       | 11/44 [00:05<00:14,  2.31it/s]

FB Parse completed.
Added to corpora
Transformed corpora.
VK Parse completed.
FB Parse completed.
Added to corpora

 27%|██▋       | 12/44 [00:07<00:24,  1.29it/s]


Transformed corpora.


 30%|██▉       | 13/44 [00:07<00:21,  1.42it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


 32%|███▏      | 14/44 [00:08<00:21,  1.40it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


 36%|███▋      | 16/44 [00:09<00:13,  2.15it/s]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.
Added to corpora


 39%|███▊      | 17/44 [00:09<00:12,  2.23it/s]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.

 41%|████      | 18/44 [00:10<00:12,  2.10it/s]


Added to corpora
Transformed corpora.


 43%|████▎     | 19/44 [00:10<00:10,  2.47it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


 45%|████▌     | 20/44 [00:12<00:20,  1.17it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


 48%|████▊     | 21/44 [00:12<00:15,  1.44it/s]

FB Parse completed.
Added to corpora
Transformed corpora.
1-th public have been parsed.
2-th public have been parsed.
3-th public have been parsed.
4-th public have been parsed.
5-th public have been parsed.
VK Parse completed.
FB Parse completed.
Added to corpora
Transformed corpora.


 52%|█████▏    | 23/44 [01:02<03:48, 10.86s/it]

FB Parse completed.
Added to corpora
Transformed corpora.


 55%|█████▍    | 24/44 [01:02<02:33,  7.70s/it]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.

 57%|█████▋    | 25/44 [01:02<01:44,  5.50s/it]


Added to corpora
Transformed corpora.
FB Parse completed.

 59%|█████▉    | 26/44 [01:03<01:10,  3.91s/it]


Added to corpora
Transformed corpora.
FB Parse completed.
Added to corpora
Transformed corpora.


 61%|██████▏   | 27/44 [01:05<01:00,  3.57s/it]

FB Parse completed.
Added to corpora

 64%|██████▎   | 28/44 [01:06<00:45,  2.83s/it]


Transformed corpora.
FB Parse completed.


 66%|██████▌   | 29/44 [01:08<00:34,  2.30s/it]

Added to corpora
Transformed corpora.
FB Parse completed.


 68%|██████▊   | 30/44 [01:09<00:26,  1.92s/it]

Added to corpora
Transformed corpora.


 70%|███████   | 31/44 [01:09<00:18,  1.40s/it]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.


 73%|███████▎  | 32/44 [01:10<00:15,  1.27s/it]

Added to corpora
Transformed corpora.


 75%|███████▌  | 33/44 [01:10<00:10,  1.02it/s]

FB Parse completed.
Added to corpora
Transformed corpora.


 77%|███████▋  | 34/44 [01:11<00:10,  1.01s/it]

FB Parse completed.
Added to corpora
Transformed corpora.


 80%|███████▉  | 35/44 [01:12<00:09,  1.01s/it]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.
Added to corpora

 82%|████████▏ | 36/44 [01:13<00:08,  1.01s/it]


Transformed corpora.
FB Parse completed.
Added to corpora


 84%|████████▍ | 37/44 [01:15<00:08,  1.26s/it]

Transformed corpora.


 86%|████████▋ | 38/44 [01:16<00:07,  1.17s/it]

FB Parse completed.
Added to corpora
Transformed corpora.
FB Parse completed.


 89%|████████▊ | 39/44 [01:17<00:05,  1.16s/it]

Added to corpora
Transformed corpora.
FB Parse completed.


 91%|█████████ | 40/44 [01:18<00:04,  1.24s/it]

Added to corpora
Transformed corpora.
FB Parse completed.
Added to corpora

 93%|█████████▎| 41/44 [01:20<00:03,  1.26s/it]


Transformed corpora.
FB Parse completed.


 95%|█████████▌| 42/44 [01:21<00:02,  1.24s/it]

Added to corpora
Transformed corpora.
FB Parse completed.
Added to corpora

 98%|█████████▊| 43/44 [01:22<00:01,  1.16s/it]


Transformed corpora.
1-th public have been parsed.
2-th public have been parsed.
3-th public have been parsed.
4-th public have been parsed.
VK Parse completed.
FB Parse completed.
Added to corpora
Transformed corpora.


100%|██████████| 44/44 [01:41<00:00,  6.58s/it]


In [14]:
# Build the JSON-serialisable result: for every user, a list of
# {"name": <display label>, "value": <score>} entries, one per score column.
norm_dict = {"values": []}
# Score columns are everything from position 4 onwards; computed once, not per user.
score_columns = known.iloc[:, 4:].columns
# Iterate over rows directly instead of re-filtering the frame by name for every cell:
# users who share a name each keep their own scores, and the work is linear in the rows.
for _row_index, row in known.iterrows():
    results = [{"name": norm_names[col], "value": float(row[col])} for col in score_columns]
    norm_dict['values'].append({"name": row['ФИ'], "results": results})

In [15]:
# Best and second-best category label for every known user (argsort computed once per row).
[(norm_categories[order[-1]], norm_categories[order[-2]])
 for order in (scores.argsort() for scores in known.loc[:, categories].values)]

[('Образование', 'Реклама и маркетинг'),
 ('Образование', 'Предпринимательство'),
 ('Инвестиции', 'Спорт'),
 ('Спорт', 'Благотворительность'),
 ('Образование', 'Реклама и маркетинг'),
 ('Исследования и разработки', 'Корпоративное управление'),
 ('Юриспруденция', 'Образование'),
 ('Образование', 'Исследования и разработки'),
 ('Промышленность', 'Образование'),
 ('Образование', 'Реклама и маркетинг'),
 ('Реклама и маркетинг', 'Стратегическое управление'),
 ('Спорт', 'Образование'),
 ('Образование', 'Предпринимательство'),
 ('Спорт', 'Предпринимательство'),
 ('Благотворительность', 'Политика'),
 ('Инвестиции', 'Спорт'),
 ('Образование', 'Корпоративное управление'),
 ('Корпоративное управление', 'Образование'),
 ('Предпринимательство', 'Социальная защита'),
 ('Реклама и маркетинг', 'Образование'),
 ('Благотворительность', 'Здравоохранение'),
 ('Спорт', 'Предпринимательство'),
 ('Реклама и маркетинг', 'Предпринимательство'),
 ('Спорт', 'Образование'),
 ('Предпринимательство', 'Образование')

In [18]:
# Mean score of every score column over the known users, in column order.
means = []
for col in known.iloc[:, 4:].columns:
    col_mean = known.loc[:, col].mean()
    norm_names[col], col_mean
    means.append(col_mean)

('Искусство', 0.12107375)

('Политика', 0.19178158)

('Финансы', 0.1489549)

('Стратегическое управление', 0.25865808)

('Юриспруденция', 0.16852756)

('Исследования и разработки', 0.24883546)

('Промышленность', 0.17513922)

('Образование', 0.29297134)

('Благотворительность', 0.15965591)

('Здравоохранение', 0.12450378)

('Сельское хозяйство', 0.12930495)

('Государственное управление', 0.19284435)

('Реклама и маркетинг', 0.25356916)

('Инновации и модернизация', 0.22492985)

('Безопасность', 0.099284083)

('Военное дело', 0.049133826)

('Корпоративное управление', 0.23961683)

('Социальная защита', 0.13141078)

('Строительство', 0.1177691)

('Предпринимательство', 0.26709604)

('Спорт', 0.1504022)

('Инвестиции', 0.14617349)

In [23]:
# Per-category margin: the average of (a) the mean summed score over the user-generated
# evaluation texts and (b) the mean score over the known users.
# (a) is recomputed here from `dict_for_mean` instead of being read from a variable `t`
# that no cell assigns in the required (category, mean) form — presumably it used to
# hold the per-category means displayed after the evaluation cell; verify.
user_gen_means = np.mean(np.array(dict_for_mean), axis=0)
means_2 = [float((a + b) / 2) for a, b in zip(user_gen_means, means)]

In [25]:
# Persist the per-category margins; the context manager flushes and closes the file.
with open("margins.json", "w") as fh:
    json.dump(dict(zip(categories, means_2)), fh)

In [27]:
# Save the scored user table and the JSON result; the context manager closes the JSON file.
known.to_csv("assets/known.csv")
with open("assets/temporary_result.json", "w") as fh:
    json.dump(norm_dict, fh)

In [51]:
tt = []
# Per-category margin, built once for all users.
margins = dict(zip(categories, means_2))
for i, item in known.iterrows():
    print(item['ФИ'], end=" ")
    # Keep the categories whose score exceeds the margin by more than 10%.
    accepted_cols = [col for col, margin in margins.items() if item[col] > 1.1 * margin]
    # Up to five accepted categories, highest score first.
    np.array(accepted_cols)[item[accepted_cols].argsort()[::-1]][:5]

Александрина Клюс 

array(['education', 'smm', 'entrepreneurship', 'strateg_management',
       'politics'],
      dtype='<U18')

Ангелина Зонова 

array(['education', 'entrepreneurship', 'smm', 'corporative_management',
       'building'],
      dtype='<U22')

Владислав Широков 

array([], dtype=float64)

Галина Жукова 

array(['sport'],
      dtype='<U5')

Жанна Кадылева 

array(['education', 'smm', 'entrepreneurship', 'law',
       'corporative_management'],
      dtype='<U22')

Ирина Горькова 

array(['elaboration', 'corporative_management', 'strateg_management',
       'education', 'innovations'],
      dtype='<U22')

Ирина Григоренко 

array(['law', 'education', 'government_management',
       'corporative_management', 'public_health'],
      dtype='<U22')

Наталья Кульбятская 

array(['education', 'elaboration', 'strateg_management',
       'entrepreneurship', 'corporative_management'],
      dtype='<U22')

Нелли Бадалян 

array(['industry', 'education', 'elaboration', 'corporative_management',
       'law'],
      dtype='<U22')

Юлия Ханьжина 

array(['education', 'smm', 'strateg_management', 'corporative_management',
       'finances'],
      dtype='<U22')

Alexey Khoryushin 

array(['smm', 'strateg_management', 'charity', 'art'],
      dtype='<U18')

Andrey Siling 

array(['sport', 'art', 'military'],
      dtype='<U8')

Denis  Trunov 

array(['education', 'entrepreneurship', 'elaboration', 'industry',
       'innovations'],
      dtype='<U16')

Denis Unzhakov 

array(['sport', 'charity', 'industry', 'military'],
      dtype='<U8')

Dima Blaginin 

array(['charity', 'politics', 'education', 'law', 'industry'],
      dtype='<U11')

Ekaterina Aksenova 

array([], dtype=float64)

Ekaterina Shakina 

array(['education', 'corporative_management', 'strateg_management',
       'elaboration', 'entrepreneurship'],
      dtype='<U22')

Eugene Colchev 

array(['corporative_management', 'education', 'strateg_management',
       'entrepreneurship', 'law'],
      dtype='<U22')

Eugene Kolganov 

array(['entrepreneurship', 'social_safety', 'elaboration', 'finances',
       'investitions'],
      dtype='<U16')

Evgeny Kovnir 

array(['smm', 'education', 'corporative_management', 'industry'],
      dtype='<U22')

George Boissonade 

array(['charity', 'public_health'],
      dtype='<U13')

Igor  Ruzhentsev 

array(['sport', 'charity'],
      dtype='<U7')

Irina Gordina-Nevmerzhitskaya 

array(['smm', 'entrepreneurship', 'strateg_management',
       'corporative_management', 'industry'],
      dtype='<U22')

Ivan  Aristov 

array(['sport'],
      dtype='<U5')

Katerina  Novikova 

array(['entrepreneurship', 'elaboration', 'sport', 'industry',
       'agriculture'],
      dtype='<U16')

Kirill Konev 

array(['safety', 'elaboration', 'innovations', 'building'],
      dtype='<U11')

Ksusha Andreeva 

array(['education', 'elaboration', 'entrepreneurship', 'smm', 'innovations'],
      dtype='<U16')

Liubov Kirienko 

array(['entrepreneurship', 'strateg_management', 'elaboration', 'politics',
       'innovations'],
      dtype='<U18')

Maria Dolgikh 

array(['politics', 'smm', 'industry', 'art'],
      dtype='<U8')

Nikita Lebedev 

array(['charity', 'smm', 'sport', 'social_safety'],
      dtype='<U13')

Oleg Podolskiy 

array(['building', 'social_safety', 'entrepreneurship', 'agriculture',
       'investitions'],
      dtype='<U16')

Olga Potemkina 

array(['smm', 'strateg_management', 'corporative_management', 'sport',
       'finances'],
      dtype='<U22')

Olya  Zaytseva 

array(['strateg_management', 'entrepreneurship', 'smm',
       'corporative_management', 'innovations'],
      dtype='<U22')

Ruslan Karmannyy 

array(['strateg_management', 'smm', 'entrepreneurship',
       'corporative_management', 'innovations'],
      dtype='<U22')

Sergey Nakvasin 

array(['education', 'entrepreneurship', 'elaboration',
       'corporative_management', 'strateg_management'],
      dtype='<U22')

Tatiana Anisimova 

array(['sport', 'strateg_management', 'entrepreneurship', 'smm', 'charity'],
      dtype='<U18')

Tatyana Mazhutis 

array(['charity', 'law', 'finances', 'social_safety'],
      dtype='<U13')

Tretyakov Vasily 

array(['education', 'elaboration', 'strateg_management',
       'entrepreneurship', 'smm'],
      dtype='<U22')

Varvara  Lukashina 

array(['education', 'entrepreneurship', 'charity', 'elaboration',
       'corporative_management'],
      dtype='<U22')

Yulia  Gudach 

array(['smm', 'education', 'politics', 'elaboration', 'industry'],
      dtype='<U11')

Yury Sushinov 

array(['smm', 'education', 'strateg_management', 'entrepreneurship',
       'industry'],
      dtype='<U18')

Песков Дмитрий 

array(['smm', 'education', 'strateg_management', 'entrepreneurship',
       'corporative_management'],
      dtype='<U22')

Гнитько Ксения 

array(['charity', 'education', 'law', 'smm', 'industry'],
      dtype='<U9')

Дмитрий Земцов 

array(['strateg_management', 'entrepreneurship', 'elaboration', 'smm',
       'innovations'],
      dtype='<U18')

In [None]:
import vk_api
from config import VK_TOKEN

In [None]:
# Open a token-authenticated session and keep only its API accessor.
vk = vk_api.VkApi(token=VK_TOKEN).get_api()

In [None]:
t = [a['text'] for a in vk.newsfeed.search(q="корпоративное управление", count=200)['items']]

In [None]:
# Dump the collected texts to t.txt, one entry per write, each followed by a newline.
with open('t.txt', 'w') as out_file:
    out_file.writelines(f'{line}\n' for line in t)