In [2]:
import itertools
import json
import warnings
warnings.filterwarnings("ignore")

from util import *
from scipy.sparse import vstack, csr_matrix, coo_matrix
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
from sklearn.utils import class_weight
from keras.layers import *
from keras.models import Model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard, CSVLogger

# Corpora for training model

In [3]:
sources = {'art': [60114472, 19191317], 
           'politics': [29534144, 23482909], 
           'finances': [62438886, 81354264],
           'strateg_management': ["http://www.stplan.ru/", "http://www.30n.ru/2/1.html"], 
           'law': [65974416, 79084019],
           'elaboration': ["https://vk.com/"], 
           'industry': [67183197, 66233468], 
           'education': [30558759, 98643656],
#            'social_business': [23509868, 56821139],
           'charity': [],
           'public_health': [78860407, 61490488],
           'agriculture': ["http://www.nsh.ru/", "http://россельхоз.рф/"],
           'government_management': ["http://be5.biz/upravlenie/gosudarstvennoe_upravlenie.html", 97296142],
           'smm': [74686342, 79925455],
           'innovations': [98643656, 63337812],
           'safety': [37959220, 10933209],
           'military': ["http://www.soldiering.ru", "https://voennoe-delo.com"],
           'corporative_management': ["http://www.cfin.ru/management/"],
           'social_safety': [49582956, 72388807],
           'building': [30713157, 26978036],
           'entrepreneurship': [69560028, 73537456],
           'sport': [29809500, 128350290],
           'investitions': [37876217, 3800580]
          }

In [4]:
# getting data for corpora
for tag, ids_ in sources.items():
    path = f"assets/corpora/{tag}.txt"
    s = set()
    if not os.path.exists(path):
        with open(path, "w") as f:
            for id_ in ids_:
                if isinstance(id_, int):
                    wall = ParseClass.getallwall({"owner_id": -id_}, 1000)
                    for post in tqdm.tqdm(wall):
                        if len(post) and post not in s:
                            s.add(post)
                            _ = f.write(f"{post}\n")
                elif isinstance(id_, str):
                    links = np.random.choice(
                        list(ParseClass.get_all_links(id_)), 1000)
                    for link in tqdm.tqdm(links):
                        try:
                            page = requests.get(link).text
                            soup = BeautifulSoup(page, "lxml")
                            for text in soup.text.strip().split("\n"):
                                if len(text) and text not in s:
                                    s.add(text)
                                    _ = f.write(f"{text}\n")
                        except:
                            pass

In [None]:
r = ResultClass()
for user_vk in tqdm.tqdm(social.vk.dropna()):
    r.parse_vk(user_vk)
    r.texts = []

 25%|██▍       | 91/368 [1:31:56<3:37:00, 47.00s/it]  

Manual processing

In [None]:
labels = ["Искусство", "Политика", "Финансы", "Стратегическое управление", "Юриспруденция", "Исследования и разработки",
          "Промышленность", "Образование", "Благотворительность", "Здравоохранение", "Сельское хозяйство", 
          "Государственное управление", "Реклама и маркетинг", "Инновации и модернизация", "Безопасность", 
          "Военное дело", "Корпоративное управление", "Социальная защита", "Строительство", "Предпринимательство",
          "Спорт", "Инвестиции"]
norm_names = dict(zip(categories, labels))
norm_names_reversed = dict([(a[1], a[0]) for a in norm_names.items()])

In [None]:
# competencies = pd.read_csv("assets/competencies.csv")
# competencies = competencies.dropna()[['Id', 'Интересы']]
# social = pd.read_excel("assets/social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# accepted = social.fb.dropna().index | social.vk.dropna().index
# social = social.loc[accepted, ['name', 'id', 'vk', 'fb']]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("id=")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("?ref=")[0])
# social.fb = social.fb.replace('nan', np.nan)
# social = social.merge(df[['vk', 'vk_id']], how='outer', on='vk')
# social.vk = social.vk_id.fillna(0)
# social = social.replace(0, np.nan).drop(labels=['vk_id'], axis=1)
# social.set_index('id', inplace=True)
# social = social.merge(how='inner', right=competencies, left_index=True, right_on='Id')
# to_exclude = pd.read_csv("assets/known_users.csv")['Leader-ID'].dropna().astype('int').values
# social = social.loc[~social.Id.isin(to_exclude), :]
# social.to_msgpack("assets/social.msg")

# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# Нужно замапить screen_name с их vk_id, поэтому придется обрабатывать по одному

social = pd.read_msgpack("assets/social.msg")

In [None]:
label_encoder = LabelEncoder()
labels_trunc = ["Искусство", "Политика", "Финансы", "Стратегическое управление", "Юриспруденция", "Исследования и разработки",
          "Промышленность", "Образование", "Здравоохранение", "Сельское хозяйство", 
          "Государственное управление", "Реклама и маркетинг", "Инновации и модернизация", "Безопасность", 
          "Военное дело", "Корпоративное управление", "Социальная защита", "Строительство", "Предпринимательство",
          "Инвестиции"]
int_labels = label_encoder.fit_transform(labels_trunc)

def f(x):
    b = []
    for a in x.split(","):
        t = a.strip()
        if t == "Частный бизнес" or t == "Социальное предпринимательство":
            b.append("Предпринимательство")
        elif t == "Дошкольное образование/детский отдых":
            b.append("Образование")
        elif t in {'Журналистика', 'Управление персоналом', 'Управление рисками'}:
            pass
        else:
            b.append(t)
    return b[:6]
social.Интересы = social.Интересы.apply(f)
def g(x):
    if not x:
        return np.in1d(int_labels, []).astype('int')
    return np.in1d(int_labels, label_encoder.transform(x)).astype('int')
social.y = social.Интересы.apply(g)

In [None]:
all_competencies = []
for nabor in social.Интересы:
    all_competencies.extend(nabor)

In [None]:
pd.Series(all_competencies).value_counts()

In [5]:
categories = list(sources.keys())

# Making model

In [6]:
corpora_class = CorporaClass()

for i, row in social.iterrows():
    r = ResultClass()
    user_vk, user_fb = row.vk, row.fb
    if user_vk:
        r.parse_vk(user_vk)
    if user_fb:
        r.parse_fb(user_fb)
    corpora_class.add_to_corpora(r.texts, str(i))

pickle.dump(corpora_class.corpora, open("assets/corpora.p", "wb"))

In [7]:
corpora = pickle.load(open("assets/corpora.p", "rb"))

In [8]:
corpora_lengths = []
for i, x in enumerate(corpora):
    corpora_lengths.append(len(x))

In [14]:
vectorizer = TfidfVectorizer(tokenizer=corpora_class.full_process, max_df=0.5, sublinear_tf=True, ngram_range=(1, 2))
docterm_matrix = vectorizer.fit_transform(list(itertools.chain(*(doc for doc in corpora))))

pickle.dump(docterm_matrix, open("assets/docterm_matrix.p", "wb"))
pickle.dump(vectorizer, open("assets/vectorizer.p", "wb"))

In [None]:
docterm_matrix = pickle.load(open("assets/docterm_matrix.p", 'rb'))

In [18]:
DELIM = 1300
NUM_OF_CLASSES = 22
vector_size = docterm_matrix[0].shape[1]

In [74]:
X = csr_matrix(np.empty(docterm_matrix.shape), dtype='float32')
y = csr_matrix(social.y)

ind_arr = np.cumsum(np.array(corpora_lengths))
for c, item in enumerate(docterm_matrix):
    # get index with needed elements
    i = np.where(ind_arr > c)[0][0]
    num = min(item.shape[0], DELIM)
#     for _ in range(num):
#         y_ = np.zeros(NUM_OF_CLASSES)
#         y_[i] = 1
#     y[c] = y_
#     y[c] = i
    X[c] = item[:num]

In [75]:
X = X.toarray()
y = y.toarray()

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [72]:
input_vec = Input(shape=(vector_size,))
l1 = Dense(96, activation='relu')(input_vec)
l2 = Dense(18, activation='relu')(l1)
l3 = Dropout(0.15)(l2)
l4 = Dense(64, activation='sigmoid')(l3)
output = Dense(NUM_OF_CLASSES, activation='sigmoid')(l4)

classifier = Model(input_vec, output)
classifier.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [73]:
# callbacks = [EarlyStopping(monitor='val_acc',
#                            patience=8,
#                            verbose=1,
#                            min_delta=1e-4,
#                            mode='max'),
#              ReduceLROnPlateau(monitor='val_acc',
#                                factor=0.1,
#                                patience=4,
#                                verbose=1,
#                                epsilon=1e-4,
#                                mode='max'),
#              ModelCheckpoint(monitor='val_acc',
#                              filepath='assets/vk_texts_classifier.h5',
#                              save_best_only=True,
#                              save_weights_only=False,
#                              mode='max'),
#              # TensorBoard(log_dir='logs'),
#              CSVLogger('assets/last_training_log.csv', separator=',', append=False)
#             ]

# classifier.load_weights('vk_texts_classifier.h5')
classifier.fit(X_train, 
               y_train, 
               validation_data=(X_test, y_test), 
               batch_size=196, 
               epochs=75, 
               class_weight='balanced'
#                callbacks=callbacks
              )

Train on 17233 samples, validate on 4309 samples
Epoch 1/75


InvalidArgumentError: Received a label value of 21 which is outside the valid range of [0, 1).  Label values: 4 6 12 19 5 18 12 5 20 12 12 8 19 10 20 12 12 12 17 14 10 12 12 12 20 13 8 13 5 19 4 18 10 20 10 7 19 1 14 19 19 10 8 10 2 8 12 18 19 2 14 1 17 8 17 19 20 12 20 18 12 19 10 18 18 12 4 17 17 5 12 6 6 17 4 12 3 12 19 12 8 18 3 17 19 17 12 12 6 12 20 17 7 10 16 4 4 10 16 17 12 10 10 6 12 19 13 2 12 16 12 13 17 6 15 5 12 10 7 20 7 12 10 0 18 8 17 19 12 5 16 12 20 10 5 8 18 12 9 11 6 4 17 1 1 6 21 18 20 17 4 19 8 17 12 12 12 20 5 12 12 3 12 1 7 8 19 20 1 12 6 19 21 3 18 12 10 19 13 8 12 11 13 10 7 4 12 8 12 7 18 2 8 11 8 17
	 [[Node: SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT64, _device="/job:localhost/replica:0/task:0/cpu:0"](Reshape_1, Cast_4)]]

Caused by op 'SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits', defined at:
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2808, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-72-7f12b39e75e7>", line 9, in <module>
    classifier.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy', metrics=['categorical_accuracy'])
  File "/usr/local/lib/python3.6/site-packages/keras/engine/training.py", line 840, in compile
    sample_weight, mask)
  File "/usr/local/lib/python3.6/site-packages/keras/engine/training.py", line 446, in weighted
    score_array = fn(y_true, y_pred)
  File "/usr/local/lib/python3.6/site-packages/keras/losses.py", line 53, in sparse_categorical_crossentropy
    return K.sparse_categorical_crossentropy(y_pred, y_true)
  File "/usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2782, in sparse_categorical_crossentropy
    logits=logits)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 1690, in sparse_softmax_cross_entropy_with_logits
    precise_logits, labels, name=name)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 2486, in _sparse_softmax_cross_entropy_with_logits
    features=features, labels=labels, name=name)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Received a label value of 21 which is outside the valid range of [0, 1).  Label values: 4 6 12 19 5 18 12 5 20 12 12 8 19 10 20 12 12 12 17 14 10 12 12 12 20 13 8 13 5 19 4 18 10 20 10 7 19 1 14 19 19 10 8 10 2 8 12 18 19 2 14 1 17 8 17 19 20 12 20 18 12 19 10 18 18 12 4 17 17 5 12 6 6 17 4 12 3 12 19 12 8 18 3 17 19 17 12 12 6 12 20 17 7 10 16 4 4 10 16 17 12 10 10 6 12 19 13 2 12 16 12 13 17 6 15 5 12 10 7 20 7 12 10 0 18 8 17 19 12 5 16 12 20 10 5 8 18 12 9 11 6 4 17 1 1 6 21 18 20 17 4 19 8 17 12 12 12 20 5 12 12 3 12 1 7 8 19 20 1 12 6 19 21 3 18 12 10 19 13 8 12 11 13 10 7 4 12 8 12 7 18 2 8 11 8 17
	 [[Node: SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT64, _device="/job:localhost/replica:0/task:0/cpu:0"](Reshape_1, Cast_4)]]


In [15]:
classifier.save("assets/vk_texts_classifier.h5")

# Result

In [16]:
# social = pd.read_excel("social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# social.set_index('id', inplace=True)
# social = social[social.vk.notnull()]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# df = social[social.vk.notnull()]
# df = df[['name', 'vk']]

# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# # Нужно замапить screen_name с их vk_id, поэтому придется обрабатывать по одному

# df['vk_id'] = df.vk.progress_apply(get_id)
# df.dropna(inplace=True)
# df.vk_id = df.vk_id.astype('int')
# df.to_msgpack("df.msg")
df = pd.read_msgpack("assets/df.msg")
known = pd.read_csv("assets/known_users.csv")
known = known.merge(df[['vk_id']], left_on='Leader-ID', right_index=True, how='left')

In [17]:
result_class = ResultClass()

In [103]:
categories = json.load(open("assets/categories.json"))
classifier = load_model("assets/vk_texts_classifier.h5")
vectorizer = pickle.load(open("assets/vectorizer.p", "rb"))

with open("t.txt", "w") as f:
    for user_fb in known.FB:
        user_vk = None
        texts = []
        parse_class = ParseClass()
        if user_vk:
            texts.extend(parse_class.process_owner_vk(user_vk, owner_type='user'))
            public_ids = parse_class.get_publics(user_vk, 6)
            for public_id in public_ids:
                texts.extend(parse_class.process_owner_vk(public_id, owner_type='public', n_wall=2000))
        if user_fb:
            # texts.extend(parse_class.get_posts_fb(user_fb))
            texts.extend(parse_class.get_posts_fb_temp(user_fb))
        corpora_class = CorporaClass()
        corpora_class.add_to_corpora(texts)
        corpora_class.process_corpora()
        pred = categories[svm.predict(vectorizer.transform([" ".join(corpora_class.corpora[0])]).toarray())[0]]
        _ = f.write(f"{known.loc[known.FB == user_fb, 'ФИ']} – {pred}")
    #     verdict = normalize(np.sum(svm.predict(vectorizer.transform(corpora_class.corpora[0]).toarray()),
    #                                axis=0).reshape(1, -1))[0]
    #     list(zip(categories, verdict))

100%|██████████| 1/1 [00:00<00:00, 324.89it/s]
100%|██████████| 1/1 [00:00<00:00, 1964.55it/s]
100%|██████████| 1/1 [00:00<00:00, 1630.76it/s]
100%|██████████| 1/1 [00:00<00:00, 781.94it/s]
100%|██████████| 1/1 [00:00<00:00, 430.49it/s]
100%|██████████| 1/1 [00:00<00:00, 1848.53it/s]
100%|██████████| 1/1 [00:00<00:00, 553.41it/s]
100%|██████████| 1/1 [00:00<00:00, 705.28it/s]
100%|██████████| 1/1 [00:00<00:00, 3010.99it/s]
100%|██████████| 1/1 [00:00<00:00, 2857.16it/s]
100%|██████████| 1/1 [00:00<00:00, 2281.99it/s]
100%|██████████| 1/1 [00:00<00:00, 155.33it/s]
100%|██████████| 1/1 [00:00<00:00, 1901.32it/s]
100%|██████████| 1/1 [00:00<00:00, 22.87it/s]
100%|██████████| 1/1 [00:00<00:00, 1416.04it/s]
100%|██████████| 1/1 [00:00<00:00, 2309.64it/s]
100%|██████████| 1/1 [00:00<00:00, 1886.78it/s]
100%|██████████| 1/1 [00:00<00:00, 585.14it/s]
100%|██████████| 1/1 [00:00<00:00, 8473.34it/s]
100%|██████████| 1/1 [00:00<00:00, 549.42it/s]
100%|██████████| 1/1 [00:00<00:00, 5315.97it/s]
10

In [18]:
offset = 0
for index, row in tqdm.tqdm(known.iloc[offset:, :].iterrows(), total=len(known) - offset):
    user_vk = row['vk_id']
    if np.isnan(user_vk):
        user_vk = None
    user_fb = row['FB']
    try:
        verdict = result_class.get_result(user_vk, user_fb)
        for cat, value in verdict:
            known.loc[index, cat] = value
    except ValueError:
        for cat in categories:
            known.loc[index, cat] = 0

  0%|          | 0/44 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
  2%|▏         | 1/44 [00:02<01:55,  2.69s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
  5%|▍         | 2/44 [00:03<01:24,  2.01s/it]][A
  0%|          | 0/1 [00:00<?, ?it/s][A
  7%|▋         | 3/44 [00:03<01:00,  1.47s/it]][A
  0%|          | 0/1 [00:00<?, ?it/s][A
  9%|▉         | 4/44 [00:04<01:00,  1.52s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
 11%|█▏        | 5/44 [00:06<00:59,  1.51s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
 14%|█▎        | 6/44 [00:06<00:44,  1.18s/it]][A
  0%|          | 0/1 [00:00<?, ?it/s][A
 16%|█▌        | 7/44 [00:07<00:34,  1.06it/s]][A
  0%|          | 0/1 [00:00<?, ?it/s][A
 18%|█▊        | 8/44 [00:09<00:42,  1.18s/it][A
  0%|          | 0/1 [00:00<?, ?it/s][A
 20%|██        | 9/44 [00:09<00:32,  1.07it/s]][A
  0%|          | 0/1 [00:00<?, ?it/s][A
 23%|██▎       | 10/44 [00:09<00:27,  1.24it/s][A
  0%|          | 0/1 [00:00<?, ?it/s][A
 25%|

In [19]:
norm_dict = {"values": []}
for name in known.ФИ:
    results = []
    for col in known[known.ФИ == name].iloc[:, 4:].columns:
        results.append({"name": norm_names[col], "value": float(known.loc[known.ФИ == name, col].values[0])})
    norm_dict['values'].append({"name": name, "results": results})

In [20]:
for col in known.iloc[:, 4:].columns:
    norm_names[col], known.loc[:, col].mean()

('Искусство', 0.014755845)

('Политика', 0.026561856)

('Финансы', 0.039396241)

('Стратегическое управление', 0.039121598)

('Юриспруденция', 0.071112327)

('Исследования и разработки', 0.088504076)

('Промышленность', 0.29107782)

('Образование', 0.62175745)

('Благотворительность', 0.10370329)

('Здравоохранение', 0.052846637)

('Сельское хозяйство', 0.032095235)

('Государственное управление', 0.026598753)

('Реклама и маркетинг', 0.064131789)

('Инновации и модернизация', 0.16241147)

('Безопасность', 0.025394594)

('Военное дело', 0.017704837)

('Корпоративное управление', 0.1330671)

('Социальная защита', 0.12280205)

('Строительство', 0.060238123)

('Предпринимательство', 0.1269815)

('Спорт', 0.10918298)

('Инвестиции', 0.026866633)

In [54]:
for fb in known.loc[known['education'] < 0.1, 'FB']:
    with open(f"assets/corpora_cached_fb_users/{fb}.txt") as f:
        for line in f:
            print(line)
    print([a.values for a in known.loc[known['FB'] == fb, :].items()])
    print("End of file")

Открылись!!!! Приветствуем Точку кипения в Петрозаводске!



AttributeError: 'tuple' object has no attribute 'values'

In [21]:
known.to_csv("assets/known.csv")
json.dump(norm_dict, open("assets/temporary_result.json", "w"))

In [22]:
import vk_api
from config import VK_TOKEN

In [23]:
vk = vk_api.VkApi(token=VK_TOKEN)
vk = vk.get_api()

In [24]:
t = [a['text'] for a in vk.newsfeed.search(q="корпоративное управление", count=200)['items']]

In [25]:
with open('t.txt', 'w') as f:
    for line in t:
        _ = f.write(f'{line}\n')