In [1]:
import json
import warnings
warnings.filterwarnings("ignore")

from util import *
from scipy.sparse import vstack, csr_matrix, coo_matrix
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import load_model
from sklearn.utils import class_weight
from keras.layers import *
from keras.models import Model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard, CSVLogger

Using TensorFlow backend.


# Corpora for training the model

In [2]:
# Maps every category tag to the raw sources its training corpus is collected from.
# The download cell below treats the two value types differently:
#   * int -> negated and passed as "owner_id" to ParseClass.getallwall (presumably a VK community id — confirm)
#   * str -> passed to ParseClass.get_all_links as a start URL, the returned links are then fetched
# NOTE(review): 98643656 is listed under both 'education' and 'innovations', so the same source
# feeds two different labels — confirm this is intended.
sources = {'art': [60114472, 19191317], 
           'politics': [29534144, 23482909], 
           'finances': [62438886, 81354264],
           'strateg_management': ["http://www.stplan.ru/", "http://www.30n.ru/2/1.html"], 
           'law': [65974416, 79084019],
           'elaboration': ["https://vk.com/"], 
           'industry': [67183197, 66233468], 
           'education': [30558759, 98643656],
           'social_business': [23509868, 56821139],
           'public_health': [78860407, 61490488],
           'agriculture': ["http://www.nsh.ru/", "http://россельхоз.рф/"],
           'government_management': ["http://be5.biz/upravlenie/gosudarstvennoe_upravlenie.html", 97296142],
           'smm': [74686342, 79925455],
           'innovations': [98643656, 63337812],
           'safety': [37959220, 10933209],
           'military': ["http://www.soldiering.ru", "https://voennoe-delo.com"],
           'corporative_management': ["http://www.cfin.ru/management/"],
           'social_safety': [49582956, 72388807],
           'building': [30713157, 26978036],
           'entrepreneurship': [69560028, 73537456],
           'sport': [29809500, 128350290],
           'investitions': [37876217, 3800580]
          }

In [3]:
# Download the raw text corpus of every category that is not cached on disk yet.
# Integer sources are fetched with ParseClass.getallwall, string sources are crawled
# through ParseClass.get_all_links; every distinct non-empty text becomes one line of
# assets/corpora/<tag>.txt.
MAX_ITEMS_PER_SOURCE = 1000  # posts requested per wall / pages sampled per site
REQUEST_TIMEOUT = 30         # seconds; without a timeout one dead host can hang the whole run

for tag, ids_ in sources.items():
    path = f"assets/corpora/{tag}.txt"
    if os.path.exists(path):
        continue  # corpus already downloaded on a previous run

    seen = set()
    # Write to a temporary file and rename it at the very end: an interrupted run must not
    # leave a truncated corpus behind that later runs would mistake for a complete one.
    tmp_path = f"{path}.part"
    with open(tmp_path, "w") as f:
        for id_ in ids_:
            if isinstance(id_, int):
                wall = ParseClass.getallwall({"owner_id": -id_}, MAX_ITEMS_PER_SOURCE)
                for post in tqdm.tqdm(wall):
                    if len(post) and post not in seen:
                        seen.add(post)
                        f.write(f"{post}\n")
            elif isinstance(id_, str):
                all_links = list(ParseClass.get_all_links(id_))
                if not all_links:
                    # np.random.choice raises on an empty population
                    tqdm.tqdm.write(f"no links found for {id_}, skipping")
                    continue
                # Sample without replacement so the same page is never fetched twice.
                links = np.random.choice(all_links,
                                         min(MAX_ITEMS_PER_SOURCE, len(all_links)),
                                         replace=False)
                for link in tqdm.tqdm(links):
                    try:
                        page = requests.get(link, timeout=REQUEST_TIMEOUT).text
                        soup = BeautifulSoup(page, "lxml")
                        for text in soup.text.strip().split("\n"):
                            if len(text) and text not in seen:
                                seen.add(text)
                                f.write(f"{text}\n")
                    except Exception as exc:
                        # Still best-effort (one bad page must not abort the crawl),
                        # but report what was skipped instead of hiding it.
                        tqdm.tqdm.write(f"skipping {link}: {exc!r}")
    os.replace(tmp_path, path)

Manual processing

In [None]:
# Category tags in the insertion order of `sources`.
categories = [*sources]

# Building the model

In [None]:
# Feed every category file into CorporaClass, run its processing step, and persist the
# resulting vocabulary (JSON) and corpora (pickle) for the following cells.
corpora_class = CorporaClass()

for category in categories:
    with open(f"assets/corpora/{category}.txt") as f:
        corpora_class.add_to_corpora(f)
corpora_class.process_corpora()

# Context managers guarantee the artifacts are flushed and closed before they are re-read.
with open("assets/vocab.json", "w") as vocab_file:
    json.dump(list(corpora_class.vocab), vocab_file)
with open("assets/corpora.p", "wb") as corpora_file:
    pickle.dump(corpora_class.corpora, corpora_file)

In [None]:
# Reload the artifacts written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open("assets/corpora.p", "rb") as corpora_file:
    corpora = pickle.load(corpora_file)
with open("assets/vocab.json") as vocab_file:
    vocab = json.load(vocab_file)

In [None]:
# Turn every category corpus into a TF-IDF matrix over the shared vocabulary.
#
# The IDF weights are fitted ONCE over all documents. Calling fit_transform per category
# (as before) re-estimated IDF for each class separately, so features of different classes
# lived on different scales and the pickled vectorizer only remembered the last category.
vectorizer = TfidfVectorizer(vocabulary=vocab)

all_texts = []
for texts in corpora:
    try:
        all_texts.extend(text for text in texts if isinstance(text, str))
    except TypeError:
        # not an iterable of documents; it is recorded as a failed corpus in the loop below
        continue
vectorizer.fit(all_texts)

vectors = []
for position, texts in enumerate(tqdm.tqdm(corpora)):
    try:
        vector = vectorizer.transform(texts)
    except Exception as exc:
        # Keep the positional placeholder the following cells check for, but say what failed.
        tqdm.tqdm.write(f"corpus #{position} could not be vectorized: {exc!r}")
        vector = []
    vectors.append(vector)

with open("assets/vectors.p", "wb") as vectors_file:
    pickle.dump(vectors, vectors_file)
with open("assets/vectorizer.p", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Reload the per-category TF-IDF matrices (failed categories are stored as an empty list).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open("assets/vectors.p", 'rb') as vectors_file:
    vectors = pickle.load(vectors_file)

In [None]:
DELIM = 1500                      # at most this many documents are taken from each category
NUM_OF_CLASSES = len(categories)  # one output unit per category tag (22 with the current `sources`)

# Failed categories are stored as `[]`, so take the feature count from the first real matrix
# instead of assuming that vectors[0] is one.
_vectorized = [item for item in vectors if hasattr(item, "shape")]
if not _vectorized:
    raise ValueError("none of the corpora could be vectorized")
vector_size = _vectorized[0].shape[1]

In [None]:
# Assemble the training set: the first DELIM rows of every category matrix form X, and a
# one-hot row whose hot index is the category's position in `vectors` forms y.
blocks = []
labels = []
for class_index, item in enumerate(vectors):
    # Categories whose vectorization failed are stored as `[]`; test for a real matrix
    # explicitly rather than comparing a sparse matrix with a list.
    if not hasattr(item, "shape"):
        continue
    block = item[:DELIM]
    blocks.append(block)
    labels.extend([class_index] * block.shape[0])

if not blocks:
    raise ValueError("no vectorized corpora available to build the training set")

# One-hot encode by picking rows of the identity matrix.
y = csr_matrix(np.eye(NUM_OF_CLASSES)[np.asarray(labels, dtype=int)])
# Stack all blocks in a single call; stacking inside a loop re-copies the growing matrix each time.
X = csr_matrix(vstack(blocks, dtype='float64'), dtype='float32')

In [None]:
# Feed-forward classifier over the TF-IDF features:
# input -> Dense(96, relu) -> Dense(18, relu) -> Dropout(0.15) -> Dense(64, sigmoid) -> NUM_OF_CLASSES outputs.
# NOTE(review): the output layer uses 'sigmoid' together with 'categorical_crossentropy' on
# one-hot targets; 'softmax' is the usual pairing. Left unchanged because consumers of the
# saved model may rely on the current score scale — confirm before switching.
input_vec = Input(shape=(vector_size,))

hidden_layers = (
    Dense(96, activation='relu'),
    Dense(18, activation='relu'),
    Dropout(0.15),
    Dense(64, activation='sigmoid'),
)
features = input_vec
for layer in hidden_layers:
    features = layer(features)
output = Dense(NUM_OF_CLASSES, activation='sigmoid')(features)

classifier = Model(input_vec, output)
classifier.compile(
    loss='categorical_crossentropy',
    optimizer='adadelta',
    metrics=['categorical_accuracy'],
)

In [None]:
# Keras is fed dense arrays here, so densify the sparse matrices before splitting.
X_ = X.toarray()
y_ = y.toarray()

# Fixed seed: without it every run trains and validates on a different partition,
# which makes results impossible to compare between runs.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.3, random_state=SPLIT_SEED)

In [None]:
# Optional callbacks, currently disabled. The monitored quantity must match the compiled
# metric ('categorical_accuracy'), which Keras logs as 'val_categorical_accuracy'.
# callbacks = [EarlyStopping(monitor='val_categorical_accuracy',
#                            patience=8,
#                            verbose=1,
#                            min_delta=1e-4,
#                            mode='max'),
#              ReduceLROnPlateau(monitor='val_categorical_accuracy',
#                                factor=0.1,
#                                patience=4,
#                                verbose=1,
#                                epsilon=1e-4,
#                                mode='max'),
#              ModelCheckpoint(monitor='val_categorical_accuracy',
#                              filepath='assets/vk_texts_classifier.h5',
#                              save_best_only=True,
#                              save_weights_only=False,
#                              mode='max'),
#              # TensorBoard(log_dir='logs'),
#              CSVLogger('assets/last_training_log.csv', separator=',', append=False)
#             ]

# classifier.load_weights('vk_texts_classifier.h5')

# Keras expects `class_weight` to be a {class_index: weight} mapping; the string 'balanced'
# is a scikit-learn spelling and is not a valid mapping. Compute the balanced weights
# explicitly from the training labels instead.
train_labels = y_train.argmax(axis=1)
present_classes = np.unique(train_labels)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=present_classes,
                                                     y=train_labels)
class_weights = {int(cls): float(weight) for cls, weight in zip(present_classes, balanced_weights)}

classifier.fit(X_train,
               y_train,
               validation_data=(X_val, y_val),
               batch_size=196,
               epochs=55,
               class_weight=class_weights,
#                callbacks=callbacks
              )

In [None]:
# Persist the trained classifier to the assets folder.
classifier.save("assets/vk_texts_classifier.h5")

# Result

In [None]:
# One-off preparation that produced the msgpack file read below (kept for reference, disabled):
# social = pd.read_excel("social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# social.set_index('id', inplace=True)
# social = social[social.vk.notnull()]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# df = social[social.vk.notnull()]
# df = df[['name', 'vk']]

# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# # screen_names have to be mapped to their vk_id, so they must be processed one at a time

# df['vk_id'] = df.vk.progress_apply(get_id)
# df.dropna(inplace=True)
# df.vk_id = df.vk_id.astype('int')
# df.to_msgpack("df.msg")

# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in 1.0; this cell
# needs an older pandas or a re-export of df.msg to another format.
df = pd.read_msgpack("assets/df.msg")
known = pd.read_csv("assets/known_users.csv")
# Left join: every row of `known` is kept and gets the vk_id of the `df` row whose index
# equals its 'Leader-ID'; rows without a match get NaN.
known = known.merge(df[['vk_id']], left_on='Leader-ID', right_index=True, how='left')

In [None]:
# Scorer whose get_result(user_vk, user_fb) is called for every known user in the next cell.
# ResultClass is not defined in this notebook — presumably provided by `from util import *`.
result_class = ResultClass()

In [None]:
# Score every known user and store one column per category in `known`.
offset = 0  # positional row to start from; the first `offset` rows are skipped
for index, row in tqdm.tqdm(known.iloc[offset:, :].iterrows(), total=len(known) - offset):
    user_vk = row['vk_id']
    # pd.isna also copes with None / object values, where np.isnan raises TypeError.
    if pd.isna(user_vk):
        user_vk = None
    user_fb = row['FB']
    try:
        verdict = result_class.get_result(user_vk, user_fb)
        for cat, value in verdict:
            known.loc[index, cat] = value
    except ValueError:
        # A ValueError while scoring this user results in a zero score for every category.
        for cat in categories:
            known.loc[index, cat] = 0

In [None]:
# Human-readable (Russian) label for every category tag, in the order of `sources`.
norm_names = dict(zip(
    sources.keys(),
    ["Искусство", "Политика", "Финансы", "Стратегическое управление", "Юриспруденция",
     "Исследования и разработки", "Промышленность", "Образование",
     "Социальное предпринимательство", "Здравоохранение", "Сельское хозяйство",
     "Государственное управление", "Реклама и маркетинг", "Инновации и модернизация",
     "Безопасность", "Военное дело", "Корпоративное управление", "Социальная защита",
     "Строительство", "Предпринимательство", "Спорт", "Инвестиции"],
))
norm_names_reversed = {label: tag for tag, label in norm_names.items()}

# Every column from position 4 onwards is treated as a category score column.
score_columns = list(known.columns[4:])

# Build the export structure row by row. Looking rows up again by name (as before) was
# quadratic and gave every user sharing a name the scores of the first such user.
norm_dict = {"values": []}
for _, row in known.iterrows():
    results = [{"name": norm_names[col], "value": float(row[col])} for col in score_columns]
    norm_dict['values'].append({"name": row["ФИ"], "results": results})

In [None]:
# Mean score per category, labelled with the human-readable category name.
# Built as a Series and left as the last expression so the notebook actually displays it;
# the previous loop evaluated each (label, mean) pair and threw it away.
pd.Series({norm_names[col]: known.loc[:, col].mean() for col in known.columns[4:]})

In [None]:
# Export the scored table and the nested per-user structure.
known.to_csv("assets/known.csv")
# Context manager guarantees the JSON is flushed and the handle closed.
with open("assets/temporary_result.json", "w") as result_file:
    json.dump(norm_dict, result_file)