# Пайлайн обучения модели предсказания зарплатных ожиданий

В ноутбуке реализован процесс предобработки данных и обучения регрессионной модели для предсказания зарплатных ожиданий по данным вакансии.

------

## Импорт библиотек и загрузка данных

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from my_parsers import download, parse
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup

pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

import pymorphy2
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = pickle.load(open("stopwords.pkl", "rb"))

In [2]:
df = pd.read_csv('vprod_train/TRAIN_SAL.csv')
df = df[df["salary"] != 0]

In [3]:
# пайплайн предобработки данных и обучения занимает много времени. Для тестирования можем обрезать датасет

DEBUG_MODE = False
if DEBUG_MODE:
    df = df.iloc[:1000]

Удалим все коррелирующие с остальными или не несущими полезной информации признаки.

In [4]:
COLS_FOR_DROP = ['foreign_workers_capability', 'oknpo_code', 'regionNameTerm',
       'retraining_condition', 'contactList', 'company_name', "id", "change_time", "code_external_system",
        "company_code", "contact_person", "data_ids", "date_create", "date_modify", "deleted",
        "original_source_type", "publication_period", "published_date", "salary_min", "salary_max", 
        "vacancy_address_code", "vacancy_address", "visibility", "company", "company_inn", 
        "industryBranchName", "state_region_code", "vacancy_address_house", "metro_ids",
        "vacancy_address_additional_info", "is_moderated", "languageKnowledge"]

In [5]:
df.drop(COLS_FOR_DROP, axis=1, inplace=True)

## Предобработка данных

Удалим html-элементы из текстовых столбцов, в которых они присутствуют

In [6]:
def remove_html_tags(text):
    if pd.isna(text):
        return None
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


html_tag_cols = ["additional_requirements", "other_vacancy_benefit", "position_requirements", 
                 "position_responsibilities"]
for html_col in tqdm(html_tag_cols):
    df[html_col] = df[html_col].map(remove_html_tags)

100%|█████████████████████████████████████████████| 4/4 [03:51<00:00, 57.89s/it]


Предобработаем по отдельности каждый признак, учитывая его логику

In [7]:
# создадим конфиг с метаинформацией для будущей предобработки тест

PREPROC_CONFIG = {}

In [8]:
# заменим None на 0, а все остальные значения - на 1, т.к. мало уникальных значений

df.loc[~df["academic_degree"].isna(), "academic_degree"] = 1
df["academic_degree"] = df["academic_degree"].fillna(0)

In [9]:
# объединим похожие по смыслу значения в одни классы, сделаем из пропусков отдельный класс

df.loc[df["accommodation_type"].isin(["FLAT", "ROOM", "HOUSE"]), "accommodation_type"] = 2
df.loc[df["accommodation_type"] == "DORMITORY", "accommodation_type"] = 1
df.loc[df["accommodation_type"].isna(), "accommodation_type"] = 0

In [10]:
# т.к. переменная вещественного типа, заменим пропуски на -1, чтобы катбусту было легче отделить пропуски

df.loc[df["additional_premium"].isna(), "additional_premium"] = -1

In [11]:
# заменим пропуски на отдельные классы или подхдодящие по смыслу значения

df.loc[df["bonus_type"].isna(), "bonus_type"] = 0
df.loc[df["measure_type"].isna(), "measure_type"] = 0
df.loc[df["code_professional_sphere"].isna(), "code_professional_sphere"] = "unknown"
df.loc[df["contact_source"].isna(), "contact_source"] = "unknown"
df.loc[df["is_mobility_program"].isna(), "is_mobility_program"] = False
df.loc[df["need_medcard"].isna(), "need_medcard"] = "unknown"
df.loc[df["regionName"].isna(), "regionName"] = "unknown"
df.loc[df["required_experience"].isna(), "required_experience"] = -1
df.loc[df["retraining_capability"].isna(), "retraining_capability"] = "unknown"
df.loc[df["retraining_grant_value"].isna(), "retraining_grant_value"] = 0
df.loc[df["transport_compensation"].isna(), "transport_compensation"] = -1
df.loc[df["federalDistrictCode"].isna(), "federalDistrictCode"] = 4

In [12]:
# сделаем из пропусков и редких значений в кодах ОКСО и кодах профессий отдельный класс

mask = ~df["okso_code"].isna()
rare_okso = df[mask]["okso_code"].value_counts()[df[mask]["okso_code"].value_counts() < 500].keys()

df.loc[df["okso_code"].isin(rare_okso), "okso_code"] = -1
df.loc[df["okso_code"].isna(), "okso_code"] = -1


mask = ~df["code_profession"].isna()
rare_profs = df[mask]["code_profession"].value_counts()[df[mask]["code_profession"].value_counts() < 500].keys()

df.loc[df["code_profession"].isin(rare_profs), "code_profession"] = -1
df.loc[df["code_profession"].isna(), "code_profession"] = -1

PREPROC_CONFIG["rare_values_replace"] = {"okso_code": rare_okso, "code_profession": rare_profs}

In [13]:
def list_col_preproc(text):
    if pd.isna(text):
        return []
    if isinstance(text, list):
        return text
    if "[" in text:
        text = eval(text)
    else:
        text = text.split(",")
    lst = [x.strip() for x in text]
    return lst


PREPROC_CONFIG["list_OHE_preproc"] = {}
for col in tqdm(["required_drive_license", "social_protected_ids", "vacancy_benefit_ids"]):
    df[col] = df[col].map(list_col_preproc)
    uniq_values = set(sum(df[col].tolist(), []))
    for l in uniq_values:
        df[f"{col}_type_{l}"] = df[col].map(lambda x: l in x)
    df.drop(col, axis=1, inplace=True)
    PREPROC_CONFIG["list_OHE_preproc"][col] = uniq_values

100%|█████████████████████████████████████████████| 3/3 [02:31<00:00, 50.64s/it]


In [14]:
# сделаем признак, есть ли в текст в полях, где он может быть

potential_none_texts = ["additional_requirements", "other_vacancy_benefit", "position_requirements",
                        "position_responsibilities", "required_certificates", "education_speciality"]
for col in potential_none_texts:
    df[f"have_{col}"] = df[col].isna()

In [15]:
# преобразуем список хард скиллы в слитный текст для дальнейшей работы как с текстовой фичей

df["hardSkills"] = df["hardSkills"].map(lambda x: [k["hard_skill_name"] for k in eval(x)])
df["hardSkills"] = df["hardSkills"].map(lambda x: " ".join(x))

df["softSkills"] = df["softSkills"].map(lambda x: [k["soft_skill_name"] for k in eval(x)])
df["softSkills"] = df["softSkills"].map(lambda x: " ".join(x))

In [16]:
# создадим списки со всеми категориальными и текстовыми столбцами

cat_features = ["academic_degree", "accommodation_capability", "busy_type", "career_perspective",
                "code_professional_sphere", "contact_source", "education", "is_mobility_program",
                "is_uzbekistan_recruitment", "is_quoted", "need_medcard", "okso_code", 
                "accommodation_type", "additional_requirements", "bonus_type", "measure_type",
                "regionName", "retraining_capability", "retraining_grant", "schedule_type",
                "status", "source_type", "transport_compensation", "professionalSphereName",
                "federalDistrictCode", "company_business_size"
               ]

text_features=  ["additional_requirements", "education_speciality", "other_vacancy_benefit",
                 "position_requirements", "position_responsibilities", "required_certificates",
                "vacancy_name", "full_company_name", "hardSkills", "softSkills"]

## Предобработка текста

In [17]:
# сконкатинируем все текстовые признаки в один длинный текст, который затем будет векторизировать

df["full_text"] = ""
for c in text_features:
    df["full_text"] += " " + df[c].fillna("")

In [18]:
# напишем функцию для стандартного препроцессинга, включающую в себя удаление лишних символов и лемматизацию


def base_preproc(text):
    text = text.replace(",", " ").strip().lower()
    text = re.sub(' +', ' ', text)
    text = "".join([x for x in text if not x.isdigit()])
    
    text = " ".join(
        [morph.parse(word)[0].normal_form for word in text.split() if word not in russian_stopwords])
    return text

In [19]:
preproc_text = []
for txt in tqdm(df["full_text"]):
    preproc_text.append(base_preproc(txt))

100%|██████████████████████████████████| 614629/614629 [51:54<00:00, 197.33it/s]


In [20]:
df["full_text"] = preproc_text

## Векторизационные модели

Обучим на полученном предобработанном тексте tfidf для создания интерпретируемых для модели статистических признаков. Гипотеза в том, что важные текстовые признаки эффективно описываются с помощью частотности определенных слов, которую можно получить с помощью tfidf.

In [21]:
FEATURES_COUNT = 650

tfidf = TfidfVectorizer(max_features=FEATURES_COUNT).fit(preproc_text)
vector = tfidf.transform(preproc_text)
PREPROC_CONFIG["text_vectorizer"] = tfidf

vector = pd.DataFrame(vector.toarray(), columns=[f"text_embed_{i}" for i in range(FEATURES_COUNT)])
df = pd.concat([df, vector], axis=1, ignore_index=False).dropna(subset=["salary"])

In [22]:
cols_for_aggregation = ["code_professional_sphere", "regionName", "busy_type", "education"]

PREPROC_CONFIG["salary_aggregators"] = {}
for col in cols_for_aggregation:
    col_dct = {}
    for e in df[col].unique():
        tmp = df[df[col] == e]
        min_, max_, med_, mean_ = tmp["salary"].min(), tmp["salary"].max(), tmp["salary"].median(), tmp["salary"].mean()
        col_dct[e] = [min_, max_, med_, mean_]
    df[f"min_{col}_salary"] = df[col].map(lambda x: col_dct[x][0])
    df[f"max_{col}_salary"] = df[col].map(lambda x: col_dct[x][1])
    df[f"med_{col}_salary"] = df[col].map(lambda x: col_dct[x][2])
    df[f"mean_{col}_salary"] = df[col].map(lambda x: col_dct[x][3])
    
    PREPROC_CONFIG["salary_aggregators"][col] = col_dct

In [23]:
# на основе используемых аггрегаторов посчита

t = ["mean" in x for x in df.columns.tolist()]
df["nearest_mean_sal"] = df[df.columns[t]].mean(axis=1)

In [24]:
df["vacancy_address_latitude"] = df["vacancy_address_latitude"].fillna(df["vacancy_address_latitude"].mean())
df["vacancy_address_longitude"] = df["vacancy_address_longitude"].fillna(df["vacancy_address_longitude"].mean())

In [25]:
float_f = [x for x in list(set(df.columns[df.dtypes == float]) - set(cat_features)) if "text" not in x]
for_rescale = ["work_places", "additional_premium", 
 "vacancy_address_longitude",
 "required_experience", "vacancy_address_latitude"] + [x for x in float_f if "sal" in x]
for f in for_rescale:
    df[f] = np.log(df[f])
    
df = df.fillna(0)

In [26]:
cat_features = list(set(cat_features) - set(text_features))
df[cat_features] = df[cat_features].map(lambda x: str(x))

df.drop(text_features, axis=1, inplace=True)
df.drop("full_text", axis=1, inplace=True)

In [32]:
X_train, X_valid, y_train, y_valid = train_test_split(df.drop("salary", axis=1), df["salary"],
                                                      test_size=0.15)

In [33]:
train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [34]:
clf = CatBoostRegressor(max_depth=8, loss_function="RMSE",
                       subsample=0.9).fit(train_pool, 
                                                               eval_set=valid_pool, early_stopping_rounds=5)

Learning rate set to 0.136028
0:	learn: 0.4636218	test: 0.4645702	best: 0.4645702 (0)	total: 1.12s	remaining: 18m 37s
1:	learn: 0.4418826	test: 0.4430555	best: 0.4430555 (1)	total: 2s	remaining: 16m 38s
2:	learn: 0.4243846	test: 0.4257486	best: 0.4257486 (2)	total: 2.96s	remaining: 16m 22s
3:	learn: 0.4104805	test: 0.4120454	best: 0.4120454 (3)	total: 3.8s	remaining: 15m 47s
4:	learn: 0.3993361	test: 0.4010873	best: 0.4010873 (4)	total: 4.66s	remaining: 15m 27s
5:	learn: 0.3906573	test: 0.3925205	best: 0.3925205 (5)	total: 5.49s	remaining: 15m 8s
6:	learn: 0.3830807	test: 0.3850622	best: 0.3850622 (6)	total: 6.16s	remaining: 14m 34s
7:	learn: 0.3773326	test: 0.3794573	best: 0.3794573 (7)	total: 6.87s	remaining: 14m 11s
8:	learn: 0.3720703	test: 0.3742931	best: 0.3742931 (8)	total: 7.71s	remaining: 14m 9s
9:	learn: 0.3681901	test: 0.3705046	best: 0.3705046 (9)	total: 8.62s	remaining: 14m 13s
10:	learn: 0.3649501	test: 0.3673787	best: 0.3673787 (10)	total: 9.42s	remaining: 14m 7s
11:	lea

92:	learn: 0.3212655	test: 0.3242788	best: 0.3242788 (92)	total: 1m 37s	remaining: 15m 53s
93:	learn: 0.3211941	test: 0.3242211	best: 0.3242211 (93)	total: 1m 38s	remaining: 15m 52s
94:	learn: 0.3209489	test: 0.3239706	best: 0.3239706 (94)	total: 1m 40s	remaining: 15m 55s
95:	learn: 0.3208052	test: 0.3238301	best: 0.3238301 (95)	total: 1m 41s	remaining: 15m 57s
96:	learn: 0.3207273	test: 0.3237585	best: 0.3237585 (96)	total: 1m 42s	remaining: 15m 58s
97:	learn: 0.3205996	test: 0.3236446	best: 0.3236446 (97)	total: 1m 44s	remaining: 15m 59s
98:	learn: 0.3204285	test: 0.3234874	best: 0.3234874 (98)	total: 1m 45s	remaining: 16m 1s
99:	learn: 0.3200638	test: 0.3230991	best: 0.3230991 (99)	total: 1m 47s	remaining: 16m 3s
100:	learn: 0.3198935	test: 0.3229213	best: 0.3229213 (100)	total: 1m 48s	remaining: 16m 4s
101:	learn: 0.3197473	test: 0.3227949	best: 0.3227949 (101)	total: 1m 49s	remaining: 16m 6s
102:	learn: 0.3196213	test: 0.3226524	best: 0.3226524 (102)	total: 1m 51s	remaining: 16m 7

181:	learn: 0.3099617	test: 0.3133814	best: 0.3133814 (181)	total: 3m 40s	remaining: 16m 29s
182:	learn: 0.3098695	test: 0.3132634	best: 0.3132634 (182)	total: 3m 41s	remaining: 16m 29s
183:	learn: 0.3098027	test: 0.3131978	best: 0.3131978 (183)	total: 3m 42s	remaining: 16m 27s
184:	learn: 0.3097486	test: 0.3131382	best: 0.3131382 (184)	total: 3m 44s	remaining: 16m 27s
185:	learn: 0.3096796	test: 0.3130802	best: 0.3130802 (185)	total: 3m 45s	remaining: 16m 25s
186:	learn: 0.3096272	test: 0.3130402	best: 0.3130402 (186)	total: 3m 46s	remaining: 16m 24s
187:	learn: 0.3095946	test: 0.3130206	best: 0.3130206 (187)	total: 3m 47s	remaining: 16m 23s
188:	learn: 0.3095323	test: 0.3129698	best: 0.3129698 (188)	total: 3m 48s	remaining: 16m 22s
189:	learn: 0.3093989	test: 0.3128284	best: 0.3128284 (189)	total: 3m 49s	remaining: 16m 20s
190:	learn: 0.3092987	test: 0.3127378	best: 0.3127378 (190)	total: 3m 51s	remaining: 16m 19s
191:	learn: 0.3092169	test: 0.3126482	best: 0.3126482 (191)	total: 3m 

270:	learn: 0.3046270	test: 0.3088263	best: 0.3088263 (270)	total: 5m 40s	remaining: 15m 15s
271:	learn: 0.3046017	test: 0.3088079	best: 0.3088079 (271)	total: 5m 41s	remaining: 15m 15s
272:	learn: 0.3045770	test: 0.3087934	best: 0.3087934 (272)	total: 5m 43s	remaining: 15m 14s
273:	learn: 0.3045039	test: 0.3087005	best: 0.3087005 (273)	total: 5m 44s	remaining: 15m 13s
274:	learn: 0.3044826	test: 0.3086824	best: 0.3086824 (274)	total: 5m 46s	remaining: 15m 12s
275:	learn: 0.3044175	test: 0.3086278	best: 0.3086278 (275)	total: 5m 47s	remaining: 15m 12s
276:	learn: 0.3043644	test: 0.3085893	best: 0.3085893 (276)	total: 5m 49s	remaining: 15m 11s
277:	learn: 0.3043239	test: 0.3085561	best: 0.3085561 (277)	total: 5m 50s	remaining: 15m 11s
278:	learn: 0.3042361	test: 0.3084949	best: 0.3084949 (278)	total: 5m 52s	remaining: 15m 10s
279:	learn: 0.3041899	test: 0.3084531	best: 0.3084531 (279)	total: 5m 53s	remaining: 15m 9s
280:	learn: 0.3041341	test: 0.3084184	best: 0.3084184 (280)	total: 5m 5

359:	learn: 0.3010369	test: 0.3059941	best: 0.3059941 (359)	total: 7m 44s	remaining: 13m 45s
360:	learn: 0.3009938	test: 0.3059514	best: 0.3059514 (360)	total: 7m 45s	remaining: 13m 44s
361:	learn: 0.3009595	test: 0.3059250	best: 0.3059250 (361)	total: 7m 46s	remaining: 13m 42s
362:	learn: 0.3009554	test: 0.3059239	best: 0.3059239 (362)	total: 7m 47s	remaining: 13m 40s
363:	learn: 0.3009000	test: 0.3058840	best: 0.3058840 (363)	total: 7m 48s	remaining: 13m 38s
364:	learn: 0.3008814	test: 0.3058874	best: 0.3058840 (363)	total: 7m 49s	remaining: 13m 37s
365:	learn: 0.3008202	test: 0.3058152	best: 0.3058152 (365)	total: 7m 50s	remaining: 13m 35s
366:	learn: 0.3007534	test: 0.3057576	best: 0.3057576 (366)	total: 7m 51s	remaining: 13m 34s
367:	learn: 0.3007266	test: 0.3057503	best: 0.3057503 (367)	total: 7m 53s	remaining: 13m 32s
368:	learn: 0.3007085	test: 0.3057501	best: 0.3057501 (368)	total: 7m 54s	remaining: 13m 30s
369:	learn: 0.3006935	test: 0.3057490	best: 0.3057490 (369)	total: 7m 

448:	learn: 0.2987108	test: 0.3046090	best: 0.3046090 (448)	total: 9m 25s	remaining: 11m 34s
449:	learn: 0.2987017	test: 0.3046129	best: 0.3046090 (448)	total: 9m 26s	remaining: 11m 32s
450:	learn: 0.2986751	test: 0.3045889	best: 0.3045889 (450)	total: 9m 27s	remaining: 11m 31s
451:	learn: 0.2986593	test: 0.3045876	best: 0.3045876 (451)	total: 9m 28s	remaining: 11m 29s
452:	learn: 0.2986253	test: 0.3045656	best: 0.3045656 (452)	total: 9m 29s	remaining: 11m 28s
453:	learn: 0.2985992	test: 0.3045536	best: 0.3045536 (453)	total: 9m 31s	remaining: 11m 26s
454:	learn: 0.2985846	test: 0.3045416	best: 0.3045416 (454)	total: 9m 32s	remaining: 11m 25s
455:	learn: 0.2985757	test: 0.3045445	best: 0.3045416 (454)	total: 9m 33s	remaining: 11m 24s
456:	learn: 0.2985656	test: 0.3045460	best: 0.3045416 (454)	total: 9m 34s	remaining: 11m 22s
457:	learn: 0.2985565	test: 0.3045461	best: 0.3045416 (454)	total: 9m 35s	remaining: 11m 21s
458:	learn: 0.2985248	test: 0.3045317	best: 0.3045317 (458)	total: 9m 

537:	learn: 0.2968108	test: 0.3035183	best: 0.3035183 (537)	total: 11m 18s	remaining: 9m 43s
538:	learn: 0.2968028	test: 0.3035175	best: 0.3035175 (538)	total: 11m 20s	remaining: 9m 41s
539:	learn: 0.2967975	test: 0.3035169	best: 0.3035169 (539)	total: 11m 21s	remaining: 9m 40s
540:	learn: 0.2967689	test: 0.3034967	best: 0.3034967 (540)	total: 11m 22s	remaining: 9m 39s
541:	learn: 0.2967588	test: 0.3034956	best: 0.3034956 (541)	total: 11m 24s	remaining: 9m 38s
542:	learn: 0.2967476	test: 0.3034967	best: 0.3034956 (541)	total: 11m 25s	remaining: 9m 36s
543:	learn: 0.2967053	test: 0.3034758	best: 0.3034758 (543)	total: 11m 26s	remaining: 9m 35s
544:	learn: 0.2966936	test: 0.3034683	best: 0.3034683 (544)	total: 11m 27s	remaining: 9m 34s
545:	learn: 0.2966802	test: 0.3034535	best: 0.3034535 (545)	total: 11m 28s	remaining: 9m 32s
546:	learn: 0.2966706	test: 0.3034561	best: 0.3034535 (545)	total: 11m 30s	remaining: 9m 31s
547:	learn: 0.2966591	test: 0.3034576	best: 0.3034535 (545)	total: 11m

In [35]:
from sklearn.metrics import root_mean_squared_error

In [36]:
max(0, 1 - root_mean_squared_error(np.exp(y_valid), np.exp(clf.predict(valid_pool))) / 33000)

0.5229237429910527

In [37]:
pickle.dump(clf, open("salary_regressor.pickle", "wb"))

In [10]:
df_res.dropna(subset=["demands"], axis=0, inplace=True)

In [13]:
df_res = df_res[df_res["job_title"] != "специалист"]

In [17]:
df_res["corpus"] = df_res["demands"] + df_res["job_title"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_res["corpus"] = df_res["demands"] + df_res["job_title"]


In [19]:
def base_preproc(text):
    text = text.replace(",", " ").strip().lower()
    text = re.sub(' +', ' ', text)
    text = "".join([x for x in text if not x.isdigit()])
    return text

In [20]:
df_res["corpus"] = df_res["corpus"].map(base_preproc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_res["corpus"] = df_res["corpus"].map(base_preproc)


In [16]:
df_res

Unnamed: 0,achievements,achievements_modified,company_name,demands,id_cv,job_title
0,,,Управление социальной защиты населения Сердобс...,"Работала,непосредственно, с федеральными льгот...",52860148-26da-11e8-a184-9122a281f90e,инспектор
1,,,Администрация города Сердобска,Работа с реестром муниципальной собственности,52860148-26da-11e8-a184-9122a281f90e,инспектор
2,"Работы не закончены, т.к. нет настоящего желан...","Работы не закончены, т.к. нет настоящего желан...",январь 2016-настоящее времяООО «АРТЕЛЬ СТАРАТЕ...,Курирую работы по рудному направлению: соавтор...,e3c857b0-26a0-11e7-bcca-736ab11edb0c,геолог
3,,,"ОАО «Васильевский рудник», п.Мотыгино, Красноя...",ГРП. Документация керна разведочных скважин,e3c857b0-26a0-11e7-bcca-736ab11edb0c,геолог
4,,,"ООО «Артель старателей «ТАЛ»», Усть-Нера, Респ...",Поисково-разведочные работы на месторождении р...,e3c857b0-26a0-11e7-bcca-736ab11edb0c,геолог
...,...,...,...,...,...,...
1287460,Создал работоспособную команду| Увеличил объем...,Создал работоспособную команду| Увеличил объем...,РУСБИЗНЕСАВТО,Общение с клиентами по вопросам ремонта и обсл...,3386f2f0-263e-11e8-94aa-736ab11edb0c,руководитель
1287461,За период работы увеличен объем производства к...,За период работы увеличен объем производства к...,"ООО ""Москва на Дону""",Руководство финансово-хозяйственной деятельнос...,8cc2102c-fd0d-11e7-a659-e37b4be0b9ed,директор
1287462,И меются публикации в российских изданиях,И меются публикации в российских изданиях,ФБУН ФНЦ Медико-профилактических технологий уп...,"Пробоподготовка, иммуноферментый анализ, владе...",bbdbb110-f94b-11e7-b311-736ab11edb0c,иммунолог
1287463,Самостоятельно внедрила методы количественного...,Самостоятельно внедрила методы количественного...,"ГБУЗ ПК ""КДКБ"" Медико-генетическая консультация",Лабораторная диагностика (проведение пренаталь...,bbdbb110-f94b-11e7-b311-736ab11edb0c,биолог


In [25]:
pickle.dump(df_res["corpus"].tolist(), open("demands_w2v_corpus.pickle", "wb"))

In [3]:
job_list = pd.read_csv("vprod_train/JOB_LIST.csv", encoding_errors="ignore")

In [4]:
job_list.dropna(inplace=True, axis=0)

In [5]:
def base_preproc(text):
    text = text.replace(",", " ").strip().lower()
    text = re.sub(' +', ' ', text)
    text = "".join([x for x in text if not x.isdigit()])
    return text

In [6]:
job_list["job_title"] = job_list["job_title"].map(base_preproc)

In [None]:
my_counter = {}
for job in job_list["job_title"].tolist():
    for x in job.split():
        if x not in list(my_counter.keys())

In [None]:
words = sum([x.split() for x in job_list["job_title"].tolist()], [])

In [None]:
len(words)

In [107]:
bad_classes_df = df_res[df_res["id_cv"].isin(df_res[df_res["job_title"] == "специалист"]["id_cv"].unique())]

In [115]:
k = []

for ids, gr in tqdm(df_res.groupby("id_cv")):
    k.append(len(gr["company_name"].unique()))

100%|████████████████████████████████| 745438/745438 [00:15<00:00, 47155.67it/s]
