In [1]:
import yargy
import natasha
import json
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymorphy2 as pm

from copy import deepcopy
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from yargy import Parser

from yargy_parser import MONEY
from Model import Model
from Preprocessor import Preprocessor
from ForkExtractor import ForkExtractor
from VacancyExtractor import VacancyExtractor

import pickle

# Build the shared Preprocessor and call its load() (no arguments are passed;
# presumably it restores previously saved state -- TODO confirm in Preprocessor).
preprocessor = Preprocessor()
preprocessor.load()

# Build the Model and load it from the './models' path.
model = Model()
model.load('./models')

# Both extractors receive the same preprocessor instance; the fork extractor
# additionally receives the loaded model.
vac_extractor = VacancyExtractor(preprocessor=preprocessor)
fork_extractor = ForkExtractor(preprocessor=preprocessor, model=model)

In [4]:
# load data
from data_loader import Dataloader

def digit_extractor(txt):
    """Flag whether ``txt`` contains at least one digit.

    Parameters
    ----------
    txt : str
        Text to scan.

    Returns
    -------
    bool or float
        ``True`` when a digit is found, ``np.nan`` otherwise. NaN (rather than
        ``False``) is returned so callers can select the digit-free rows with
        ``isna()``.
    """
    # A bare `\d` needs neither a capture group nor IGNORECASE/DOTALL flags.
    if re.search(r'\d', txt):
        return True
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    return np.nan

DATA_FOLDER = 'data/'
# NOTE(review): os.listdir returns entries in arbitrary order, so the row order
# of `df` (and every positional index derived from it) may vary between machines.
list_dir = os.listdir(DATA_FOLDER)
data_len = len(list_dir)

print(f'number of jobs data (days): {data_len} (years: {round(data_len/365, 3)})')

seed = 253
dataloader = Dataloader(DATA_FOLDER, random_state=seed)

# Post fields copied into the frame, in column order.
POST_KEYS = ['text', 'user', 'ts', 'reply_count', 'reactions']
# Fields that are rounded and stored as int when present.
INT_KEYS = ('ts', 'reply_count')
# Messages shorter than this many characters are dropped.
MIN_TEXT_LEN = 200


def _post_value(post, key):
    """Return ``post[key]`` normalised for the DataFrame.

    Fields listed in ``INT_KEYS`` are converted with ``int(round(float(...)))``
    whenever they are not None (so a value of 0 is kept as 0). Any other
    missing or falsy value becomes ``np.nan``; truthy values pass through
    unchanged.
    """
    value = post.get(key, None)
    if key in INT_KEYS and value is not None:
        return int(round(float(value)))
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    return value if value else np.nan


preprocessed_data = []

for index, filename in enumerate(list_dir):
    if index%200==0:
        print (f'#progress: {index}/{data_len}')
    posts = dataloader.parse_one_day(filename)

    for post in posts:
        # filter out comments: skip posts that carry a 'subtype' key or have no text
        if 'subtype' in post or not post.get('text', None):
            continue
        # skip posts without a reply_count
        if post.get('reply_count', None) is None:
            continue

        preprocessed_data.append([filename] + [_post_value(post, key) for key in POST_KEYS])

df = pd.DataFrame(preprocessed_data, columns=['filename'] + POST_KEYS)
print(f'Shape before droping short messages: {df.shape}')
# from timestamp (seconds) to datetime
df['ts_date'] = pd.to_datetime(df['ts'], unit='s')
# measure the text length
df['text_len'] = df['text'].str.len()
# drop short messages
df.drop(df[df['text_len']<MIN_TEXT_LEN].index, inplace=True)
df.reset_index(drop=True, inplace=True)
print(f'Shape after droping short messages: {df.shape}')

# digit_extractor yields True or NaN, so isna() selects the digit-free messages
df['is_any_digit'] = df['text'].apply(digit_extractor)
print(f'Shape before droping rows with no any digit: {df.shape}')
df.drop(df[df['is_any_digit'].isna()].index, inplace=True)
df.reset_index(drop=True, inplace=True)
print(f'Shape after droping rows with no any digit: {df.shape}')


number of jobs data (days): 1273 (years: 3.488)
#progress: 0/1273
#progress: 200/1273
#progress: 400/1273
#progress: 600/1273
#progress: 800/1273
#progress: 1000/1273
#progress: 1200/1273
Shape before droping short messages: (1093, 6)
Shape after droping short messages: (1038, 8)
Shape before droping rows with no any digit: (1038, 9)
Shape after droping rows with no any digit: (1008, 9)


## Train the fork model (only needed if you'd like to retrain the models with your own feature set)

In [5]:
messages = df['text'].values

results, indexes = [], []

# Parse every message for potential forks, keeping one result list per message.
# Result structure (per the author's notes), e.g.:
#     "100 - 250k$ per month, ..." -> [[100, 250, 'USD', 1000, 'month', span.start, span.end], ...]
#     "Salary: ~100 ..."           -> [[100, -1, '-', -1, '-', span.start, span.end], ...]
for index, message in enumerate(messages):
    if not index % 50:
        print(f'#PROGRESS: {index}/{len(messages)}')

    mes_forks = fork_extractor.parse(message)
    indexes.append(index)
    results.append(mes_forks)

# Group message indexes by the number of forks found:
#   null_indexes  - no fork found by the yargy.Parser(MONEY) rules
#   one_indexes   - exactly one fork in the message
#   multi_indexes - several forks in the message
null_indexes, one_indexes, multi_indexes = fork_extractor.group_parsed_messages(results)
print(
    f'null_indexes_len: {len(null_indexes)}, '
    f'one_indexes_len: {len(one_indexes)}, '
    f'multi_indexes_len: {len(multi_indexes)}'
)

# A fresh Preprocessor replaces the one bound to `preprocessor` earlier.
preprocessor = Preprocessor()
# Label the forks of the `one_indexes` messages with the rules defined in
# preprocessor.__unsupervised_parse_error_detect:
#   positive - indexes of messages whose fork is legitimate, e.g. "Вилка от 100 до бесконечности"
#   negative - indexes of messages whose fork is not legitimate, e.g. "В нашей команде 5 человек"
positive, negative = preprocessor.one_index_unsupervised_parse_error_detect(
    one_indexes, results, messages
)
print(f'positive examples number: {len(positive)}, negative examples number: {len(negative)}')
# note: the dataset is somewhat imbalanced

# Build the training frame with the engineered features.
lcl_df = preprocessor.prepare_train_dataset(positive, negative, results, messages)

# Persist the preprocessor attributes produced above.
with open('./preprocessor.pkl', 'wb') as pkl_file:
    pickle.dump(
        [
            preprocessor.pos_words,  # words in positive messages
            preprocessor.neg_words,  # words in negative messages
            preprocessor.ohe_dc,
            preprocessor.ohe_out_columns,
        ],
        pkl_file,
    )
    


#PROGRESS: 0/1008
#PROGRESS: 50/1008
#PROGRESS: 100/1008
#PROGRESS: 150/1008
#PROGRESS: 200/1008
#PROGRESS: 250/1008
#PROGRESS: 300/1008
#PROGRESS: 350/1008
#PROGRESS: 400/1008
#PROGRESS: 450/1008
#PROGRESS: 500/1008
#PROGRESS: 550/1008
#PROGRESS: 600/1008
#PROGRESS: 650/1008
#PROGRESS: 700/1008
#PROGRESS: 750/1008
#PROGRESS: 800/1008
#PROGRESS: 850/1008
#PROGRESS: 900/1008
#PROGRESS: 950/1008
#PROGRESS: 1000/1008
null_indexes_len: 27, one_indexes_len: 208, multi_indexes_len: 773
positive examples number: 179, negative examples number: 29
implicit target assign error


Define your own model with your own feature set

I trained Decision tree ({'max_depth': 4}) and Logistic Regression ('solver': 'liblinear').

All features engineered during the `prepare_train_dataset` step:
    `2gte1` - whether the second value of the fork is >= the first one
    `len1` - length of the first value of the fork
    `1mod5==0` - whether the first value of the fork `mod` 5 == 0
    `1<10` - whether the first value of the fork is < 10
    `2==-1` - whether the second value of the fork is the default value (-1)
    `len2` - length of the second value of the fork
    `2mod5==0` - whether the second value of the fork `mod` 5 == 0
    `2<10` - whether the second value of the fork is < 10
    `curMultPeriod` - whether yargy.Parser detected any of `currency`, `multiplier` or `period`
    `only_digit` - whether there is anything except digits in the predefined `local_context`
    `context` - the yargy.Parser() span +- 50 chars on both sides
    `local_context` - ....
    `len_tokens` - number of `tokens` in `context`
    `tokens` - ...
    `all_words_score` - see the source code for how I calculated the `all_words` and `short_words` scores. The intuition behind them: they reflect how well the context corresponds to the structure extracted by yargy.Parser. In short, the calculation depends on the `context` `tokens`, how often they appear in `positive` and `negative` examples, and everything is weighted by the distance to the structure extracted by yargy.Parser.
    `short_words_score` - see `all_words_score`
    
    
The chosen features for both models are in the source code of Model() class.
Validation strategy - leave one out, because the training set is small

In [6]:
# Define your own model with your own feature set.
# Rebinds `model` to a fresh Model and trains it on `lcl_df`, the frame returned
# by preprocessor.prepare_train_dataset.
model = Model()
model.train(lcl_df)

# Save your model (left disabled; presumably writes to './models' when uncommented).
# model.save('./models')

#MODEL: training start
Base models training
TRAIN: Acc: 0.9951690821256035, Roc_auc: 0.9990028592928535, PRC_auc: 0.9998331874576499
TEST: Acc: 0.9663461538461539
TRAIN: Acc: 0.9764725009290228, Roc_auc: 0.9680445051433092, PRC_auc: 0.988056589275073
TEST: Acc: 0.9711538461538461
Stacked models training
TRAIN: Acc: 1.0, Roc_auc: 1.0, PRC_auc: 1.0
TEST: Acc: 0.9951923076923077
TRAIN: Acc: 0.9902684875510962, Roc_auc: 0.9984983029631341, PRC_auc: 0.9997441100377977
TEST: Acc: 0.9855769230769231


In [8]:
# Predict: rebind `lcl_df` to whatever model.predict returns for the training frame.
# NOTE(review): later cells read pred_label_* / pred_proba_* columns from this result,
# so predict presumably returns the frame with those columns added -- confirm in Model.
# NOTE(review): re-running this cell feeds the previous output back into predict.
lcl_df = model.predict(lcl_df)

## If you'd like to look at the models' errors, see below:

In [9]:
print('TREE MODEL ERROR')
# Rows where the tree model's predicted label disagrees with the target.
tree_error_rows = lcl_df[lcl_df['pred_label_tree'] != lcl_df['target']]
tree_error_columns = [
    '1', '2', '3', '4', '5', 'context', 'tokens',
    'all_words_score', 'short_words_score', 'target',
    'pred_proba_linear', 'pred_label_linear',
    'pred_proba_tree', 'pred_label_tree',
    'pred_proba_linear_stack', 'pred_proba_tree_stack',
]
tree_error_rows[tree_error_columns].head(20)

TREE MODEL ERROR


Unnamed: 0,1,2,3,4,5,context,tokens,all_words_score,short_words_score,target,pred_proba_linear,pred_label_linear,pred_proba_tree,pred_label_tree,pred_proba_linear_stack,pred_proba_tree_stack
25,100,-1,-,-1,-,Официальное трудоустройство и 100% официальная...,"['официальный', 'трудоустройство', 'и', '100%'...",0.596345,0.749624,0,0.69456,1.0,0.909091,1.0,0.835748,0.0


In [10]:
print('LINEAR MODEL ERROR')
# Rows where the linear model's predicted label disagrees with the target.
linear_error_rows = lcl_df[lcl_df['pred_label_linear'] != lcl_df['target']]
linear_error_columns = [
    '1', '2', '3', '4', '5', 'context', 'tokens',
    'all_words_score', 'short_words_score', 'target',
    'pred_proba_linear', 'pred_label_linear',
    'pred_proba_tree', 'pred_label_tree',
    'pred_proba_linear_stack', 'pred_proba_tree_stack',
]
linear_error_rows[linear_error_columns].head(20)

LINEAR MODEL ERROR


Unnamed: 0,1,2,3,4,5,context,tokens,all_words_score,short_words_score,target,pred_proba_linear,pred_label_linear,pred_proba_tree,pred_label_tree,pred_proba_linear_stack,pred_proba_tree_stack
18,20,-1,-,-1,-,"плюшки стандартны, правило 20% в нашем случае...","['плюшка', 'стандартный', 'правило', '20%', 'в...",0.33778,0.545446,0,0.592706,1.0,0.0,0.0,0.215607,0.0
25,100,-1,-,-1,-,Официальное трудоустройство и 100% официальная...,"['официальный', 'трудоустройство', 'и', '100%'...",0.596345,0.749624,0,0.69456,1.0,0.909091,1.0,0.835748,0.0
27,50,50,-,-1,-,"С точки зрения задач, примерно 50/50 это анали...","['с', 'точка', 'зрение', 'задача', 'примерно',...",0.247281,0.247281,0,0.779783,1.0,0.0,0.0,0.492163,0.0
28,15000,4000,USD,-1,-,вилка зарплат(от 15000 $ до 4000$),"['вилка', 'зарплата', 'от', '15000', 'до', '40...",1.85296,2.667786,0,0.99852,1.0,0.0,0.0,0.897424,0.0
207,20,-1,-,-1,-,"Есть предложение тех, кто собрал 20 :ban: под ...","['есть', 'предложение', 'тот', 'кто', 'собрать...",0.157154,0.273504,0,0.501762,1.0,0.0,0.0,0.192746,0.0


## As you can see, most of the time the models make false predictions on forks whose context contains the '%' symbol. That's why I added some rule-based post-filtering in the source code of ForkExtractor; see its `extract` method.

# Examples of Fork and Vacancy parsing

In [2]:

# Recreate the fork extractor from the current `preprocessor` and `model`.
fork_extractor = ForkExtractor(preprocessor=preprocessor, model=model)

# fork_extractor.extract returns 2 lists:
# 1st list - forks captured by yargy.Parser. The gross/net property is not considered during yargy parsing.
# 2nd list - filtered, normalized forks derived from the 1st list. E.g. salary per hour/day/year is converted to a monthly figure and scaled by the multiplier

# Each entry is (input text, expected normalized fork). The expected string is only
# printed next to the actual result below; it is never asserted.
# NOTE(review): several expected strings near the end of the list look copy-pasted
# (e.g. a rouble input paired with a USD expectation) -- verify before relying on them.
cases_fork = [('от 60К руб net до 300К net', '60000 RUB-300000 RUB'),
# in the case above, because yargy.Parser does not consider the net/gross property, it splits one fork into 2 separate forks
                ('от 60К до 300К грязными', '60000 RUB-300000 RUB'),
              ('от 60к до 300к gross', '60000 RUB-300000 RUB'),
              ('120т.р. - 160 т.р. чистыми', '120000 RUB-160000 RUB'),
              ('$5k–$8k', '5000 USD-8000 USD'),
              ('150-250 т.р. «чистыми»', '150000 RUB-250000 RUB'),
              ('2.5-4.5k USD', '2500.0 USD-4500.0 USD'),
              ('2.5-4.5k $', '2500.0 USD-4500.0 USD'),
              ('2.5-4.5k$', '2500.0 USD-4500.0 USD'),
              ('1K - 2K EUR нетто ', '1000 EUR-2000 EUR'),
              ('1K - 2K € нетто ', '1000 EUR-2000 EUR'),
              ('1K - 2K€ нетто ', '1000 EUR-2000 EUR'),
              ('€1K - €2K EUR нетто ', '1000 EUR-2000 EUR'),
              ('1K - 2K € нетто ', '1000 EUR-2000 EUR'),
              ('1000 - 2000 € нетто ', '1000 EUR-2000 EUR'),
              ('Оклад в вилке от 150 до 250 гросс', '150000 RUB-250000 RUB'),
              ('ЗП: 130-200к руб.', '130000 RUB-200000 RUB'),
              ('Зарплату от 200К до 1М рублей', '200000 RUB-1000000 RUB'),
              # ForkExtractor filters out values bigger than 700k rubles and 200k$ if the salary is not stated as `per year`,
              # in order to avoid false captures like `we have 20 millions of users` or `we got 980k$ round investments`
              ('зп: 60 000 - 120 000 т.р. net', '60000 RUB-120000 RUB'),
              ('от 3,4 до 4,8 млн.рублей', '3400000 RUB-4800000 RUB'),
              ('280-400+ тысяч рублей', '280000 RUB-400000 RUB'),
              ('вилка $$1000-5000', '1000 USD-5000 USD'),
              ('1000-2500k USD', '1000 USD-2500 USD'), 
              # ForkExtractor interprets values like the one above as: [ 1000, 2500, USD, 1000] -> [ 1000 000, 2500 000, USD]
              ('от $ 800 до 1100 net', '800 USD-1100 USD'),
              ('от 3,4 до 4,8 млн.рублей', '3400000 RUB-4800000 RUB'),
              ('280+ тысяч рублей', '280000 RUB-400000 RUB'),
              ('вилка $1000', '1000 USD-5000 USD'),
              ('вилка рублей 21 000 + бонусы', '1000 USD-5000 USD'),
              ('2500k USD', '1000 USD-2500 USD'),
             ('50-259k рублей (jun-senior 50-120, 120-200, 200 -250)', '1000 USD-2500 USD'),]

# Inputs collected as non-fork examples (phone numbers, dates, URLs, head counts, ...).
cases_not_fork = ['+7(495)6386767',
                  'tel:+7(906)747-73-90',
                  '2018/01/29',
                  'Более 20 000 сотрудников по всей России',
                  'http://andrewgelman.com/2017/01/16/hiring-hiring-hiring-hiring/',
                  'Белая зп.: 150 000 рублей',
                  'График работы с 9:30 до 18:00',
                  'мы планируем вырасти с 1,5 до 50 миллионов пользователей',
                  'Equity range: 0.25-1.5%']

# Print each case next to the second list returned by extract (index [1]).
for x in cases_fork:
    print(x, fork_extractor.extract(x[0])[1])
print('------')
for x in cases_not_fork:
    print(x, fork_extractor.extract(x)[1])

('от 60К руб net до 300К net', '60000 RUB-300000 RUB') [[60000.0, -1.0, 'RUB'], [300000.0, -1.0, '-']]
('от 60К до 300К грязными', '60000 RUB-300000 RUB') [[60000.0, 300000.0, '-']]
('от 60к до 300к gross', '60000 RUB-300000 RUB') [[60000.0, 300000.0, '-']]
('120т.р. - 160 т.р. чистыми', '120000 RUB-160000 RUB') [[120000.0, 160000.0, 'RUB']]
('$5k–$8k', '5000 USD-8000 USD') [[5000.0, 8000.0, 'USD']]
('150-250 т.р. «чистыми»', '150000 RUB-250000 RUB') [[150000.0, 250000.0, 'RUB']]
('2.5-4.5k USD', '2500.0 USD-4500.0 USD') [[4000.0, 5000.0, 'USD']]
('2.5-4.5k $', '2500.0 USD-4500.0 USD') [[4000.0, 5000.0, 'USD']]
('2.5-4.5k$', '2500.0 USD-4500.0 USD') [[4000.0, 5000.0, 'USD']]
('1K - 2K EUR нетто ', '1000 EUR-2000 EUR') [[1000.0, 2000.0, 'EUR']]
('1K - 2K € нетто ', '1000 EUR-2000 EUR') [[1000.0, 2000.0, 'EUR']]
('1K - 2K€ нетто ', '1000 EUR-2000 EUR') [[1000.0, 2000.0, 'EUR']]
('€1K - €2K EUR нетто ', '1000 EUR-2000 EUR') [[1000.0, 2000.0, 'EUR']]
('1K - 2K € нетто ', '1000 EUR-2000 EUR

In [3]:
# Recreate the vacancy extractor from the current `preprocessor`.
vac_extractor = VacancyExtractor(preprocessor=preprocessor)
# VacancyExtractor is based on regexps
# VacancyExtractor regexp pattern - level_re + field_re + vac_name_re + level_re + field_re

cases = ['Junior ML Engineer',
        'Jun/senior ds',
        'Jun/mid/senior (nlp/cv) ds',
        'ds Jun/mid/senior (nlp/cv)', # when the fields come at the end and there are several of them, the last field isn't caught
        'ds (nlp/cv) Jun/mid/senior', # doesn't work, because it doesn't match the regexp pattern. But usually nobody writes in such a format
        'Jun/mid/senior ds (nlp/cv)', 
        'Ваша задача будет менторить Jun ds-ов и делать код ревью', 
        'Мы ищем кучу Jun ds-ов, чтобы они батрачали на нас', 
        'Ведется набор Jun ds-ов, чтобы они батрачали на нас', 
        'Вы владеете стандартным набором ds утилиток', 
        ]

# VacancyExtractor.extract returns 2 lists

# The first list is the unfiltered set of captured vacancies sorted by VacancyExtractor.__vacany_sort_func1
# The first list structure: [[level, (field, field, ...), vacancy_name, span.start, span.end, left_context]]

# The second list is the filtered set of captured UNIQUE vacancies based on the first list. A vacancy is kept based on its left context:
# if there is no left context, or there is no alphabetic character in the left context,
# or the left context contains one of these lemmatized word parts:
# ['vaca', 'posi', 'role', 'need', 'look', 'job', 'вака', 'назв', 'роль', 'треб', 'нужн', 'поис', 'разы', 'иска', 'необ'] or 'позици' in tkn and 'позицио' not in tkn:
# Because of the last 2 cases, I didn't include 'набо' in the filter list

# Print each input next to the pair of lists returned by extract.
for x in cases:
    print(x, vac_extractor.extract(x), '\n-----')

Junior ML Engineer ([['junior', ('ml',), 'engineer', 0, 18, '']], [['junior', ('ml',), 'engineer']]) 
-----
Jun/senior ds ([['junior', (), 'ds', 0, 13, ''], ['senior', (), 'ds', 0, 13, '']], [['junior', (), 'ds'], ['senior', (), 'ds']]) 
-----
Jun/mid/senior (nlp/cv) ds ([['junior', ('nlp', 'cv'), 'ds', 0, 26, ''], ['middle', ('nlp', 'cv'), 'ds', 0, 26, ''], ['senior', ('nlp', 'cv'), 'ds', 0, 26, '']], [['junior', ('nlp', 'cv'), 'ds'], ['middle', ('nlp', 'cv'), 'ds'], ['senior', ('nlp', 'cv'), 'ds']]) 
-----
ds Jun/mid/senior (nlp/cv) ([['junior', ('nlp',), 'ds', 0, 22, ''], ['middle', ('nlp',), 'ds', 0, 22, ''], ['senior', ('nlp',), 'ds', 0, 22, '']], [['junior', ('nlp',), 'ds'], ['middle', ('nlp',), 'ds'], ['senior', ('nlp',), 'ds']]) 
-----
ds (nlp/cv) Jun/mid/senior ([['', ('nlp',), 'ds', 0, 7, '']], [['', ('nlp',), 'ds']]) 
-----
Jun/mid/senior ds (nlp/cv) ([['junior', ('nlp',), 'ds', 0, 22, ''], ['middle', ('nlp',), 'ds', 0, 22, ''], ['senior', ('nlp',), 'ds', 0, 22, '']], [['jun

# Extract all vacancy names and forks for the vacancies in our DF (optional: useful if you want to continue the project and try to link multiple vacancies to their corresponding multiple forks).

In [None]:

messages = df['text'].values
# forks[0] / vacancies[0] collect the first list returned by each extractor,
# forks[1] / vacancies[1] the second (filtered / normalized) list.
# `forks` was never initialised in the original cell, so the first append
# raised NameError.
forks = [[], []]
vacancies = [[], []]
# Positions (within `messages`) of the messages that were kept.
indexes = []

for index, message in enumerate(messages):
    if index%50==0:
        print(f'#PROGRESS: {index}/{len(messages)}')
    try:
        mes_forks, mes_normalized_forks = fork_extractor.extract(message)
        mes_vacancies, mes_filt_vacancies = vac_extractor.extract(message)
    except Exception as e:
        # Report which message failed, then re-raise the original error so the
        # real traceback is shown (an `assert False` would mask it and is
        # stripped entirely under `python -O`).
        print(index, e)
        raise

    # Keep only messages for which all four result lists are non-empty.
    if [] in [mes_forks, mes_normalized_forks, mes_vacancies, mes_filt_vacancies]:
        continue

    indexes.append(index)
    forks[0].append(mes_forks)
    forks[1].append(mes_normalized_forks)
    vacancies[0].append(mes_vacancies)
    vacancies[1].append(mes_filt_vacancies)
        