In [1]:
import pandas as pd
import numpy as np
import io
import re
from tqdm import tqdm

# Printing amount of lines and numbers

In [None]:
def count_lines(x):
    return sum([1 for i in x.splitlines() if i.strip()])

In [None]:
def count_numbers(x):
    return len(x)

In [None]:
with open("user_messages_clear_w_dt.json",'r') as fout:
    msg_dt = pd.read_json(fout, compression=None, orient='table')

In [None]:
print(max(msg_dt['message'].apply(np.vectorize(count_lines))))

In [None]:
print(max(msg_dt['numbers'].apply(count_numbers)))

# Marking data with key words

In [None]:
def search_by_key_words(x, numbers, key_words):
    for key in key_words:
        key_found = re.search(key, x, re.IGNORECASE)
        if key_found:
            for i, n in enumerate(numbers):
                number_found = re.search(str(n), x[key_found.start():])
                if number_found:
                    if key_found.start() + number_found.end() < len(x):
                        if not x[key_found.start() + number_found.end()].isdigit():
                            if i + 1 < len(numbers):
                                return (len(x[:key_found.start() + number_found.end()].splitlines()), n, numbers[i + 1] if (n < numbers[i + 1]) else n)
                            else:
                                return (len(x[:key_found.start() + number_found.end()].splitlines()), n, n)
    return (None, None, None)

In [None]:
msg_dt['salary_line'] = None
msg_dt['up_fork'] = None
msg_dt['low_fork'] = None

In [None]:
key_words = ['pay', 'fork', 'вилка', 'зп']

In [None]:
#search_vectorized = lambda x,y: search_by_key_words(x, y, key_words)
#(msg_dt['salary_line'][0], msg_dt['low_fork'][0], msg_dt['up_fork'][0]) = search_vectorized(msg_dt['message'][0], msg_dt['numbers'][0])
#(msg_dt['salary_line'], msg_dt['low_fork'], msg_dt['up_fork']) = np.vectorize(search_vectorized)(msg_dt['message'], msg_dt['numbers'])

In [None]:
for i in tqdm(msg_dt.index):
    (msg_dt['salary_line'][i], msg_dt['low_fork'][i], msg_dt['up_fork'][i]) = search_by_key_words(msg_dt['message'][i], msg_dt['numbers'][i], key_words)

In [None]:
print(msg_dt[['salary_line', 'low_fork', 'up_fork']])

In [None]:
with open("user_messages_clear_w_dt.json",'w') as fout:
    msg_dt.to_json(fout, compression=None, orient='table')

# Copying training examples 

In [None]:
with open("user_messages_clear_w_dt.json",'r') as fout:
    msg_dt = pd.read_json(fout, compression=None, orient='table')

In [None]:
msg_dt_true = msg_dt.loc[msg_dt['salary_line'].notnull()]
msg_dt_true.reset_index(inplace=True)
msg_dt_true = msg_dt_true.sample(100).reset_index(drop=True)

In [None]:
def manual_check(msg):
    print()
    lines = msg['message'].splitlines()
    for n, line in enumerate(lines):
        print("{} {}".format(n, line))
    print('Is message a vacancy post? (y/n)')
    c = input()
    if c == 'y':
        print('Input a number of salary line:')
        msg.loc['salary_line'] = int(input())
        print('Input down salary value:')
        msg.loc['low_fork'] = float(input())
        print('Input top salary value:')
        msg.loc['up_fork'] = float(input())
        #return False
    #return True
    return msg

In [None]:
sample_length = 100

msg_dt_false = pd.DataFrame(columns=msg_dt.columns)
msg_dt_unknown = msg_dt.loc[msg_dt['salary_line'].isnull()]
while msg_dt_false.shape[0] < sample_length:
    msg_dt_unknown_sample = msg_dt_unknown.sample(n=sample_length - msg_dt_false.shape[0]).apply(manual_check, axis=1)
    msg_dt_false = msg_dt_false.append(msg_dt_unknown_sample[msg_dt_unknown_sample['salary1_line'].isnull()])
    msg_dt_unknown.drop(msg_dt_unknown_sample.index[msg_dt_unknown_sample['salary_line'].notnull()], inplace=True)
    msg_dt.loc[msg_dt_unknown_sample.index[msg_dt_unknown_sample['salary_line'].notnull()]] = msg_dt_unknown_sample[msg_dt_unknown_sample['salary_line'].notnull()]

**Ключевые слова**
зарплата
salary
оклад
компенсация
з/п
т.р.
гросс
net
yearly

**Валюты**
RUB
UAH
AED
BYN
CAD
MYR

In [None]:
with open("user_messages_clear_w_dt.json",'w') as fout:
    msg_dt.to_json(fout, compression=None, orient='table')

In [None]:
with open("salary_posts.json",'w') as fout:
    msg_dt_true.to_json(fout, compression=None)

In [None]:
with open("non_salary_posts.json",'w') as fout:
    msg_dt_false.to_json(fout, compression=None)

# Lines dataframe

In [None]:
def get_features(msg_df, special_symbols):
        
    cols = ['id',
            'symbol_number', 
            'word_number', 
            'line_number', 
            'number_number', 
            'len_1_lines', 
            'len_2_lines', 
            'special_symbols',
            'isVacancy']
    
    feat_df = pd.Series(0, index=cols)
    
    feat_df['id'] = msg_df['index']
    
    feat_df['symbol_number'] = len(msg_df['message'])
    words = re.split(' |;|,|\*|\n|\t', msg_df['message'])
    lines = msg_df['message'].splitlines()
    
    feat_df['word_number'] = len(words)
    
    feat_df['line_number'] = len(lines)
    
    feat_df['number_number'] = len(msg_df['numbers'])
    
    feat_df['len_1_lines'] = len([x for x in words if len(x) == 1])
    
    feat_df['len_2_lines'] = len([x for x in words if len(x) == 2])
    
    for symbol in special_symbols:
        feat_df['special_symbols'] += msg_df['message'].upper().count(symbol)
        
    if not np.isnan(msg_df['salary_line']):
        feat_df['isVacancy'] = 1
        
    return feat_df

In [None]:
with open("non_salary_posts.json",'r') as fin:
    msg_dt_false = pd.read_json(fin, orient='table')

In [None]:
with open("salary_posts.json",'r') as fin:
    msg_dt_true = pd.read_json(fin, compression=None)

In [None]:
print(msg_dt_true)

In [None]:
train_df = pd.concat([msg_dt_false, msg_dt_true], ignore_index=True, sort=False)

In [None]:
cols = ['id',
        'symbol_number', 
        'word_number', 
        'line_number', 
        'number_number', 
        'len_1_lines', 
        'len_2_lines', 
        'special_symbols',
        'isVacancy']
train_feat = pd.DataFrame(columns=cols, index=msg_dt.index)

In [24]:
special_symbols = ["EUR","USD","РУБ","К","ТЫС","€","$","ДОЛЛАР","ЕВРО", "RUB", "UAH", "AED", "BYN", "CAD", "MYR", "Т.Р."]

In [None]:
get_features_vectorized = lambda x: get_features(x, special_symbols)

train_feat = train_df.apply(get_features_vectorized, axis=1)

In [None]:
with open("salary_dataset.json",'w') as fout:
    train_feat.to_json(fout, compression=None, orient='table')

# GradBoost Salary Recognition

In [2]:
import numpy as np
import pandas as pd
import catboost
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

In [12]:
class Model_ctbst:
    def __init__(self, objective='CrossEntropy'):
        '''input:
        objective - loss function
        eval - metric for loggigng'''
        self.objective = objective
        self.model = None
        '''Try to tune it'''
        self.model_params = dict(
            thread_count=8,
            iterations=2000,
            depth=8,
#             bagging_temperature=0.33,
#             learning_rate=0.1,
#             l2_leaf_reg=3,
#             random_strength=0.7,
            loss_function=self.objective
            )
        self.training_params = dict(
            use_best_model=True,
            early_stopping_rounds=50,
            verbose=10
            )
        
    def _set_model_(self):
        self.model = CatBoostClassifier()
        assert self.objective in ['CrossEntropy', 'Logloss']
        self.model.set_params(**self.model_params)
        
    def train(self, X_train, y_train, X_valid, y_valid):
        train_cat_features_indices = np.where(X_train.dtypes != np.float)[0]
        valid_cat_features_indices = np.where(X_valid.dtypes != np.float)[0]
        '''setting pools without weights'''
        ctbst_train_pool = Pool(data=X_train, label=y_train, cat_features=train_cat_features_indices)
        ctbst_val_pool = Pool(data=X_valid, label=y_valid, cat_features=valid_cat_features_indices)
        '''logging'''
        print('Training Model CatBoost')
        print('X_train = %s Y_train = %s' % (X_train.shape, y_train.shape))
        print('X_valid = %s Y_valid = %s' % (X_valid.shape, y_valid.shape))
        print()
        '''training'''
        self._set_model_()
        self.model = self.model.fit(ctbst_train_pool,
                                    eval_set=ctbst_val_pool,
                                    **self.training_params)
        '''feature importances'''
        print('Top features')
        feature_importances = self.model.get_feature_importance(ctbst_train_pool)
        feature_names = X_train.columns
        for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
            print('{}: {}'.format(name, score))
        
    def predict(self, X):
        if self.model is None:
            raise Exception('Train your model before')
        print('Predicting Model CatBoost')
        print('X = %s' % (X.shape,))
        print()
        X_cat_features_indices = np.where(X.dtypes != np.float)[0]
        ctbst_data_pool = Pool(data=X, cat_features=X_cat_features_indices)
        '''predict'''
        prediction = self.model.predict(ctbst_data_pool, prediction_type='Probability')
        '''get pred for 1 class'''
        prediction = pd.Series(prediction[:, 1], index=X.index)
        return prediction


In [13]:
class Trainer:
    def __init__(self, model, train_type, target, features, eval='roc-auc', n_folds=4, seed=42):
        self.model = model
        self.train_type = train_type
        self.target = target
        self.features = features
        self.eval = eval
        self.n_folds = n_folds
        self.seed = seed
        
    def _generate_folds_(self, data, val_ratio=0.2):
        index = np.asarray(data.index)
        n = index.size
        if self.n_folds == 1:
            random_state = np.random.RandomState(self.seed)
            i_valid = random_state.choice(range(n), size=int(np.floor(n * val_ratio)), replace=False)
            i_train = np.setdiff1d(range(n), i_valid, assume_unique=True)
            folds = [(i_train, i_valid)]
        else:
            fold = KFold(n_splits=self.n_folds, shuffle=True, random_state = self.seed)
            folds = []
            for i_train, i_valid in fold.split(np.arange(n)):
                folds.append((i_train, i_valid))
        return folds
    
    def _get_fold_(self, data, fltr):
        train = data.iloc[fltr[0]].reset_index(drop=True)
        valid = data.iloc[fltr[1]].reset_index(drop=True)
        return train, valid
    
    def _get_error_(self, Y, P):
        assert Y.shape[0] == P.shape[0]
        if self.eval == 'logloss':
            error = log_loss(Y, P)
        elif self.eval == 'roc-auc':
            error = roc_auc_score(Y, P)
        else:
            raise Exception('Error: unknown eval = %s' % (self.eval,))
        return error

    def train(self, data):
        print('Training with %s' % (self.train_type,))
        print()
        if self.train_type == 'validation':
            self._train_with_validation_(data)
        elif self.train_type == 'cross-validation':
            self._train_with_cross_validation_(data)
        else:
            raise Exception('Error: unknown train type = %s' % (self.train_type,))
            
    def _train_basic_(self, train, valid):
        X_train, y_train = train[self.features], train[self.target]
        X_valid, y_valid = valid[self.features], valid[self.target]
        self.model.train(X_train, y_train, X_valid, y_valid)
        pred_df = self.model.predict(X_valid)
        error = self._get_error_(y_valid, pred_df)
        print('Error %s: %s' % (self.eval, error))
        print()
        return pred_df, error

    def _train_with_validation_(self, data):
        print('Train with validation...')
        print()
        folds = self._generate_folds_(data)
        train, valid = self._get_fold_(data, folds[0])
        pred_df, error = self._train_basic_(train, valid)
    
    def _train_with_cross_validation_(self, data):
        errors = []
        print('Train with cross-validation...')
        print()
        folds = self._generate_folds_(data)
        print('Cross-validation %d folds' % (self.n_folds,))
        print()
        for i_fold in range(self.n_folds):
            print("Fold = %d / %d" % (i_fold + 1, self.n_folds))
            print()
            train, valid = self._get_fold_(data, folds[i_fold])
            pred_df, error = self._train_basic_(train, valid)
            errors.append(error)
        print('Mean %s error on CV: %s' % (self.eval, np.mean(errors)))
        print()

In [None]:
model = Model_ctbst()

In [None]:
feats = ['symbol_number', 
        'word_number', 
        'line_number', 
        'number_number', 
        'len_1_lines', 
        'len_2_lines', 
        'special_symbols']
target = 'isVacancy'

trainer = Trainer(model=model,  train_type='cross-validation', target=target, 
                  features=feats)

In [None]:
train_test_division = lambda x: (x.iloc[:180], x.iloc[180:])
(train_feat, test_feat) = train_test_division(train_feat.sample(frac=1))

In [None]:
trainer.train(train_feat)

In [None]:
test_prediction = model.predict(test_feat).to_frame()
test_prediction.columns = ['probability']

#print(type(test_feat.isVacancy))

comparison = test_prediction.join(test_feat['isVacancy'])
print(comparison)
roc_auc_score(test_feat['isVacancy'], test_prediction)

In [None]:
model.model.save_model("ClassifierModel",format="cbm", )

# Message recognition by model

In [None]:
#classifier = CatBoostClassifier()
#classifier.load_model("ClassifierModel", format='catboost')

In [None]:
with open("user_messages_clear_w_dt.json",'r') as fin:
    msg_df = pd.read_json(fin, compression=None, orient='table')

In [None]:
msg_df.reset_index(inplace=True)

In [None]:
get_features_vectorized = lambda x: get_features(x, special_symbols)

msg_feat = msg_df.apply(get_features_vectorized, axis=1)

In [None]:
msg_df['probability'] = model.predict(msg_feat)

In [None]:
with open("user_messages_clear_w_dt_p.json.json",'w') as fout:
    msg_df.to_json(fout, compression=None)

# Creating line features dataframe

In [22]:
def get_line_features(msg, special_symbols):
        
    cols = ['id',
            'order_number',
            'symbol_number', 
            'word_number',  
            'number_number', 
            'len_1_lines', 
            'len_2_lines', 
            'special_symbols',
            'msg_probability',
            'isSalary']
    
    lines = msg['message'].splitlines()
    
    lines_feat_df = pd.DataFrame(columns=cols)
    
    for i, line in enumerate(lines):
    
        feat_df = pd.Series(0, index=cols)
    
        feat_df['id'] = msg['index']
        
        feat_df['order_number'] = i
    
        feat_df['symbol_number'] = len(line)
        words = re.split(' |;|,|\*|\n|\t', line)
    
        feat_df['word_number'] = len(words)
        
        feat_df['number_number'] = len(re.findall(r'[0-9]+', line))
    
        feat_df['len_1_lines'] = len([x for x in words if len(x) == 1])
    
        feat_df['len_2_lines'] = len([x for x in words if len(x) == 2])
    
        for symbol in special_symbols:
            feat_df['special_symbols'] += line.upper().count(symbol)
        
        if not np.isnan(msg['salary_line']):
            feat_df['msg_probability'] = 1
        
        if i == msg['salary_line']:
            feat_df['isSalary'] = 1
                
        lines_feat_df = lines_feat_df.append(feat_df, ignore_index=True)
        
   # print(lines_feat_df)
    return lines_feat_df

In [None]:
with open("salary_posts.json",'r') as fin:
    train_true = pd.read_json(fin, compression=None)

In [None]:
with open("non_salary_posts.json",'r') as fin:
    train_false = pd.read_json(fin, compression=None, orient='table')

In [None]:
train_feat = pd.concat([train_true, train_false], ignore_index=True, sort=False)

In [None]:
cols = ['id',
            'order_number',
            'symbol_number', 
            'word_number',  
            'number_number', 
            'len_1_lines', 
            'len_2_lines', 
            'special_symbols',
            'msg_probability',
            'isSalary']

lines_feat_ready = pd.DataFrame(columns=cols)

In [None]:
#get_line_features_vectorized = lambda x: get_line_features(x, special_symbols)
#append_vectorized = lambda x,y: pd.concat([x,y.apply(get_line_features_vectorized, axis=1)], ignore_index=True)

for i in tqdm(train_feat.index):
    lines_feat_cur = get_line_features(train_feat.iloc[i], special_symbols)
    lines_feat_ready = lines_feat_ready.append(lines_feat_cur, ignore_index=True)

In [None]:
with open("lines_salary_dataset.json",'w') as fout:
    lines_feat_ready.to_json(fout, compression=None)

# Salary line recognition via cadboost

In [5]:
with open("lines_salary_dataset.json",'r') as fin:
    lines_feat_ready = pd.read_json(fin, compression=None)

In [6]:
line_model = Model_ctbst()

In [14]:
feats = ['order_number',
         'symbol_number', 
         'word_number',  
         'number_number', 
         'len_1_lines', 
         'len_2_lines', 
         'special_symbols',
         'msg_probability']
target = 'isSalary'

line_trainer = Trainer(model=line_model,  train_type='cross-validation', target=target, 
                  features=feats)

In [15]:
train_test_division = lambda x: (x.iloc[:4*len(x.index)//5], x.iloc[4*len(x.index)//5:])
(train_line_feat, test_line_feat) = train_test_division(lines_feat_ready.sample(frac=1))

In [16]:
line_trainer.train(train_line_feat)

Training with cross-validation

Train with cross-validation...

Cross-validation 4 folds

Fold = 1 / 4

Training Model CatBoost
X_train = (5304, 8) Y_train = (5304,)
X_valid = (1769, 8) Y_valid = (1769,)

Learning rate set to 0.069103
0:	learn: 0.5755696	test: 0.5781020	best: 0.5781020 (0)	total: 114ms	remaining: 3m 48s
10:	learn: 0.1598000	test: 0.1801766	best: 0.1801766 (10)	total: 758ms	remaining: 2m 17s
20:	learn: 0.0995245	test: 0.1249448	best: 0.1249448 (20)	total: 1.54s	remaining: 2m 25s
30:	learn: 0.0789728	test: 0.0966990	best: 0.0966990 (30)	total: 2.43s	remaining: 2m 34s
40:	learn: 0.0716910	test: 0.0835381	best: 0.0835381 (40)	total: 3.19s	remaining: 2m 32s
50:	learn: 0.0687756	test: 0.0786694	best: 0.0786694 (50)	total: 4.04s	remaining: 2m 34s
60:	learn: 0.0666624	test: 0.0752212	best: 0.0752212 (60)	total: 4.94s	remaining: 2m 37s
70:	learn: 0.0638095	test: 0.0716303	best: 0.0716303 (70)	total: 6s	remaining: 2m 43s
80:	learn: 0.0627445	test: 0.0706090	best: 0.0706090 (80)	

140:	learn: 0.0604587	test: 0.0336162	best: 0.0336162 (140)	total: 13.8s	remaining: 3m 2s
150:	learn: 0.0591919	test: 0.0333303	best: 0.0332686 (148)	total: 15.4s	remaining: 3m 8s
160:	learn: 0.0572361	test: 0.0330030	best: 0.0330030 (160)	total: 17.1s	remaining: 3m 15s
170:	learn: 0.0552083	test: 0.0328489	best: 0.0327914 (168)	total: 19s	remaining: 3m 22s
180:	learn: 0.0527561	test: 0.0328081	best: 0.0325502 (175)	total: 20.7s	remaining: 3m 28s
190:	learn: 0.0507811	test: 0.0330578	best: 0.0325502 (175)	total: 22.4s	remaining: 3m 32s
200:	learn: 0.0482795	test: 0.0328705	best: 0.0325502 (175)	total: 24.2s	remaining: 3m 36s
210:	learn: 0.0464626	test: 0.0326817	best: 0.0325502 (175)	total: 25.9s	remaining: 3m 39s
220:	learn: 0.0448368	test: 0.0325392	best: 0.0325392 (220)	total: 27.7s	remaining: 3m 42s
230:	learn: 0.0430615	test: 0.0326069	best: 0.0325392 (220)	total: 29.4s	remaining: 3m 45s
240:	learn: 0.0417079	test: 0.0324517	best: 0.0323527 (234)	total: 31.1s	remaining: 3m 47s
250

Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.04701361132
bestIteration = 318

Shrink model to first 319 iterations.
Top features
order_number: 28.144215125550044
symbol_number: 24.680628001389206
special_symbols: 10.369425378049836
word_number: 10.324310417899447
len_1_lines: 9.445115227689882
len_2_lines: 6.794344131590693
msg_probability: 5.97440323378437
number_number: 4.267558484046486
Predicting Model CatBoost
X = (1768, 8)

Error roc-auc: 0.9611927485734485

Mean roc-auc error on CV: 0.9553179733465296



In [18]:
test_prediction = line_model.predict(test_line_feat).to_frame()
test_prediction.columns = ['probability']

comparison = test_prediction.join(test_line_feat['isSalary'])
print(comparison)
roc_auc_score(test_line_feat['isSalary'], test_prediction)

Predicting Model CatBoost
X = (1769, 10)

      probability  isSalary
4967     0.001112         0
7137     0.024518         0
8398     0.001308         0
6886     0.006759         0
3877     0.000726         0
7504     0.022347         0
6038     0.001412         0
8657     0.000140         0
1200     0.002021         0
7192     0.023611         0
4890     0.000188         0
3041     0.000255         0
7455     0.000151         0
3480     0.000459         0
6358     0.034051         0
1953     0.004370         0
1886     0.032620         0
5726     0.055632         0
5473     0.005013         0
7729     0.030063         0
1081     0.001352         0
6269     0.006277         0
7949     0.000179         0
2404     0.001980         0
1386     0.003395         0
3009     0.003016         0
6729     0.029443         0
6863     0.036362         0
786      0.029874         0
6018     0.020375         0
...           ...       ...
7218     0.013759         0
2736     0.001393         0
3432  

0.945840554592721

# Line recognition by model

In [82]:
with open("user_messages_clear_w_dt_p.json.json",'r') as fin:
    msg_df = pd.read_json(fin, compression=None)

In [83]:
msg_df_recognized_before_ml = msg_df.dropna(subset=['salary_line'])
print(msg_df_recognized_before_ml.shape)

(797, 11)


In [20]:
cols = ['id',
            'order_number',
            'symbol_number', 
            'word_number',  
            'number_number', 
            'len_1_lines', 
            'len_2_lines', 
            'special_symbols',
            'msg_probability',
            'isSalary']

lines_feat_ready = pd.DataFrame(columns=cols)

In [25]:
for i in tqdm(msg_df.index):
    lines_feat_cur = get_line_features(msg_df.iloc[i], special_symbols)
    lines_feat_ready = lines_feat_ready.append(lines_feat_cur, ignore_index=True)

100%|██████████| 2431/2431 [04:36<00:00,  8.79it/s]


In [26]:
lines_feat_ready['probability'] = line_model.predict(lines_feat_ready)

Predicting Model CatBoost
X = (52328, 10)



In [75]:
ids = 0
print(lines_feat_ready['probability'].loc[lines_feat_ready['id'] == ids])
print(msg_df.columns)

0     0.002804
1     0.011396
2     0.024408
3     0.030171
4     0.004773
5     0.024775
6     0.003046
7     0.000715
8     0.000621
9     0.016613
10    0.000751
11    0.000493
12    0.030167
13    0.000383
14    0.000357
15    0.024124
16    0.000549
17    0.000325
18    0.048977
19    0.002949
20    0.000262
21    0.000253
22    0.017933
23    0.000260
24    0.000268
25    0.013738
26    0.000240
27    0.000298
28    0.020788
29    0.000406
        ...   
42    0.000228
43    0.011116
44    0.000241
45    0.000228
46    0.016233
47    0.036146
48    0.000227
49    0.000345
50    0.022912
51    0.001684
52    0.000237
53    0.000287
54    0.064169
55    0.003057
56    0.000185
57    0.043907
58    0.000698
59    0.000414
60    0.035541
61    0.000196
62    0.000655
63    0.106714
64    0.000395
65    0.002935
66    0.021185
67    0.096008
68    0.010863
69    0.037928
70    0.046494
71    0.012778
Name: probability, Length: 72, dtype: float64
Index(['index', 'author', 'link', 'mess

In [96]:
for ids in tqdm(lines_feat_ready['id'].unique()):
    if msg_df.loc[msg_df['index'] == ids].iloc[0]['probability'] > 0.6:
        #print(msg_df.loc[msg_df['index'] == ids].iloc[0]['salary_line'])
        if np.isnan(msg_df.loc[msg_df['index'] == ids].iloc[0]['salary_line']):
            msg_df.loc[(msg_df['index'] == ids),'salary_line'] = lines_feat_ready['order_number'].iloc[lines_feat_ready.loc[(lines_feat_ready['id'] == ids), 'probability'].idxmax]
            line = msg_df.loc[msg_df['index'] == ids].iloc[0]['message'].splitlines()[msg_df.loc[msg_df['index'] == ids].iloc[0]['salary_line'].astype(int)]
            numbers = msg_df.loc[msg_df['index'] == ids, 'numbers'].iloc[0]
            for i, n in enumerate(numbers):
                number_found = re.search(str(n), line)
                if number_found:
                    if number_found.end()<len(line):
                        if not line[number_found.end()].isdigit():
                            msg_df.loc[(msg_df['index'] == ids), 'low_fork'] = n
                            if i + 1 < len(numbers):
                                msg_df.loc[(msg_df['index'] == ids), 'up_fork'] = numbers[i + 1] if (n < numbers[i + 1]) else n
                            else:
                                msg_df.loc[(msg_df['index'] == ids), 'up_fork'] = n
                            break





  0%|          | 0/2431 [00:00<?, ?it/s][A[A[A[A



  1%|          | 20/2431 [00:00<00:12, 199.79it/s][A[A[A[A



  2%|▏         | 39/2431 [00:00<00:12, 196.33it/s][A[A[A[A



  3%|▎         | 62/2431 [00:00<00:11, 203.89it/s][A[A[A[A



  3%|▎         | 81/2431 [00:00<00:11, 196.24it/s][A[A[A[A



  4%|▍         | 101/2431 [00:00<00:11, 196.01it/s][A[A[A[A



  5%|▍         | 119/2431 [00:00<00:12, 189.71it/s][A[A[A[A



  6%|▌         | 138/2431 [00:00<00:12, 189.04it/s][A[A[A[A



  6%|▋         | 156/2431 [00:00<00:12, 184.43it/s][A[A[A[A



  7%|▋         | 174/2431 [00:00<00:12, 175.91it/s][A[A[A[A



  8%|▊         | 195/2431 [00:01<00:12, 183.22it/s][A[A[A[A



  9%|▉         | 213/2431 [00:01<00:12, 174.94it/s][A[A[A[A



 10%|▉         | 231/2431 [00:01<00:12, 173.12it/s][A[A[A[A



 10%|█         | 249/2431 [00:01<00:12, 170.70it/s][A[A[A[A



 11%|█         | 267/2431 [00:01<00:12, 172.30it/s][A[A[A[A



 12%|█

 85%|████████▌ | 2076/2431 [00:13<00:02, 140.58it/s][A[A[A[A



 86%|████████▌ | 2092/2431 [00:13<00:02, 145.17it/s][A[A[A[A



 87%|████████▋ | 2108/2431 [00:13<00:02, 134.87it/s][A[A[A[A



 88%|████████▊ | 2129/2431 [00:13<00:02, 150.74it/s][A[A[A[A



 88%|████████▊ | 2146/2431 [00:13<00:01, 155.10it/s][A[A[A[A



 89%|████████▉ | 2163/2431 [00:14<00:02, 120.85it/s][A[A[A[A



 90%|████████▉ | 2177/2431 [00:14<00:02, 108.56it/s][A[A[A[A



 90%|█████████ | 2196/2431 [00:14<00:01, 121.02it/s][A[A[A[A



 91%|█████████ | 2210/2431 [00:14<00:01, 116.29it/s][A[A[A[A



 91%|█████████▏| 2223/2431 [00:14<00:01, 112.70it/s][A[A[A[A



 92%|█████████▏| 2238/2431 [00:14<00:01, 120.68it/s][A[A[A[A



 93%|█████████▎| 2251/2431 [00:14<00:01, 119.55it/s][A[A[A[A



 93%|█████████▎| 2264/2431 [00:14<00:01, 120.78it/s][A[A[A[A



 94%|█████████▎| 2277/2431 [00:15<00:01, 106.78it/s][A[A[A[A



 95%|█████████▍| 2299/2431 [00:15<00:01, 126.01i

In [98]:
msg_df_recognized = msg_df.dropna(subset=['salary_line', 'low_fork'])
print(msg_df_recognized.shape)

(929, 11)


In [99]:
with open("user_messages_recognized.json",'w') as fout:
    msg_df_recognized.to_json(fout)

# Getting positions

In [105]:
with open("user_messages_recognized.json",'r') as fin:
    vacancy_df = pd.read_json(fin)

In [107]:
positions = {'Intern': ['intern', 'стажер', 'интерн', 'стажировк'],
             'Junior': ['junior', 'джун', 'джуниор'],
             'Middle': ['middle', 'мидл'],
             'Senior': ['senior', 'сеньор']}

In [121]:
def get_position(message, positions):
    
    pos = 'Unknown'
    flag = False
    
    for position in positions:
        for word in positions[position]:
            if word in message.lower():
                if flag:
                    return 'Unknown'
                else:
                    flag = True
                    pos = position
    return pos

In [122]:
get_position_vectorized = lambda x: get_position(x, positions)
vacancy_df['position'] = vacancy_df['message'].apply(get_position_vectorized)

In [126]:
print(vacancy_df.shape)

(929, 12)


In [125]:
with open("user_messages_recognized.json",'w') as fout:
    vacancy_df.to_json(fout)