# Libs & Settings

In [2]:
import re
import sys
import time
import locale
import warnings
import numpy as np
import pandas as pd
from ncls import NCLS
from pathlib import Path
from rapidfuzz import fuzz
from tqdm.auto import tqdm
from functools import reduce
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Tuple, List, Optional
from sentence_transformers import SentenceTransformer, util

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
# Import developer modules.
module_path = str(Path.cwd().parent / "utils")
if module_path not in sys.path:
    sys.path.append(module_path)

from processing import (
    process_time,
    count_months,
    parse_resume,
    hh_job_preparation,
    get_job_rank
)

In [5]:
# Define data folder.
data_folder = Path().cwd().parent / 'data'

# Parsing Data

In [34]:
# File path generator.
files = data_folder.glob('*.html')
# Collect jobs from all resumes.
df = reduce(
    # Concatenate all dataframes together.
    lambda x, y: pd.concat([x, y]),
    # Generate a list of dataframes with jobs.
    [
        pd.DataFrame(
            data=parse_resume(
                file=file, 
                user_id=id,
                # name_process_func=hh_job_preparation # put here string processing function
            ),
            columns=['user_id', 'month_cnt', 'start_date', 'end_date', 'job_name', 'job_desc']
        ) for id, file in enumerate(tqdm(list(files), 'Read and process resume'))
    ]
)

Read and process resume:   0%|          | 0/1001 [00:00<?, ?it/s]

In [35]:
# Save data.
# df.to_pickle('parsed.pickle')

# Process Data

In [5]:
# Load data.
# df = pd.read_pickle('parsed.pickle')

In [36]:
df_rank = get_job_rank(df=df)

Processing users:   0%|          | 0/1001 [00:00<?, ?it/s]

In [37]:
df_rank.head()

Unnamed: 0,user_id,month_cnt,start_date,end_date,job_name,job_desc,job_level_simple,job_level_intersect
0,0,68,2018-04-01,-1,Специалист сервисно-монтажной службы,"Сборка, ремонт устройств селскохозяйственного ...",9,7
1,0,8,2017-09-01,2018-04-01,Менеджер по поиску клиентов,"Совершение холодных звонков, поиск клиентов, з...",8,6
2,0,3,2016-08-01,2016-10-01,Продавец-консультант,Консультирование и обслуживание покупателей.\n...,7,5
3,0,12,2015-08-01,2016-07-01,Менеджер товароучета и хранения,"Работа с клиентами, обработка заявок в програм...",6,4
4,0,6,2013-11-01,2014-04-01,Заведующий складом в магазине одежды,"Руководство работой склада по приему, хранению...",5,3


# Clusterization

In [104]:
def clusterize(model: object, job_name_list: list, **kwargs) -> List[list]:
    """Клсастеризация набора должностей."""
    # Apply encoding.
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(
        job_name_list, 
        batch_size=64, 
        show_progress_bar=True, 
        convert_to_tensor=True
    )
    # Get clusters
    print("Start clustering")
    start_time = time.time()
    #Two parameters to tune:
    #min_cluster_size: Only consider cluster that have at least 25 elements
    #threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
    clusters = util.community_detection(corpus_embeddings, **kwargs)
    print("Clustering done after {:.2f} sec".format(time.time() - start_time))
    return clusters

In [116]:
def get_clusters(df, model, **kwargs):
    """Кластеризауия по всем уровням позиций"""
    _df_cluster_list = []
    with warnings.catch_warnings():
        # Irnore pandas warnings.
        warnings.filterwarnings("ignore")
        for i in df.job_level_intersect.unique():
            # Select level job names.
            _df = df[df.job_level_intersect == i]
            job_name_list = list(_df.job_name)
            print('Job level:', i)
            cluster = clusterize(model, job_name_list, **kwargs)
            _df['cluster'] = -1
            for num, clst in enumerate(cluster):
                for idx in clst:
                    _df.loc[_df.index == idx, 'cluster'] = f'Cluster_{num}_level{i}'
            _df_cluster_list.append(_df)

    return reduce(lambda x, y: pd.concat([x, y]), _df_cluster_list)

In [None]:
# Load model.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [117]:
data_cluster = get_clusters(df_rank, model, min_community_size=25, threshold=0.75)

Job level: 7
Encode the corpus. This might take a while


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.01 sec
Job level: 6
Encode the corpus. This might take a while


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 5
Encode the corpus. This might take a while


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.02 sec
Job level: 4
Encode the corpus. This might take a while


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.03 sec
Job level: 3
Encode the corpus. This might take a while


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.06 sec
Job level: 2
Encode the corpus. This might take a while


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.10 sec
Job level: 1
Encode the corpus. This might take a while


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.29 sec
Job level: 8
Encode the corpus. This might take a while


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.01 sec
Job level: 10
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 9
Encode the corpus. This might take a while


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 13
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 12
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 11
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 15
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 14
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 21
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 20
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 19
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 18
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 17
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Job level: 16
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec


In [119]:
data_cluster

Unnamed: 0,user_id,month_cnt,start_date,end_date,job_name,job_desc,job_level_simple,job_level_intersect,cluster
0,0,68,2018-04-01,-1,Специалист сервисно-монтажной службы,"Сборка, ремонт устройств селскохозяйственного ...",9,7,-1
1,8,6,2015-10-01,2016-03-01,Зам.директора по транспорту и логистике,Управление штатом водителей и диспетчером;\n\n...,8,7,-1
1,19,19,2019-08-01,2021-02-01,Тракторист-машинист,"Работа на бульдозере CatD6NXL, CatD6R. Планиро...",9,7,-1
3,21,5,2014-04-01,2014-08-01,Агроном,Организация производства сельскохозяйственной ...,7,7,-1
3,28,24,2018-09-01,2020-08-01,Дорожный рабочий,Обслуживание дорог федерального и местного зна...,7,7,-1
...,...,...,...,...,...,...,...,...,...
4,793,6,2009-10-01,2010-03-01,Машинист трубоукладчика 5 разряда,Ведение технологического процесса по трубоукла...,17,17,-1
0,936,56,2019-04-01,-1,Главный ветеринарно-санитарный врач,"Организация ветеринарно-санитарной экспертизы,...",17,17,-1
5,544,14,2007-10-01,2008-11-01,Агроном,Закупка посадочного материала и рулонного газо...,16,16,-1
5,793,12,2008-05-01,2009-04-01,Машинист трубоукладчика 5 разряда,Ведение технологического процесса по трубоукла...,16,16,-1


# Cluster summarization

In [41]:
temp = list(set([corpus_sentences[c] for c in clusters[0]]))

In [42]:
def compress_jobs(jobs_list, threshold, element=0):
    indexes = []
    for j in range(element+1, len(jobs_list)):
        score = fuzz.token_set_ratio(jobs_list[element], jobs_list[j])
        if score >= threshold:
            indexes.append(j)
    for index in sorted(indexes, reverse=True):
        del jobs_list[index]

In [53]:
# compress_jobs(jobs_list=temp, threshold=60, element=5)

In [58]:
all_jobs = []
for n in range(len(clusters)):
    jobs_list = list(set([corpus_sentences[i] for i in clusters[n]]))
    length = len(jobs_list)
    j= 0
    while len(jobs_list) - length != 0:
        compress_jobs(jobs_list=jobs_list, threshold=60, element=j)
        length = len(jobs_list)
        j =+ 1
    all_jobs.append(jobs_list)

In [59]:
all_jobs

[['Заместитель главного ветеринарного врача',
  'ветеринарный фельдшер',
  'Ветеринарный врач отделения',
  'Стажер ветеринарного врача',
  'Ветеринарный врач, Управляюший МТФ',
  'Ассистент ветеринарного врача',
  'старший ветеринарный врач',
  'участковый ветеринарный врач',
  'Ветеринарный фельдшер',
  'Асистент ветеринарного врача',
  'Ветеринарный врач эпизоотолог',
  'Ассистент ветеринарного врача ( стажёр)',
  'главный ветеринарный врач',
  'Ветеринарный врач,Главный ветеринарный врач',
  'Помощник ветеринарного врача',
  'Ветеринар',
  'Менеджер - ветеринарный врач',
  'Ветеринарный врач',
  'Старший ветеринарный врач',
  'Ветеринарный врач откорма',
  'Помощник старшего ветеринарного врача',
  'Ветеринарный врач-серолог',
  'ветеринарный врач',
  'Главный ветеринарный врач',
  'Помощник главного ветеринарного врача',
  'врач ветеринарной медицины',
  'Врач ветеринарной медицины',
  'Ветеринарный санитар'],
 ['механизатор-водитель',
  'Заведующий машинно-тракторной мастерской',

# References

1. [Sentence Transformers](https://github.com/UKPLab/sentence-transformers): Multilingual Sentence, Paragraph, and Image Embeddings using BERT & Co.
2. [Fast clustering](https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/clustering/fast_clustering.py) implementation.
3. [Rapid fuzzy string matching ](https://github.com/maxbachmann/RapidFuzz) in Python and C++ using the Levenshtein Distance.
4. [Gradio](https://www.gradio.app/guides/quickstart)
5. [Streamsync](https://www.streamsync.cloud/getting-started.html)
6. [head_hunter job positions clustering](https://www.kaggle.com/code/bogdankishchak/head-hunter-job-positions-clustering-in-progress)