# Libs & Settings

In [1]:
import re
import sys
import time
import locale
import warnings
import numpy as np
import pandas as pd
from ncls import NCLS
from pathlib import Path
from rapidfuzz import fuzz
from tqdm.auto import tqdm
from functools import reduce
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Tuple, List, Optional
from sentence_transformers import SentenceTransformer, util

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Make the "utils" directory (a sibling of the current working directory)
# importable, so the developer modules imported below can be resolved.
module_path = str(Path.cwd().parent.joinpath("utils"))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from processing import (
    process_time,
    count_months,
    parse_resume,
    hh_job_preparation,
    get_job_rank
)

In [4]:
# Input data lives in the "data" directory next to the current working directory.
data_folder = Path.cwd().parent.joinpath('data')

# Parsing Data

In [5]:
# Resume files to parse. They are sorted because glob order is not guaranteed,
# and the position of a file in this list becomes its user_id; sorting keeps
# the ids reproducible between runs and machines.
files = sorted(data_folder.glob('*.html'))
if not files:
    # Fail early with an actionable message instead of a cryptic error later.
    raise FileNotFoundError(f'No *.html resume files found in {data_folder}')

# Columns of the per-resume job table built from parse_resume's output.
job_columns = ['user_id', 'month_cnt', 'start_date', 'end_date', 'job_name', 'job_desc']

# Build one dataframe of jobs per resume.
resume_frames = [
    pd.DataFrame(
        data=parse_resume(
            file=file,
            user_id=user_id,
            name_process_func=hh_job_preparation  # put here string processing function
        ),
        columns=job_columns
    )
    for user_id, file in enumerate(tqdm(files, 'Read and process resume'))
]

# Concatenate all frames in a single call. Pairwise concatenation would
# re-copy the accumulated frame on every step. The per-resume index is kept
# as-is (no ignore_index), exactly as before.
df = pd.concat(resume_frames)

Read and process resume:   0%|          | 0/1001 [00:00<?, ?it/s]

In [6]:
# Save data.
# df.to_pickle('parsed.pickle')

# Process Data

In [7]:
# Load data.
# df = pd.read_pickle('parsed.pickle')

In [8]:
# Run the project helper on the parsed jobs. Judging by the preview of the
# result, it adds the job_level_simple and job_level_intersect columns.
df_rank = get_job_rank(df=df)

Processing users:   0%|          | 0/1001 [00:00<?, ?it/s]

In [9]:
# Preview the first rows of the ranked jobs table.
df_rank.head()

Unnamed: 0,user_id,month_cnt,start_date,end_date,job_name,job_desc,job_level_simple,job_level_intersect
0,0,68,2018-04-01,-1,специалист сервисно-монтажной службы,"Сборка, ремонт устройств селскохозяйственного ...",9,7
1,0,8,2017-09-01,2018-04-01,менеджер,"Совершение холодных звонков, поиск клиентов, з...",8,6
2,0,3,2016-08-01,2016-10-01,продавец-консультант,Консультирование и обслуживание покупателей.\n...,7,5
3,0,12,2015-08-01,2016-07-01,менеджер товароучета и хранения,"Работа с клиентами, обработка заявок в програм...",6,4
4,0,6,2013-11-01,2014-04-01,заведующий складом в магазине одежды,"Руководство работой склада по приему, хранению...",5,3


# Clustering

In [10]:
def clusterize(model: object, job_name_list: list, **kwargs) -> List[list]:
    """Cluster a collection of job titles.

    The titles are first encoded with ``model.encode`` and the resulting
    embeddings are then passed to ``util.community_detection``.

    Args:
        model: Object exposing an ``encode`` method; it is called with the job
            titles plus ``batch_size=64``, ``show_progress_bar=True`` and
            ``convert_to_tensor=True``.
        job_name_list: Job titles to encode and cluster.
        **kwargs: Forwarded unchanged to ``util.community_detection``. The
            notebook uses ``min_community_size`` (smallest cluster to keep)
            and ``threshold`` (cosine-similarity cut-off for "similar").

    Returns:
        The value returned by ``util.community_detection``. The caller in this
        notebook treats it as a list of clusters, each a list of positions
        into ``job_name_list``.
    """
    # Step 1: turn the job titles into embeddings.
    print("Encode the corpus. This might take a while")
    encode_options = dict(
        batch_size=64,
        show_progress_bar=True,
        convert_to_tensor=True,
    )
    embeddings = model.encode(job_name_list, **encode_options)

    # Step 2: detect communities and report how long that took.
    print("Start clustering")
    started_at = time.time()
    communities = util.community_detection(embeddings, **kwargs)
    elapsed = time.time() - started_at
    print("Clustering done after {:.2f} sec".format(elapsed))
    return communities

In [32]:
def get_clusters(df, model, **kwargs):
    """Cluster job titles separately within every job level.

    Args:
        df: DataFrame with ``job_level_intersect`` and ``job_name`` columns.
            It is not modified.
        model: Embedding model handed to ``clusterize``.
        **kwargs: Forwarded unchanged to ``clusterize``.

    Returns:
        A tuple ``(clustered, last_clusters)``:

        * ``clustered`` - the per-level frames concatenated in the order the
          levels first appear in ``df``. Each level's rows are re-indexed from
          0, so index values repeat across levels. A ``cluster`` column holds
          ``'Cluster_{n}_level{level}'`` for clustered rows and ``-1`` for
          rows that belong to no cluster.
        * ``last_clusters`` - the clusters returned by ``clusterize`` for the
          LAST processed level only (kept for backward compatibility). It is
          an empty list when ``df`` has no rows.
    """
    level_frames = []
    # Pre-initialised so that an empty input does not raise on return.
    cluster = []
    with warnings.catch_warnings():
        # Ignore warnings raised while encoding / clustering a level.
        warnings.filterwarnings("ignore")
        for i in df.job_level_intersect.unique():
            print('Job level:', i)
            # Rows of this level, re-indexed from 0 so that the positions
            # returned by clusterize line up with the index labels.
            _df = df[df.job_level_intersect == i].reset_index(drop=True)
            cluster = clusterize(model, list(_df.job_name), **kwargs)
            # Object dtype lets the -1 sentinel and the string labels share
            # one column without relying on an implicit int -> object upcast.
            _df['cluster'] = pd.Series(-1, index=_df.index, dtype=object)
            for num, clst in enumerate(cluster):
                # Label all members of the cluster in one vectorised step.
                _df.loc[_df.index.isin(clst), 'cluster'] = f'Cluster_{num}_level{i}'
            level_frames.append(_df)

    if not level_frames:
        # Nothing to cluster: return an empty frame that has the extra column.
        return df.assign(cluster=pd.Series(dtype=object)), cluster
    # Single concat instead of pairwise reduction; indexes are kept as-is.
    return pd.concat(level_frames), cluster

In [33]:
# Load the sentence-embedding model that is passed to get_clusters below.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [38]:
# Cluster job titles within each job level. The keyword arguments are passed
# through to util.community_detection: min_community_size is the smallest
# cluster to keep, threshold is the cosine-similarity cut-off for "similar".
# NOTE: clst holds the clusters of the last processed job level only.
data_cluster, clst = get_clusters(df_rank, model, min_community_size=25, threshold=0.75)

Encode the corpus. This might take a while


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.01 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.01 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.03 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.08 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.12 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.14 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.37 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec
Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec


# Cluster summarization

In [39]:
def compress_jobs(jobs_list, threshold, element=0):
    """Remove, in place, later entries that are similar to one anchor entry.

    Every entry positioned after ``element`` is scored against
    ``jobs_list[element]`` with ``fuzz.token_set_ratio``; entries whose score
    is greater than or equal to ``threshold`` are removed from ``jobs_list``.

    Args:
        jobs_list: List of job titles; modified in place.
        threshold: Minimum score at which an entry is considered a duplicate.
        element: Position of the anchor entry that the others are compared to.

    Returns:
        None.
    """
    # Positions of the entries that are too similar to the anchor.
    duplicates = {
        pos
        for pos in range(element + 1, len(jobs_list))
        if fuzz.token_set_ratio(jobs_list[element], jobs_list[pos]) >= threshold
    }
    # Slice assignment keeps the same list object, so the caller sees the change.
    jobs_list[:] = [job for pos, job in enumerate(jobs_list) if pos not in duplicates]

In [58]:
# Job titles of the rows labelled 'Cluster_0_level1', used as a worked
# example for the compression step.
temp = data_cluster[data_cluster.cluster == 'Cluster_0_level1'].job_name.to_list()

In [59]:
# TODO: function to iteratively throw away similar jobs
# For now the anchor positions are hand-picked and applied one after another;
# each call shrinks `temp` in place before the next one runs.
for anchor in (0, 1, 3):
    compress_jobs(jobs_list=temp, threshold=70, element=anchor)

In [60]:
# Show the job titles that remain after the compression calls.
temp

['главный ветеринарный врач',
 'стажер ветеринарного врача',
 'менеджер-ветеринарный врач',
 'врач гениколог',
 'ветеринар']

# References

1. [Sentence Transformers](https://github.com/UKPLab/sentence-transformers): Multilingual Sentence, Paragraph, and Image Embeddings using BERT & Co.
2. [Fast clustering](https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/clustering/fast_clustering.py) implementation.
3. [Rapid fuzzy string matching](https://github.com/maxbachmann/RapidFuzz) in Python and C++ using the Levenshtein Distance.
4. [Gradio](https://www.gradio.app/guides/quickstart)
5. [Streamsync](https://www.streamsync.cloud/getting-started.html)
6. [head_hunter job positions clustering](https://www.kaggle.com/code/bogdankishchak/head-hunter-job-positions-clustering-in-progress)