# Libs & Settings

In [1]:
import re
import sys
import locale
import warnings
import numpy as np
import pandas as pd
from ncls import NCLS
from pathlib import Path
from tqdm.auto import tqdm
from functools import reduce
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Tuple, List, Optional

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Make the project's helper modules importable: they live in the "utils"
# directory that sits beside the current working directory.
utils_dir = Path.cwd().parent / "utils"
module_path = str(utils_dir)
# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local helpers used by the rest of the notebook.
from processing import (
    count_months,
    get_job_rank,
    hh_job_preparation,
    parse_resume,
    process_time,
)

In [4]:
# Input data location: the "data" directory that sits beside the current
# working directory (i.e. <cwd>/../data).
data_folder = Path.cwd().parent / 'data'

# Parsing Data

In [7]:
# Resume files to parse. Path.glob yields entries in arbitrary,
# filesystem-dependent order, and user_id below is derived from the position
# in this list, so sort to keep the id assignment reproducible between runs.
files = sorted(data_folder.glob('*.html'))
if not files:
    # Fail early with an explicit message instead of an opaque error from an
    # empty concatenation further down.
    raise FileNotFoundError(f"No '*.html' resume files found in {data_folder}")

# Build one dataframe of jobs per resume.
job_frames = [
    pd.DataFrame(
        data=parse_resume(
            file=file,
            user_id=user_id,
            name_process_func=hh_job_preparation  # string processing function; swap it here if needed
        ),
        columns=['user_id', 'month_cnt', 'start_date', 'end_date', 'job_name', 'job_desc']
    )
    for user_id, file in enumerate(tqdm(files, 'Read and process resume'))
]

# Concatenate all frames in a single call: pairwise concatenation would copy
# the accumulated frame again for every file. Row indices are kept as produced
# per resume, exactly as with repeated pairwise concatenation.
df = pd.concat(job_frames)

Read and process resume:   0%|          | 0/1001 [00:00<?, ?it/s]

In [10]:
# Save the parsed jobs so the slow parsing step can be skipped on later runs (uncomment to enable).
# df.to_pickle('parsed.pickle')

# Process Data

In [5]:
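# Load previously saved jobs instead of re-parsing the resumes (uncomment to enable; only unpickle files you created yourself).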
# df = pd.read_pickle('parsed.pickle')

In [8]:
# Run the project's job-ranking helper on the parsed jobs.
# NOTE(review): judging by the displayed result, the returned frame carries the
# input columns plus job_level_simple and job_level_intersect — confirm against
# get_job_rank in utils/processing.
df_rank = get_job_rank(df=df)

Processing users:   0%|          | 0/1001 [00:00<?, ?it/s]

In [9]:
# Preview the first rows of the ranked jobs rather than dumping the whole frame.
df_rank.head()

Unnamed: 0,user_id,month_cnt,start_date,end_date,job_name,job_desc,job_level_simple,job_level_intersect
0,0,68,2018-04-01,-1,специалист сервисно-монтажной службы,"Сборка, ремонт устройств селскохозяйственного ...",9,7
1,0,8,2017-09-01,2018-04-01,менеджер,"Совершение холодных звонков, поиск клиентов, з...",8,6
2,0,3,2016-08-01,2016-10-01,продавец-консультант,Консультирование и обслуживание покупателей.\n...,7,5
3,0,12,2015-08-01,2016-07-01,менеджер товароучета и хранения,"Работа с клиентами, обработка заявок в програм...",6,4
4,0,6,2013-11-01,2014-04-01,заведующий складом в магазине одежды,"Руководство работой склада по приему, хранению...",5,3
