Загружем данные, достаём навыки из вакансий и матчим вакансии с работами в резюме

In [1]:
import json
from typing import List, Tuple, Sequence
import pickle
from tqdm import tqdm_notebook as tqdm
import pymorphy2
import nltk
from cytoolz import pipe
from collections import Counter
import re
import theano
from collections import defaultdict
from collections import Counter

In [2]:
morph = pymorphy2.MorphAnalyzer()
stopwords = nltk.corpus.stopwords.words(
    'russian') + nltk.corpus.stopwords.words('english')
stopwords += [
    'отличный', 'метр', 'наш', 'клиент', 'банка', 'проект', 'литр',
    'желательный', 'др', 'самый', 'мочь', 'хороший', 'год', 'чел', 'обязательный'
]

cache = {}


def remove_numbers(text):
    return re.sub(r'\d+', '', text)


def _get_POS(word):
    if word not in cache:
        cache[word] = morph.parse(word)[0]
    return cache[word].tag.POS


def normal_form(word):
    if word not in cache:
        cache[word] = morph.parse(word)[0]
    return cache[word].normal_form


def is_word_pos_in(word: str, pos: List[str] = None) -> bool:
    if not pos:
        pos = ['NOUN', "ADJF", 'INFN', 'VERB', 'ADJS']

    return _get_POS(word) in pos


def get_words(text):
    return re.findall(r'\w+', text)


def nonempty(x):
    if isinstance(x, Sequence):
        return filter(lambda x: len(x) > 0 and x != ' ', x)
    return x


helper = {}


def remove_numbers(text):
    return re.sub(r'\d+', '', text)


def normalize_skill(skill: str):
    parsed = tuple(
        pipe(
            skill,
            lambda x: x.lower(),
            remove_numbers,
            get_words,
        ))

    clear_skill = []
    dirty_skill = []

    # Последнее стоп слово для dirty_skill
    last_stopword = None

    # Для каждого слова в скилле
    for i in parsed:
        # Нормализуем слово
        word = normal_form(i)

        # Если стоп слово - запомним его
        if word in nltk.corpus.stopwords.words('russian'):
            last_stopword = word

            if word == "без":
                clear_skill.append(word)

        # Проверим на часть речи, длинну и стоплова
        elif is_word_pos_in(word) and len(word) > 3 and word not in stopwords:

            # Если до этого было стоп слово, добавим его в dirty
            if last_stopword and len(dirty_skill) > 0:
                dirty_skill.append(last_stopword)
                last_stopword = None

            # Добавим в чистый скилл слово
            clear_skill.append(word)

            if is_word_pos_in(word, ['NOUN', 'ADJF']):
                dirty_skill.append(i)

    if len(clear_skill) > 1 and len(clear_skill) < 8:
        return clear_skill, dirty_skill

    else:
        return []

In [3]:
def split_skill(a):
    '''Рассказ о программистах или о жизни -> рассказ о программистах, рассказ о жизни'''
    main_skill = a

    def repl(x): return x.replace(',', '|').replace(
        ' или ', '|').replace(' и ', '|').replace('/', '|').split('|')

    first_match = re.match(r'([а-яА-ЯA-Za-z\-\s]*([,и\/]|или)\s)', a)

    if first_match:
        first_match = first_match.group()
        a = repl(a.replace(first_match, ''))

        for i in a:
            if len(i.split()) > 4:
                return repl(main_skill)

        variants = first_match.replace(',', ' ').replace(
            ' или ', ' ').replace(' и ', ' ').replace('/', ' ')
        variants = list(filter(len, variants.split(' ')))
        if len(variants) > 1:
            main_phrase = variants[:-1]
            a.append(variants[-1])

            skills = []

            for i in a:
                skills.append(" ".join(main_phrase + [i]))
            return skills
        else:
            return a + variants
    else:
        return [a]

In [4]:
def split_into_skills(text: str) -> List[str]:
    def _split_into_skills(x):
        x = re.sub(r'([А-ЯA-Z])', r'\n\1', text)
        return re.split(r'[\n\.,]', x)

    pre_skills = list(nonempty(_split_into_skills(text)))
    done_skills = []

    for skill in pre_skills:
        skill = skill.replace('/', ' / ').replace("\xa0", " ")
        if len(re.findall("\([а-яА-ЯA-Za-z\-\s]*\)", skill)) > 0:
            skill1 = re.findall(r'\([а-яА-ЯA-Za-z\-\s]*\)', skill)[0]
            skill2 = re.findall(r'[а-яА-ЯA-Za-z\-\s]*\(', skill)[0][:-1]
            done_skills.extend([skill1, skill2])
        elif len(re.findall("([а-яА-ЯA-Za-z\-\s]*([и\/]|или)\s[а-яА-ЯA-Za-z\-])", skill)) > 0:
            new_skills = split_skill(skill)
            if new_skills:
                done_skills.extend(new_skills)

        else:
            done_skills.append(skill)
    return done_skills

In [5]:
from joblib import delayed, Parallel

def parallel(f, data):
    """Run parallel your func on all CPU"""
    
    return Parallel(n_jobs=-1, verbose=3, max_nbytes='1G')(delayed(f)(x) for x in data)

In [6]:
def load_resume_vacancy() -> Tuple[List[dict], List[dict]]:
    """Loads JSON data of SuperJob"""

    def load_json_by_lines(file):
        tmp = []
        
        with open(file) as f:
            for line in tqdm(f.read().split("\n")):
                if len(line) > 0:
                    tmp.append(json.loads(line))
        return tmp
    
    resume = load_json_by_lines("data/resume.json")
    vacancy = load_json_by_lines("data/vacancy.json")
    
    return resume, vacancy

In [None]:
resume, vacancy = load_resume_vacancy()

In [None]:
pickle.dump(resume, open("resume.pck", "wb"))
pickle.dump(vacancy, open("vacancy.pck", "wb"))

In [7]:
resume = pickle.load(open("resume.pck", "rb"))
vacancy = pickle.load(open("vacancy.pck", "rb"))

In [8]:
def add_skills(vac):
    vac = vac.copy()
    skills = split_into_skills(vac['candidat'])

    clear_skills, dirty_skills = [], []
    
    for skill in skills:
        normalized = normalize_skill(skill)
        
        if len(normalized) > 0:
            clear_skills.append(" ".join(normalized[0]))
            dirty_skills.append(" ".join(normalized[1]))
        
    vac['clear_skills'] = clear_skills
    vac['dirty_skills'] = dirty_skills
    
    return vac

In [None]:
data = []

for vac in tqdm(vacancy):
    data.append(add_skills(vac))

In [None]:
pickle.dump(data, open("vacancy_with_skills.pck", "wb"))

In [251]:
vacancy = pickle.load(open("vacancy_with_skills.pck", "rb"))

In [10]:
themes = defaultdict(lambda: [])
dirty_to_normal = {}

for d in data:
    themes[d['profession_tree_name']].extend(d['dirty_skills'])

In [11]:
works = []

for i in resume:
    works.extend(i['work_history'])

In [None]:
words = set()

In [None]:
for work in tqdm(works):
    work_words = list(filter(lambda x: x not in stopwords, get_words(remove_numbers(work['work'].lower()))))
    words |= set(work_words)

In [None]:
for vac in tqdm(vacancy):
    vac_words = list(filter(lambda x: x not in stopwords, get_words(remove_numbers(vac['work'].lower()))))
    words |= set(vac_words)

In [None]:
pickle.dump(words, open("words.pck", "wb"))

In [12]:
words = pickle.load(open("words.pck", "rb"))

In [13]:
import gensim
import numpy as np

In [14]:
model = gensim.models.KeyedVectors.load_word2vec_format("ruwikiruscorpora_upos_cbow_300_20_2017.bin.gz", binary=True)

In [15]:
tmp_vector = np.array([0] * 300, dtype=np.float32)

def _word2vec(word):
    for i in ["_NOUN", "_ADJ", "_VERB"]:
        tmp = "{}{}".format(word, i)
        
        if tmp in model:
            return model[tmp]
        else:
            return tmp_vector

In [None]:
cahce = {}

for word in tqdm(words):
    normal_form = morph.parse(word)[0].normal_form
    cahce[word] = _word2vec(normal_form)
    

In [None]:
pickle.dump(cahce, open("cahce.pck", "wb"))

In [16]:
cahce = pickle.load(open("cahce.pck", "rb"))

In [None]:
for work in tqdm(works):
    words = list(filter(lambda x: x not in stopwords, get_words(remove_numbers(work['work'].lower()))))
    work['vec'] = np.mean([cahce[word] for word in words], axis=0)

In [None]:
pickle.dump(works, open("works_with_vecs.pck", "wb"))

In [183]:
works = pickle.load(open("works_with_vecs.pck", "rb"))

In [None]:
for vac in tqdm(vacancy):
    words = list(filter(lambda x: x not in stopwords, get_words(remove_numbers(vac['work'].lower()))))
    vac['vec'] = np.mean([cahce[word] for word in words], axis=0)

In [None]:
pickle.dump(vacancy, open("vacancy_with_vecs.pck", "wb"))

In [252]:
vacancy2 = pickle.load(open("vacancy_with_vecs.pck", "rb"))

In [253]:
vacancy_by_name = {vac['id']: vac for vac in vacancy}

In [254]:
for vac in vacancy2:
    vacancy_by_name[vac['id']]['vac'] = vac['vec']

In [344]:
pickle.dump(vacancy_by_name, open("vacancy_by_name.pck", "wb"))

In [255]:
vac_text_by_theme = defaultdict(lambda: [])

for vac in tqdm(vacancy_by_name):
    vac = vacancy_by_name[vac]
    vac_text_by_theme[vac['profession_tree_name']].append(list(
        map(normal_form, filter(lambda x: x not in stopwords, get_words(remove_numbers(vac['work'].lower()))))))

A Jupyter Widget




In [24]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [42]:
uniq_words = set()

for i in vac_text_by_theme:
    for vac in vac_text_by_theme[i]:
        uniq_words |= set(vac)

In [53]:
def get_tfidf(theme):
    tmp = []

    for i in vac_text_by_theme[theme]:
        tmp.extend(i)

    tf = Counter(tmp)

    for j in tf:
        tf[j] /= len(tmp)

    for word in tf:
        idf = np.log(len(vac_text_by_theme[theme])/ sum([word in doc for doc in vac_text_by_theme[theme]]))
        tf[word] *= idf

    return tf

In [54]:
themes = set()

for vac in vacancy:
    themes |= set([vac['profession_tree_name']])

In [56]:
tfidf_by_theme = {}

for theme in tqdm(themes):
    tfidf_by_theme[theme] = get_tfidf(theme)

A Jupyter Widget




In [58]:
pickle.dump(tfidf_by_theme, open("tfidf_by_theme.pck", "wb"))

In [61]:
tfidf_by_theme['Менеджер по продажам'].most_common(10)

[('работа', 0.022307421858022796),
 ('продажа', 0.021950147772064409),
 ('ведение', 0.020618958264782841),
 ('компания', 0.019961008935604898),
 ('клиент', 0.01944525647462917),
 ('контроль', 0.019364919817266072),
 ('база', 0.018261795390999546),
 ('договор', 0.017776226844607307),
 ('проведение', 0.017389752815440362),
 ('услуга', 0.017233408579110175)]

In [195]:
for work in tqdm(works):
    text = list(map(normal_form, filter(lambda x: x not in stopwords, get_words(remove_numbers(work['work'].lower())))))
    work['clear'] = text

A Jupyter Widget




In [196]:
pickle.dump(works, open("works_with_vecs.pck", "wb"))

In [105]:
for theme in tfidf_by_theme:
    g = {}

    for x in tfidf_by_theme[theme].most_common():
        if len(x) > 1:
            g[x[0]] = x[1]
    
    tfidf_by_theme[theme] = g

In [191]:
a = 1
for work in tqdm(works):
    work['id'] = a
    a += 1

A Jupyter Widget




In [206]:
most = {}

def f(work):
    matrix = defaultdict(lambda: {})
    text = Counter(work['clear'])
    
    for theme in tfidf_by_theme:
        for word in text:
            if word in tfidf_by_theme[theme]:
                matrix[theme][word] = text[word] * tfidf_by_theme[theme][word]
            else:
                matrix[theme][word] = 0
            
    for key in matrix:
        matrix[key] = sum([matrix[key][val] for val in matrix[key]])
    
    return work['id'], Counter(matrix).most_common()

In [207]:
data = parallel(f, works)

[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 3096 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 10776 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 21528 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 35352 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 52248 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 72216 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done 95256 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 121368 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 150552 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 182808 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 218136 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 256536 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 283440 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 305712 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 329520 t

In [208]:
data = {x[0]: x[1] for x in data}

In [209]:
pickle.dump(data, open("data.pck", "wb"))

In [211]:
list(data.keys())[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [223]:
a = 0

for work in tqdm(works):
    try:
        work['profession_tree_name'] = Counter(data[work['id']]).most_common(1)[0][0]
    except:
        work['profession_tree_name'] = None
        continue

A Jupyter Widget




In [234]:
vac_by_theme = defaultdict(lambda: [])

for vac in vacancy:
    vac_by_theme[vac['profession_tree_name']].append(vac)

In [289]:
tmp = Counter([work['profession_tree_name'][0] if work['profession_tree_name'] else None for work in works])

In [291]:
tmp.most_common()

[(None, 30592),
 ('Промоутер', 23860),
 ('Водитель', 20957),
 ('Помощник бухгалтера', 17606),
 ('Программист 1С', 12886),
 ('Менеджер по продажам', 12438),
 ('Оператор ПК', 10162),
 ('Инженер программист', 9868),
 ('Мастер', 9813),
 ('Финансовый директор', 9401),
 ('Литературный редактор', 9104),
 ('Журналист', 8747),
 ('Доцент', 8546),
 ('Помощник юриста', 7991),
 ('Переводчик', 7836),
 ('Инженер конструктор', 7474),
 ('Специалист по маркетингу', 7403),
 ('Инженер проектировщик', 7352),
 ('Учитель английского языка', 7098),
 ('Охранник', 6900),
 ('Бухгалтер', 6593),
 ('Оператор call центра', 6193),
 ('Кассир', 5818),
 ('Конструктор', 5532),
 ('Помощник руководителя', 5479),
 ('Системный администратор', 5361),
 ('Менеджер по туризму', 5254),
 ('Экономист', 4801),
 ('Техник', 4765),
 ('Юрист', 4584),
 ('Продавец', 4567),
 ('Помощник системного администратора', 4004),
 ('Специалист отдела кадров', 3861),
 ('Финансовый менеджер', 3840),
 ('Дизайнер', 3778),
 ('Менеджер по развитию', 3734)

In [238]:
def compile_cos_sim_theano():
    v1 = theano.tensor.vector(dtype='float32')
    v2 = theano.tensor.vector(dtype='float32')
    
    numerator = theano.tensor.sum(v1*v2)
    denominator = theano.tensor.sqrt(theano.tensor.sum(v1**2)*theano.tensor.sum(v2**2))
   
    return theano.function([v1, v2], numerator/denominator)

cos_sim_theano_fn = compile_cos_sim_theano()

In [316]:
def get_sim_to_work(work):
    if work['profession_tree_name']:
        all_similar_vacs = vac_by_theme[work['profession_tree_name'][0]]

        tmp = {}

        for vac in all_similar_vacs:
            if not np.isnan(vac['vec']).all():
                tmp[vac['id']] =  float(cos_sim_theano_fn(vac['vec'], work['vec']))

        return work['id'], Counter(tmp).most_common(10)
    return 0, None

In [317]:
sim_to_work = parallel(get_sim_to_work, works)

[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1304 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 6728 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done 9032 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 11848 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 15176 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 19016 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 23368 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 28232 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 33608 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 39496 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 45896 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 52808 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 60232 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 68168 tasks      |

In [318]:
sim_to_work = {x[0]: x[1] for x in sim_to_work}

In [319]:
pickle.dump(sim_to_work, open("sim_to_work.pck", "wb"))

In [332]:
def get_skills(work):
    sim = sim_to_work[work['id']]
    skills = []
    for s in sim:
        skills.append(vacancy_by_name[s[0]]['clear_skills'])
    
    return skills

In [354]:
tmp = {x['id']: x['profession_tree_name'][0] if x['profession_tree_name'] else None for x in works}

In [356]:
pickle.dump(tmp, open("tmp.pck", "wb"))

In [343]:
get_skills(works[69])

[[],
 ['опыт работа водитель бурильный крановый установка'],
 ['водительский удостоверение категория',
  'безаварийный стаж вождение',
  'основа электроснабжение',
  'основа организация',
  'технический характеристика',
  'режим работа дизель генераторный установка'],
 ['опыт работа минимум официальный подтверждение'],
 ['наличие удостоверение машинист',
  'разряд автогидроподъёмник',
  'опыт работа грузовой авто'],
 ['искать водитель испытатель проведение испытание гусеничный машина',
  'наличие удостоверение механик водитель гусеничный машина',
  'навык чтение технологический документация',
  'навык чтение конструкторский'],
 ['наличие право категория',
  'опыт работа грузовой техника',
  'опыт работа строительный',
  'бережный отношение техника'],
 ['средний высокий технический профильный образование',
  'средний профессиональный',
  'знание испытание скважина',
  'знание технология',
  'водительский удостоверение категория',
  'опыт управление грузовой',
  'условие крайний север',


In [250]:
all_similar_vacs[0].keys()

dict_keys(['id', 'id_client', 'town', 'profession', 'work', 'candidat', 'drive_license', 'type_of_work_value', 'place_of_work_value', 'experience_value', 'education_value', 'employer_type_value', 'payment_from', 'payment_to', 'profession_tree_id', 'profession_tree_name', 'languages', 'metro', 'vec'])

In [189]:
list(most.keys())[:10]

['b471044536f405677193e3d7fd775970',
 'c567f6d49312f22fc00a277e2e55822b',
 '04fef9a9e506216b6b232839bcac9717',
 '25c9cc24963d4299e55b3c66cba47dba',
 '633d35cf8de238404f60f9ab6683fe5e',
 '056f4fa32f57f7aea88b04c5bc268c31',
 '7119f26423d0e892d1111d8a49372b15',
 '49748b69e3d3435cd98d241a6935f3d1',
 '336d370779b3261dcb00331fba5f7f47',
 '082c33c70583f4d098c8eb7e1f671a18']

In [21]:
clear_works = []
clear_vectors_works = []

for work in works:
    if not np.isnan(work['vec']).all():
        clear_works.append(work)
        clear_vectors_works.append(work['vec'])

In [None]:
def f(vac):
    return list(map(lambda work: cos_sim_theano_fn(work, vac['vec']), clear_vectors_works))

In [None]:
f(vacancy[0])

In [None]:
data = Parallel(n_jobs=-1, verbose=3)(delayed(f)(vac) for vac in vacancy)

In [None]:
print(len(data))

In [None]:
cosine_similarity([vacancy[0]['vec'], vacancy[1]['vec']], [works[0]['vec'], works[0]['vec']])

In [None]:
for vac in tqdm(vacancy):
    vectors = []
    for work in tqdm(works):
        answer[vac['id']][work['name']] = 1 - spatial.distance.cosine(work['vec'], vac['vec'])

In [None]:
pickle.dump(works, open("vac_with_vecs.pck", "wb"))

In [None]:
from scipy import spatial

In [None]:
works[0]['work'], works[1]['work']

In [None]:
1 - spatial.distance.cosine(works[0]['vec'], works[1]['vec'])

In [None]:
works[0]['vec'], works[1]['vec']

In [None]:
# def f(work):      
#     print(work)
#     clear_skills, dirty_skills = [], []
    
#     for skill in split_into_skills(work['work']):
#         normalized = normalize_skill(skill)
        
#         if len(normalized) > 0:
#             clear_skills.append(" ".join(normalized[0]))
#             dirty_skills.append(" ".join(normalized[1]))
        
#     return {"name": work['name'], "clear_skills": clear_skills, "dirty_skills": dirty_skills} 

In [None]:
# works_clear = parallel(f, works)

In [None]:
# def f(i):
#     tmp = []
#     for j in split_into_skills(i['work']):
#         for g in get_words(j.lower()):
#             tmp.append(g)
#     return tmp
# words = parallel(f, works)

In [None]:
# words_only = []
# for word in words:
#     for k in word:
#         words_only.append(k)

In [None]:
# words_only = list(set(words_only))

In [None]:
# cache = {}

# for word in tqdm(words_only):
#     cache[word] = morph.parse(word)[0]

In [None]:
# get_words(works[0]["work"])

In [None]:
# for work in tqdm(works):
#     text = work["work"]
#     text = text.lower()
#     text = remove_numbers(text)
#     text = split_into_skills(text)
    
#     for i in range(len(text)):
#         text[i] = get_words(text[i])
        
#         def g(x):
#             if x in cache:
#                 return cache[x].normal_form
#             else:
#                 print(x)
#                 return None
#         text[i] = list(filter(lambda x: x, map(g, text[i])))
        
#     work["work_clear"] = text