In [None]:
import json
from typing import List, Tuple, Sequence
import pickle
from tqdm import tqdm_notebook as tqdm
import pymorphy2
import nltk
from cytoolz import pipe
from collections import Counter
import re
from collections import defaultdict
from collections import Counter

In [None]:
# Morphological analyzer shared by the helpers defined below.
morph = pymorphy2.MorphAnalyzer()

# NLTK's Russian and English stop words, followed by extra words that are
# also rejected by the `word not in stopwords` checks in this notebook.
stopwords = [
    *nltk.corpus.stopwords.words('russian'),
    *nltk.corpus.stopwords.words('english'),
    'отличный', 'метр', 'наш', 'клиент', 'банка', 'проект', 'литр',
    'желательный', 'др', 'самый', 'мочь', 'хороший', 'год', 'чел', 'обязательный',
]

# word -> first pymorphy2 parse, filled lazily by _get_POS and normal_form.
cache = {}


def remove_numbers(text):
    """Return `text` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)


def _get_POS(word):
    """Return the part-of-speech tag of the first pymorphy2 parse of `word`.

    The parse is stored in the module-level `cache`, so each distinct word is
    analyzed at most once.
    """
    try:
        parsed = cache[word]
    except KeyError:
        parsed = cache[word] = morph.parse(word)[0]
    return parsed.tag.POS


def normal_form(word):
    """Return the normal form of `word` taken from its first pymorphy2 parse.

    The parse is looked up in, and on a miss stored into, the module-level
    `cache`.
    """
    if word in cache:
        return cache[word].normal_form

    first_parse = morph.parse(word)[0]
    cache[word] = first_parse
    return first_parse.normal_form


def is_word_pos_in(word: str, pos: List[str] = None) -> bool:
    """Tell whether the part of speech of `word` is one of `pos`.

    Args:
        word: the word to classify (via `_get_POS`).
        pos: accepted POS tags; when omitted or empty, the default set
            NOUN / ADJF / INFN / VERB / ADJS is used.
    """
    allowed = pos or ['NOUN', "ADJF", 'INFN', 'VERB', 'ADJS']
    return _get_POS(word) in allowed


def get_words(text):
    """Return, in order, every maximal run of word characters found in `text`."""
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text)


def nonempty(x):
    """Lazily drop empty items and single-space strings from a sequence.

    Anything that is not a `Sequence` is returned unchanged; for a sequence
    the result is a `filter` iterator, not a list.
    """
    if not isinstance(x, Sequence):
        return x

    def _keep(item):
        return len(item) > 0 and item != ' '

    return filter(_keep, x)


# NOTE(review): `helper` is not referenced anywhere else in the visible notebook — confirm it is unused before removing it.
helper = {}


def remove_numbers(text):
    """Return `text` with every run of digits removed.

    NOTE(review): a function with this name and the same behaviour is already
    defined earlier in this cell; this later definition silently replaces it.
    One of the two can be deleted.
    """
    return re.sub(r'\d+', '', text)


def normalize_skill(skill: str):
    """Turn one raw skill phrase into a "clear" and a "dirty" word list.

    The phrase is lower-cased, stripped of digits and tokenized; each token is
    lemmatized with `normal_form`. Then, per token:

    * If the lemma is in NLTK's Russian stop-word list, it is remembered as
      the most recent stop word; the lemma "без" is additionally appended to
      the clear list.
    * Otherwise, if the lemma passes `is_word_pos_in`, is longer than three
      characters and is not in the notebook-level `stopwords`, it is accepted:
      a remembered stop word is first moved into the dirty list (only when
      the dirty list is already non-empty), the lemma is appended to the clear
      list, and, when the lemma is a NOUN or ADJF, the original token is
      appended to the dirty list.

    Args:
        skill: raw skill text.

    Returns:
        ``(clear_skill, dirty_skill)`` when the clear list holds between 2 and
        7 words; otherwise an empty list. Callers distinguish the two cases
        with ``len(result) > 0``.
    """
    tokens = tuple(get_words(remove_numbers(skill.lower())))

    # Built once per call instead of once per token; set membership is
    # equivalent to the list membership test for strings.
    russian_stopwords = frozenset(nltk.corpus.stopwords.words('russian'))

    clear_skill = []
    dirty_skill = []

    # Most recent stop word, kept so it can be re-inserted into dirty_skill.
    last_stopword = None

    for token in tokens:
        word = normal_form(token)

        if word in russian_stopwords:
            last_stopword = word

            if word == "без":
                clear_skill.append(word)

        # Filter by part of speech, length and the notebook-level stop words.
        elif is_word_pos_in(word) and len(word) > 3 and word not in stopwords:

            # A stop word preceded this word: put it into the dirty phrase,
            # but never as the phrase's first element.
            if last_stopword and len(dirty_skill) > 0:
                dirty_skill.append(last_stopword)
                last_stopword = None

            clear_skill.append(word)

            # The dirty phrase keeps the original (non-lemmatized) token.
            if is_word_pos_in(word, ['NOUN', 'ADJF']):
                dirty_skill.append(token)

    if 1 < len(clear_skill) < 8:
        return clear_skill, dirty_skill

    return []

In [None]:
def split_skill(a):
    '''Split a phrase that enumerates alternatives into separate skill phrases.

    A leading prefix of `a` made of letters, hyphens and whitespace and ending
    in a separator (",", "/", a trailing "и", or "или") plus one whitespace
    character is matched and removed from `a`; what is left is split on ",",
    " или ", " и " and "/".

    * If any remaining piece has more than four words, the original phrase is
      just split on those separators.
    * Otherwise the prefix is reduced to its words. With more than one word,
      all but the last form a common head that is prepended to every
      remaining piece and to the last prefix word. With a single word, that
      word is appended to the remaining pieces.
    * A phrase without such a prefix is returned as a one-element list.

    Example: 'знание python или java' -> ['знание java', 'знание python'].
    '''
    def _split_on_separators(text):
        for separator in (',', ' или ', ' и ', '/'):
            text = text.replace(separator, '|')
        return text.split('|')

    def _separators_to_spaces(text):
        for separator in (',', ' или ', ' и ', '/'):
            text = text.replace(separator, ' ')
        return text

    head = re.match(r'([а-яА-ЯA-Za-z\-\s]*([,и\/]|или)\s)', a)
    if not head:
        return [a]

    prefix = head.group()
    pieces = _split_on_separators(a.replace(prefix, ''))

    if any(len(piece.split()) > 4 for piece in pieces):
        return _split_on_separators(a)

    prefix_words = [w for w in _separators_to_spaces(prefix).split(' ') if w]
    if len(prefix_words) <= 1:
        return pieces + prefix_words

    *main_phrase, last_word = prefix_words
    pieces.append(last_word)
    return [" ".join(main_phrase + [piece]) for piece in pieces]

In [None]:
def split_into_skills(text: str) -> List[str]:
    """Cut a free-form requirements text into candidate skill phrases.

    The text is broken before every capital letter and on newlines, dots and
    commas; empty pieces are dropped by `nonempty`. Each piece is then handled
    by the first rule that applies:

    * it contains a parenthesised run of letters, hyphens or whitespace:
      the first such group (parentheses included) and the text before the
      first "(" are emitted as two phrases;
    * it looks like an enumeration ("и", "/" or "или" followed by whitespace
      and another word): it is expanded with `split_skill`;
    * otherwise it is emitted unchanged.

    Args:
        text: raw requirements text.

    Returns:
        The list of candidate phrases, in order of appearance.
    """
    def _split_into_skills(x):
        # Operate on the argument rather than on the enclosing `text`.
        x = re.sub(r'([А-ЯA-Z])', r'\n\1', x)
        return re.split(r'[\n\.,]', x)

    pre_skills = list(nonempty(_split_into_skills(text)))
    done_skills = []

    for skill in pre_skills:
        # Pad slashes so they count as separators; normalize non-breaking spaces.
        skill = skill.replace('/', ' / ').replace("\xa0", " ")

        # Raw strings: the patterns are unchanged, but no longer rely on
        # Python keeping unknown escapes such as "\(" verbatim.
        bracketed = re.findall(r'\([а-яА-ЯA-Za-z\-\s]*\)', skill)
        if bracketed:
            before_bracket = re.findall(r'[а-яА-ЯA-Za-z\-\s]*\(', skill)[0][:-1]
            done_skills.extend([bracketed[0], before_bracket])
        elif re.search(r'([а-яА-ЯA-Za-z\-\s]*([и\/]|или)\s[а-яА-ЯA-Za-z\-])', skill):
            new_skills = split_skill(skill)
            if new_skills:
                done_skills.extend(new_skills)
        else:
            done_skills.append(skill)

    return done_skills

In [None]:
from joblib import delayed, Parallel

def parallel(f, data):
    """Apply `f` to every item of `data` through joblib with ``n_jobs=-1``.

    Returns whatever the joblib `Parallel` call returns for the delayed tasks.
    """
    tasks = (delayed(f)(item) for item in data)
    runner = Parallel(n_jobs=-1, verbose=3, max_nbytes='1G')
    return runner(tasks)

In [None]:
def load_resume_vacancy() -> Tuple[List[dict], List[dict]]:
    """Load the SuperJob resumes and vacancies from JSON-lines files.

    Reads ``data/resume.json`` and ``data/vacancy.json``; every non-blank
    line of each file is parsed as one JSON document.

    Returns:
        ``(resume, vacancy)`` — two lists of parsed records.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """

    def load_json_by_lines(file):
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file, encoding="utf-8") as f:
            lines = f.read().split("\n")

        # Skip blank and whitespace-only lines (e.g. a trailing newline).
        return [json.loads(line) for line in tqdm(lines) if line.strip()]

    resume = load_json_by_lines("data/resume.json")
    vacancy = load_json_by_lines("data/vacancy.json")

    return resume, vacancy

In [None]:
# Parse both raw dumps from disk (paths are fixed inside load_resume_vacancy).
resume, vacancy = load_resume_vacancy()

In [None]:
# Store the parsed records as pickles; `with` closes (and flushes) each file.
with open("resume.pck", "wb") as f:
    pickle.dump(resume, f)
with open("vacancy.pck", "wb") as f:
    pickle.dump(vacancy, f)

In [None]:
# Reload the pickled records; `with` closes each file after reading.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open("resume.pck", "rb") as f:
    resume = pickle.load(f)
with open("vacancy.pck", "rb") as f:
    vacancy = pickle.load(f)

In [None]:
def add_skills(vac):
    """Return a shallow copy of `vac` extended with extracted skill lists.

    The text under ``vac['candidat']`` is split with `split_into_skills` and
    every piece is passed through `normalize_skill`; pieces for which it
    returns an empty result are skipped. The copy gains two parallel lists of
    space-joined phrases: ``'clear_skills'`` and ``'dirty_skills'``.
    """
    enriched = vac.copy()

    normalized_skills = [
        result
        for result in map(normalize_skill, split_into_skills(enriched['candidat']))
        if len(result) > 0
    ]

    enriched['clear_skills'] = [" ".join(result[0]) for result in normalized_skills]
    enriched['dirty_skills'] = [" ".join(result[1]) for result in normalized_skills]

    return enriched

In [None]:
# Every vacancy, copied and extended with its skill lists by add_skills.
data = [add_skills(vac) for vac in tqdm(vacancy)]

In [None]:
#pickle.dump(data, open("vacancy_with_skills.pck", "wb"))

In [None]:
# Reload the enriched vacancies from disk; `with` closes the file after reading.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open("vacancy_with_skills.pck", "rb") as f:
    data = pickle.load(f)

In [None]:
# profession_tree_name -> all dirty skill phrases of the vacancies under it.
themes = defaultdict(list)
dirty_to_normal = {}

for vacancy_record in data:
    theme_name = vacancy_record['profession_tree_name']
    themes[theme_name].extend(vacancy_record['dirty_skills'])

In [None]:
# Flatten the work histories of all resumes into a single list of entries.
works = [entry for person in resume for entry in person['work_history']]

In [None]:
# Vocabulary: distinct tokens collected from the 'work' texts in the cells below.
words = set()

In [None]:
# Add the non-stop-word tokens of every work entry to the vocabulary.
for work in tqdm(works):
    tokens = get_words(remove_numbers(work['work'].lower()))
    work_words = [token for token in tokens if token not in stopwords]
    words.update(work_words)

In [None]:
# Add the non-stop-word tokens of every vacancy text to the vocabulary.
for vac in tqdm(vacancy):
    tokens = get_words(remove_numbers(vac['work'].lower()))
    vac_words = [token for token in tokens if token not in stopwords]
    words.update(vac_words)

In [None]:
#pickle.dump(words, open("words.pck", "wb"))

In [None]:
# Reload the vocabulary from disk; `with` closes the file after reading.
with open("words.pck", "rb") as f:
    words = pickle.load(f)

In [None]:
import gensim
import numpy as np

In [None]:
# Pre-trained word2vec vectors in binary format; _word2vec queries them with keys of the form "<word>_<POS>".
model = gensim.models.KeyedVectors.load_word2vec_format("ruwikiruscorpora_upos_cbow_300_20_2017.bin.gz", binary=True)

In [None]:
# All-zero float32 vector of length 300, returned by _word2vec as a fallback.
tmp_vector = np.zeros(300, dtype=np.float32)

def _word2vec(word):
    """Return the model vector for `word`, or `tmp_vector` if none is known.

    The model is queried with the keys ``word + "_NOUN"``, ``word + "_ADJ"``
    and ``word + "_VERB"``, in that order; the vector of the first key present
    in `model` is returned. The fallback is used only after all three keys
    have been tried.
    """
    for suffix in ("_NOUN", "_ADJ", "_VERB"):
        key = "{}{}".format(word, suffix)

        if key in model:
            return model[key]

    # None of the tagged variants is present in the model.
    return tmp_vector

In [None]:
# token -> embedding of its normal form (the spelling `cahce` is what the other cells use).
cahce = {}

for word in tqdm(words):
    # Deliberately not named `normal_form`: that would overwrite the helper
    # function of the same name used by normalize_skill.
    lemma = morph.parse(word)[0].normal_form
    cahce[word] = _word2vec(lemma)
    

In [None]:
#pickle.dump(cahce, open("cahce.pck", "wb"))

In [None]:
# Reload the token -> vector mapping from disk; `with` closes the file after reading.
with open("cahce.pck", "rb") as f:
    cahce = pickle.load(f)

In [None]:
# Peek at one work-history entry.
works[0]

In [None]:
# Give every work entry the mean embedding of its non-stop-word tokens.
for work in tqdm(works):
    # Local name on purpose: assigning to `words` here would replace the
    # vocabulary set built in the earlier cells.
    tokens = [
        token
        for token in get_words(remove_numbers(work['work'].lower()))
        if token not in stopwords
    ]
    # NOTE(review): with no tokens np.mean yields NaN rather than a vector — confirm downstream handling.
    work['vec'] = np.mean([cahce[token] for token in tokens], axis=0)

In [None]:
#pickle.dump(works, open("works_with_vecs.pck", "wb"))

In [None]:
# Reload the work entries (with vectors) from disk; `with` closes the file after reading.
with open("works_with_vecs.pck", "rb") as f:
    works = pickle.load(f)

In [None]:
# Give every vacancy the mean embedding of its non-stop-word tokens.
for vac in tqdm(vacancy):
    # Local name on purpose: assigning to `words` here would replace the
    # vocabulary set built in the earlier cells.
    tokens = [
        token
        for token in get_words(remove_numbers(vac['work'].lower()))
        if token not in stopwords
    ]
    # NOTE(review): with no tokens np.mean yields NaN rather than a vector — confirm downstream handling.
    vac['vec'] = np.mean([cahce[token] for token in tokens], axis=0)

In [None]:
#pickle.dump(vacancy, open("vacancy_with_vecs.pck", "wb"))

In [None]:
# Reload the vacancies (with vectors) from disk; `with` closes the file after reading.
with open("vacancy_with_vecs.pck", "rb") as f:
    vacancy = pickle.load(f)

In [None]:
# vacancy id -> {work name -> similarity}; inner dicts are created on first access.
answer = defaultdict(dict)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Keep only the work entries whose 'vec' can be turned into a list of numbers.
work_vectors = []
normal_works = []

for work in tqdm(works):
    try:
        vector = list(work['vec'])
    except (KeyError, TypeError):
        # No 'vec' key, or a value that is not iterable (e.g. a scalar): skip the entry.
        continue

    work_vectors.append(vector)
    normal_works.append(work)

In [None]:
work_vectors = np.asarray(work_vectors)
# The first vacancy's vector, repeated once per row of work_vectors.
now_matrix = np.asarray([vacancy[0]['vec'] for _ in work_vectors])

In [1]:
import theano


You can find the C code in this temporary file: /tmp/theano_compilation_error_y07t_vdp


Exception: Compilation failed (return status=1): /usr/bin/ld: /nix/store/f111ij1fc83965m48bf2zqgiaq88fqv5-glibc-2.25/lib/../lib64/crti.o: unrecognized relocation (0x2a) in section `.init'. /usr/bin/ld: final link failed: Bad value. collect2: error: ld returned 1 exit status. 

In [None]:
def compile_cos_sim_theano():
    """Build and return a Theano function of two float32 vectors.

    The compiled function returns sum(a * b) / sqrt(sum(a**2) * sum(b**2)),
    i.e. the cosine similarity of its two arguments.
    """
    tensor = theano.tensor
    left = tensor.vector(dtype='float32')
    right = tensor.vector(dtype='float32')

    dot_product = tensor.sum(left * right)
    norm_product = tensor.sqrt(tensor.sum(left ** 2) * tensor.sum(right ** 2))

    return theano.function([left, right], dot_product / norm_product)

# NOTE(review): needs a working Theano; the `import theano` cell above recorded a compilation failure.
cos_sim_theano_fn = compile_cos_sim_theano()

In [None]:
# Shapes of the two matrices compared in the next cell.
work_vectors.shape, now_matrix.shape

In [None]:
# Cosine distances between the rows of work_vectors and the rows of now_matrix (n_jobs=-1).
test = pairwise_distances(work_vectors, now_matrix, metric='cosine', n_jobs=-1)

In [None]:
# Imported here because no earlier cell brings cosine_similarity into scope.
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([vacancy[0]['vec'], vacancy[1]['vec']], [works[0]['vec'], works[0]['vec']])

In [None]:
# Imported here because `spatial` is otherwise only brought into scope by a later cell.
from scipy import spatial

# answer[vacancy id][work name] = cosine similarity of the two mean vectors.
for vac in tqdm(vacancy):
    for work in tqdm(works):
        answer[vac['id']][work['name']] = 1 - spatial.distance.cosine(work['vec'], vac['vec'])

In [None]:
# NOTE(review): this writes `works` to a file named "vac_with_vecs.pck" — confirm the object/file pairing is intended.
with open("vac_with_vecs.pck", "wb") as f:
    pickle.dump(works, f)

In [None]:
from scipy import spatial

In [None]:
# Texts of the first two work entries.
works[0]['work'], works[1]['work']

In [None]:
# Cosine similarity (1 - cosine distance) of the first two work vectors.
1 - spatial.distance.cosine(works[0]['vec'], works[1]['vec'])

In [None]:
# Vectors of the first two work entries.
works[0]['vec'], works[1]['vec']

In [None]:
# def f(work):      
#     print(work)
#     clear_skills, dirty_skills = [], []
    
#     for skill in split_into_skills(work['work']):
#         normalized = normalize_skill(skill)
        
#         if len(normalized) > 0:
#             clear_skills.append(" ".join(normalized[0]))
#             dirty_skills.append(" ".join(normalized[1]))
        
#     return {"name": work['name'], "clear_skills": clear_skills, "dirty_skills": dirty_skills} 

In [None]:
# works_clear = parallel(f, works)

In [None]:
# def f(i):
#     tmp = []
#     for j in split_into_skills(i['work']):
#         for g in get_words(j.lower()):
#             tmp.append(g)
#     return tmp
# words = parallel(f, works)

In [None]:
# words_only = []
# for word in words:
#     for k in word:
#         words_only.append(k)

In [None]:
# words_only = list(set(words_only))

In [None]:
# cache = {}

# for word in tqdm(words_only):
#     cache[word] = morph.parse(word)[0]

In [None]:
# get_words(works[0]["work"])

In [None]:
# for work in tqdm(works):
#     text = work["work"]
#     text = text.lower()
#     text = remove_numbers(text)
#     text = split_into_skills(text)
    
#     for i in range(len(text)):
#         text[i] = get_words(text[i])
        
#         def g(x):
#             if x in cache:
#                 return cache[x].normal_form
#             else:
#                 print(x)
#                 return None
#         text[i] = list(filter(lambda x: x, map(g, text[i])))
        
#     work["work_clear"] = text