In [1]:
from IPython.display import clear_output

In [2]:
!pip install pymorphy2 sparse_dot_topn
!python -m nltk.downloader stopwords
!python -m nltk.downloader wordnet

clear_output()

In [3]:
import os
import re

import json
import gzip
import codecs

from itertools import islice, chain, filterfalse
from collections import Counter, defaultdict
from operator import itemgetter

import numpy as np
import scipy.sparse as sp
import pandas as pd

import lxml.html as lhtml

# from tqdm.notebook import tqdm
from tqdm import tqdm

In [4]:
WORKDIR = '.'

In [5]:
!mkdir -p "{WORKDIR}/data" "{WORKDIR}/models"

In [6]:
def save_array(a, filename: str, sparse: bool = False, **params):
    if sparse and not sp.issparse(a):
        a = sp.csr_matrix(a)
    elif not sparse and sp.issparse(a):
        a = np.asarray(a.todense())

    with open(filename, 'wb') as f_data:
        save = sp.save_npz if sparse else np.save
        return save(f_data, a, **params)


def load_array(filename: str, sparse: bool = False, **params):
    with open(filename, 'rb') as f_data:
        load = sp.load_npz if sparse else np.load
        return load(f_data, **params)

In [7]:
def parse_specializations(s):
    res = s[1:-1].split(',')
    res = map(int, res)
    res = list(res)
    # res = np.asarray(res, dtype=int)
    return res

vacancies_file = os.path.join(WORKDIR, 'data/vacancies_info.csv.gz')

if not os.path.isfile(vacancies_file):
    # Загружаем специализации для обучения
    df_train_ids = pd.read_csv(
        os.path.join(WORKDIR, 'train_labels.csv.gz'),
        index_col='vacancy_id',
        compression='gzip',
    )

    df_train_ids['specializations'] = df_train_ids['specializations'].map(parse_specializations)
    df_train_ids['is_train'] = True

    # Загружаем специализации для теста
    df_test_ids = pd.read_csv(
        os.path.join(WORKDIR, 'test_vacancy_ids.csv.gz'),
        index_col='vacancy_id',
        compression='gzip',
    )

    # Объединяем в один датафрейм
    df_all_ids = pd.concat([df_train_ids, df_test_ids], axis=0)
    df_all_ids['is_train'].fillna(False, inplace=True)
    df_all_ids.sort_index(inplace=True)

    # Загружаем информацию о каждой из вакансий
    df_vacancies_info = pd.read_csv(
        os.path.join(WORKDIR, 'vacancies_info.csv.gz'),
        index_col='vacancy_id',
        compression='gzip',
    )

    # Объединяем в один датафрейм
    df_all_ids = pd.merge(df_all_ids, df_vacancies_info, left_index=True, right_index=True, how='left')

    df_all_ids.to_csv(vacancies_file, index=True, compression='gzip')
else:
    df_all_ids = pd.read_csv(
        vacancies_file,
        index_col='vacancy_id',
        compression='gzip',
    )
    df_all_ids.loc[df_all_ids['is_train'], 'specializations'] = \
        df_all_ids.loc[df_all_ids['is_train'], 'specializations'].map(parse_specializations)

df_all_ids.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,specializations,is_train,area_id,compensation_from,compensation_to,creation_date,currency,employer,employment,work_experience,work_schedule
vacancy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,"[242, 256, 302, 324, 358, 440]",True,26,22000.0,24000.0,2019-01-24,RUR,0ce23382345c,full,between1And3,fullDay
2,,False,160,,,2019-07-26,,b9aa259f8724,full,between1And3,fullDay
3,[211],True,1002,,,2019-04-15,,11ecc72a7a76,project,between1And3,fullDay
4,"[389, 412, 437]",True,22,,36000.0,2019-07-12,RUR,e1e424ceb5e4,full,noExperience,fullDay
5,,False,1002,600.0,,2019-01-17,BYR,943fd4a3770a,full,between1And3,fullDay


In [8]:
df_all_ids['joined_work'] = (
    df_all_ids['employment'] + '_' +
    df_all_ids['work_experience'] + '_' +
    df_all_ids['work_schedule']
)

df_all_ids.head()

Unnamed: 0_level_0,specializations,is_train,area_id,compensation_from,compensation_to,creation_date,currency,employer,employment,work_experience,work_schedule,joined_work
vacancy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,"[242, 256, 302, 324, 358, 440]",True,26,22000.0,24000.0,2019-01-24,RUR,0ce23382345c,full,between1And3,fullDay,full_between1And3_fullDay
2,,False,160,,,2019-07-26,,b9aa259f8724,full,between1And3,fullDay,full_between1And3_fullDay
3,[211],True,1002,,,2019-04-15,,11ecc72a7a76,project,between1And3,fullDay,project_between1And3_fullDay
4,"[389, 412, 437]",True,22,,36000.0,2019-07-12,RUR,e1e424ceb5e4,full,noExperience,fullDay,full_noExperience_fullDay
5,,False,1002,600.0,,2019-01-17,BYR,943fd4a3770a,full,between1And3,fullDay,full_between1And3_fullDay


In [9]:
def make_onehot_csr_matrix(s: pd.Series):
    mapping = defaultdict(lambda: len(mapping))

    data = np.ones(shape=(s.shape[0], ))
    indices = [mapping[k] for k in s]
    indptr = np.arange(0, len(data) + 1)

    X = sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, len(mapping)))
    mapping.default_factory = None
    mapping_inv = sorted(mapping, key=lambda e: mapping[e])

    return mapping, mapping_inv, X

In [10]:
mapping_employer, mapping_employer_inv, features_employer = \
    make_onehot_csr_matrix(df_all_ids['employer'])

features_employer.shape

(2912650, 345193)

In [11]:
def make_onehot_multiple_csr_matrix(s: pd.Series):
    mapping = defaultdict(lambda: len(mapping))

    data, indices, indptr = [], [], [0, ]

    for row in tqdm(s):
        row = list(map(lambda e: mapping[e], row))

        data.extend([1] * len(row))
        indices.extend(row)
        indptr.append(len(data))

    X = sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, len(mapping)))
    mapping.default_factory = None
    mapping_inv = sorted(mapping, key=lambda e: mapping[e])

    return mapping, mapping_inv, X

In [12]:
mapping_spec, mapping_spec_inv, y_spec = \
    make_onehot_multiple_csr_matrix(df_all_ids.loc[df_all_ids['is_train'], 'specializations'])

y_spec.shape

100%|██████████| 1456325/1456325 [00:02<00:00, 620422.68it/s]


(1456325, 620)

In [13]:
vacancies_parts = (f for f in os.listdir(WORKDIR) if f.startswith('vacancies-'))
vacancies_parts = sorted(vacancies_parts)
vacancies_parts

['vacancies-01.json.gz',
 'vacancies-02.json.gz',
 'vacancies-03.json.gz',
 'vacancies-04.json.gz',
 'vacancies-05.json.gz',
 'vacancies-06.json.gz',
 'vacancies-07.json.gz',
 'vacancies-08.json.gz',
 'vacancies-09.json.gz',
 'vacancies-10.json.gz']

In [14]:
def read_vacancies_part(filename):
    with gzip.open(filename, mode='r') as f_gz:
        records = json.load(f_gz)
        records = {int(k): v for k, v in records.items()}
    return records

In [15]:
from functools import lru_cache

from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer


ru_morph = MorphAnalyzer()

@lru_cache(maxsize=15000)
def morph_process(token):
    return ru_morph.parse(token)[0].normal_form


stop_words = map(morph_process, stopwords.words('russian'))
stop_words = stopwords.words('russian') + list(stop_words)

In [16]:
vacancies_it = map(lambda p: os.path.join(WORKDIR, p), tqdm(vacancies_parts))
vacancies_it = map(read_vacancies_part, vacancies_it)
vacancies_it = ((k, v) for p in vacancies_it for k, v in p.items())

index = []

def content_names_reader(index):
    for vacancy_id, vacancy_info in vacancies_it:
        name = re.sub('\(.*?\)', '', vacancy_info['name'].lower())
        index.append(vacancy_id)
        yield name

content_names = content_names_reader(index)
content_names = tqdm(content_names, position=0)

0it [00:00, ?it/s]/10 [00:00<?, ?it/s]

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_tfidf_vectorizer(**params):
    vec = TfidfVectorizer(
        stop_words=stop_words,
        preprocessor=morph_process,
        ngram_range=(1, 2),
        min_df=5,
        **params
    )
    return vec

In [18]:
content_names_array_file = os.path.join(WORKDIR, 'data/content_names_2.npz')
content_names_terms_idfs = os.path.join(WORKDIR, 'data/content_names_2.idf')
content_names_vacancies_mapping = os.path.join(WORKDIR, 'data/content_names_2.mapping')

if not os.path.isfile(content_names_array_file):
    vec = create_tfidf_vectorizer()

    # Считаем tfidf-вектора и сохраняем их
    features_content_names = vec.fit_transform(content_names)
    save_array(features_content_names, content_names_array_file, sparse=True)

    # Сохраняем словарик с idf
    vocabulary_inv = sorted(vec.vocabulary_, key=lambda e: vec.vocabulary_[e])
    with open(content_names_terms_idfs, mode='w', encoding='utf8') as f_data:
        for word, idf in zip(vocabulary_inv, vec.idf_):
            print(word, "%.16f" % idf, sep='\t', file=f_data)

    # Сохраняем порядок вакансий в матрице
    with open(content_names_vacancies_mapping, mode='w') as f_data:
        print(*index, sep='\n', file=f_data)
else:
    # Грузим tfidf-вектора
    features_content_names = load_array(content_names_array_file, sparse=True)

    # Грузим TfIdfVectorizer
    with open(content_names_terms_idfs, mode='r') as f_data:
        f_data = map(lambda s: s.rstrip().split('\t'), f_data)

        vocabulary_inv, vocabulary_idf = [], []
        for i, (word, idf) in enumerate(f_data):
            vocabulary_inv.append(word)
            vocabulary_idf.append(float(idf))

    vec = create_tfidf_vectorizer(vocabulary=vocabulary_inv)
    vec.idf_ = np.asarray(vocabulary_idf, dtype=float)

    # Грузим порядок документов
    with open(content_names_vacancies_mapping, mode='r') as f_data:
        index = list(map(int, f_data))

# Убеждаемся, что все правильно
assert (np.asarray(index) == df_all_ids.index.values).all()
assert features_content_names.shape == (df_all_ids.shape[0], len(vec.idf_))

features_content_names.shape

(2912650, 139435)

In [19]:
def get_best_ranks(ranks: np.ndarray, top: int, axis: int = 0, return_ranks: bool = False):
    top_slice = (slice(None), ) * axis + (slice(-top, None), )
    inv_slice = (slice(None), ) * axis + (slice(None, None, -1), )

    if top < ranks.shape[axis]:
        indices = np.argpartition(ranks, -top, axis=axis)[top_slice]
        ranks_top = np.take_along_axis(ranks, indices, axis=axis)
        indices = np.take_along_axis(indices, ranks_top.argsort(axis=axis)[inv_slice], axis=axis)
    else:
        indices = np.argsort(ranks, axis=axis)[top_slice]
        indices = indices[inv_slice]

    result = (indices, )

    if return_ranks:
        ranks = np.take_along_axis(ranks, indices, axis=axis)
        result += (ranks, )

    return result if len(result) > 1 else result[0]

In [20]:
def f1score(y_true, y_pred):
    # y_true = set(y_true)
    # y_pred = set(y_pred)
    
    tp = len(y_true & y_pred)
    precision = tp / len(y_pred)
    recall = tp / len(y_true)
    if precision == 0.0 and recall == 0.0:
        score = 0.0
    else:
        score = 2 * precision * recall / (precision + recall)
    return score

In [21]:
def make_employer_mult_matrix(A, B):
    data, indices, indptr = [], [], [0, ]

    indices_buf = {}

    for e in tqdm(A.indices, position=0, desc='make_employer_mult_matrix', leave=False):
        if e not in indices_buf:
            indices_buf[e] = np.where(B.indices == e)[0]
        
        data.extend([1] * len(indices_buf[e]))
        indices.extend(indices_buf[e])
        indptr.append(len(indices))

    X = sp.csr_matrix((data, indices, indptr), shape=(A.shape[0], B.shape[0]))
    return X

In [22]:
def get_best_ranks_sparse(ranks: sp.spmatrix, top: int, axis: int = 1, return_ranks: bool = False, verbose: bool = True):
    if axis == 1 and not sp.isspmatrix_csr(ranks):
        ranks = ranks.tocsr()
    elif axis == 0 and not sp.isspmatrix_csc(ranks):
        ranks = ranks.tocsc()
    
    ranks_top, indices = [], []

    indptr = zip(ranks.indptr, ranks.indptr[1:])
    if verbose:
        indptr = tqdm(indptr, position=0, total=ranks.shape[0], desc='get_best_ranks_sparse', leave=False)

    for i, j in indptr:
        row_indices = ranks.indices[i:j]
        row_data = ranks.data[i:j]
        row_order = np.argsort(row_data)[:-top-1:-1]

        row_indices = row_indices[row_order]
        row_ranks_top = row_data[row_order]

        if len(row_indices) < top:
            num = top - len(row_indices)
            row_indices = np.pad(row_indices, (0, num), constant_values=-1, mode='constant')
            row_ranks_top = np.pad(row_ranks_top, (0, num), constant_values=0, mode='constant')

        ranks_top.append(row_ranks_top)
        indices.append(row_indices)

    indices = np.asarray(indices)   # same as np.vstack
    result = (indices, )

    if return_ranks:
        ranks = np.asarray(ranks_top)   # same as np.vstack
        result += (ranks, )

    return result

In [23]:
def get_best_indices_sparse(X):
    if not sp.isspmatrix_csr(X):
        X = X.tocsr()

    indices_all = []

    for i, j in zip(X.indptr, X.indptr[1:]):
        indices = X.indices[i:j]
        data = X.data[i:j]

        indices = indices[np.argsort(data)[::-1]]
        indices_all.append(indices)
        
    return indices_all

In [24]:
def reorder_indices(indices, index):
    indices_new = index[indices]
    indices_new[indices < 0] = -1
    return indices_new

In [25]:
from sklearn.model_selection import KFold

from sparse_dot_topn import awesome_cossim_topn


mask = df_all_ids['is_train'].values

kf = KFold(n_splits=5)

indices_valid_all, ranks_valid_all = [], []

for index_train, index_valid in tqdm(kf.split(features_content_names[mask]), position=0, total=kf.n_splits):
    features_content_names_train = features_content_names[mask][index_train]
    features_content_names_valid = features_content_names[mask][index_valid]
    
    features_employer_train = features_employer[mask][index_train]
    features_employer_valid = features_employer[mask][index_valid]
    
    # Вычисляем похожесть между вакансиями
    features_content_names_cossim = awesome_cossim_topn(
        features_content_names_valid,
        features_content_names_train.T,
        ntop=15,
        use_threads=True,
        n_jobs=32,
    )
    
    # Вычисляем индикаторную матрицу по работодателям
    features_employer_same = make_employer_mult_matrix(features_employer_valid, features_employer_train)
    
    # Поощеряем вакансии, у которых один и тот же работодатель
    features_content_names_cossim += 0.01 * (features_content_names_cossim > 0).multiply(features_employer_same)
    
    # Вытаскиваем индексы и ранки
    indices_valid, ranks_valid = \
        get_best_ranks_sparse(features_content_names_cossim, top=15, axis=1, return_ranks=True)
    
    # Преобразуем индексы в рамках всего обучающего датасета
    indices_valid = reorder_indices(indices_valid, index_train)
    
    indices_valid_all.append(indices_valid)
    ranks_valid_all.append(ranks_valid)
    
indices_valid = np.vstack(indices_valid_all)
ranks_valid = np.vstack(ranks_valid_all)

indices_valid.shape, ranks_valid.shape

100%|██████████| 5/5 [12:26<00:00, 149.38s/it]                                      


((1456325, 15), (1456325, 15))

In [30]:
indices_valid_file = os.path.join(WORKDIR, 'data/neigbours-train-all.indices.npz')
ranks_valid_file = os.path.join(WORKDIR, 'data/neigbours-train-all.ranks.npz')
# vacancies_valid_file = os.path.join(WORKDIR, 'data/neigbours-valid.vacancies.txt')

save_array(indices_valid, indices_valid_file, sparse=False)
save_array(ranks_valid, ranks_valid_file, sparse=False)

In [31]:
K_params = [1, 3, 5, 7, 11, 13]

scores = {K: [] for K in K_params}

for j in tqdm(range(indices_valid.shape[0]), position=0, leave=False):
    y_true = set(y_spec[j].indices)

    indices_base = indices_valid[j]
    if indices_base[-1] < 0:
        indices_base = indices_base[indices_base > 0]

    if not len(indices_base):
        y_pred = {mapping_spec[256], }   # see baseline
        score = f1score(y_true, y_pred)
        for K in K_params:
            scores[K].append(score)
        continue

    size = y_spec[indices_base[0]].indices.shape[0]

    y_pred = Counter()
    for K, n in enumerate(indices_base):
        y_pred += Counter(y_spec[n].indices)

        if K not in scores:
            continue

        y_pred_ = set(map(itemgetter(0), y_pred.most_common(size)))
        score = f1score(y_true, y_pred_)
        scores[K].append(score)

for k, v in scores.items():
    scores[k] = np.mean(v)

for k in K_params:
    print('K = {:>2d}; score = {:.6f}'.format(k, scores[k]))
print()

K_best = sorted(scores, key=lambda k: scores[k], reverse=True)[0]
K_best

                                                          

K =  1; score = 0.455142
K =  3; score = 0.484794
K =  5; score = 0.494470
K =  7; score = 0.497134
K = 11; score = 0.501144
K = 13; score = 0.502032



13

In [34]:
K_best = 13

In [35]:
from sparse_dot_topn import awesome_cossim_topn

step = features_content_names[~mask].shape[0] // 5

indices_test, ranks_test = [], []

for i in tqdm(range(0, features_content_names[~mask].shape[0], step), position=0):
    j = min(i + step, features_content_names[~mask].shape[0])

    features_content_names_cossim = awesome_cossim_topn(
        features_content_names[~mask][i:j],     # test
        features_content_names[mask].T,         # train
        ntop=15,
        use_threads=True,
        n_jobs=32,
    )

    features_employer_same = make_employer_mult_matrix(features_employer[~mask][i:j], features_employer[mask])
    features_content_names_cossim += 0.01 * (features_content_names_cossim > 0).multiply(features_employer_same)

    indices, ranks = \
        get_best_ranks_sparse(features_content_names_cossim, top=15, axis=1, return_ranks=True)

    indices_test.append(indices)
    ranks_test.append(ranks)

indices_test = np.vstack(indices_test)
ranks_test = np.vstack(ranks_test)

100%|██████████| 5/5 [16:04<00:00, 192.83s/it]                                      


In [36]:
assert (~mask).sum() == indices_test.shape[0] == ranks_test.shape[0]

In [37]:
indices_test_file = os.path.join(WORKDIR, 'data/neigbours-test.indices.npz')
ranks_test_file = os.path.join(WORKDIR, 'data/neigbours-test.ranks.npz')

save_array(indices_test, indices_test_file, sparse=False)
save_array(ranks_test, ranks_test_file, sparse=False)

In [38]:
def convert_specializations(s):
    s = map(lambda e: mapping_spec_inv[e], s)
    s = list(s)
    return s


y_pred_all = []

for j in tqdm(range(indices_test.shape[0]), position=0, leave=False):
    indices_base = indices_test[j]
    if indices_base[-1] < 0:
        indices_base = indices_base[indices_base > 0]

    if len(indices_base) > 0:
        size = y_spec[indices_base[0]].indices.shape[0]

        y_pred = Counter()
        for n in indices_base[:K_best]:
            y_pred += Counter(y_spec[n].indices)
        y_pred = set(map(itemgetter(0), y_pred.most_common(size)))
    else:
        y_pred = {mapping_spec[256], }   # see baseline

    y_pred = convert_specializations(y_pred)
    y_pred_all.append(y_pred)

df_submission = df_all_ids.loc[~df_all_ids['is_train'], ['specializations']]
df_submission['specializations'] = y_pred_all

submission_id = 11
submission_file = os.path.join(WORKDIR, 'submission_{:03d}.csv.gz').format(submission_id)

df_submission.to_csv(submission_file, index=True, compression='gzip')
df_submission.head()

                                                           

Unnamed: 0_level_0,specializations
vacancy_id,Unnamed: 1_level_1
2,"[172, 395, 420, 211, 388, 82]"
5,"[541, 494, 204]"
7,[127]
8,[287]
10,"[429, 205]"


In [39]:
!zcat "{submission_file}" | head -n5

vacancy_id,specializations
2,"[172, 395, 420, 211, 388, 82]"
5,"[541, 494, 204]"
7,[127]
8,[287]

gzip: stdout: Broken pipe


In [40]:
!zcat "{WORKDIR}/sample_submission.csv.gz" | head -n5

vacancy_id,specializations
2,"[25, 324, 42]"
5,"[491, 193, 313]"
7,[256]
8,"[287, 70, 83]"

gzip: stdout: Broken pipe
