In [None]:
from IPython.display import clear_output

In [None]:
# Install pymorphy2 and download the NLTK `stopwords` and `wordnet` corpora.
# NOTE(review): `wordnet` is not referenced anywhere in the visible notebook — confirm it is needed.
!pip install pymorphy2
!python -m nltk.downloader stopwords
!python -m nltk.downloader wordnet

# Remove the installer/downloader log from the cell output.
clear_output()

In [None]:
import torch

In [None]:
import os
import re

import json
import gzip
import codecs

from itertools import islice, chain, filterfalse
from collections import Counter, defaultdict
from operator import itemgetter

import numpy as np
import scipy.sparse as sp
import pandas as pd

import lxml.html as lhtml

# from tqdm.notebook import tqdm
from tqdm import tqdm

In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive/ (WORKDIR below points inside it).
drive.mount('/content/drive/')

# Remove the mount log from the cell output.
clear_output()

In [None]:
# Root folder for the input files read by this notebook and for the `data/` and `models/` artifacts it writes.
WORKDIR = '/content/drive/My Drive/round2-hh'

In [None]:
# Create the `data` and `models` sub-folders of WORKDIR (`-p`: no error if they already exist).
!mkdir -p "{WORKDIR}/data" "{WORKDIR}/models"

In [None]:
def save_array(a, filename: str, sparse: bool = False, **params):
    """Write `a` to `filename` in either sparse (.npz) or dense (.npy) format.

    The array is converted to match the requested format first: a dense input
    becomes a CSR matrix when `sparse=True`, a sparse input is densified when
    `sparse=False`. Extra keyword arguments are forwarded to
    `scipy.sparse.save_npz` / `numpy.save`, whose return value is returned.
    """
    input_is_sparse = sp.issparse(a)

    if sparse:
        writer = sp.save_npz
        if not input_is_sparse:
            a = sp.csr_matrix(a)
    else:
        writer = np.save
        if input_is_sparse:
            a = np.asarray(a.todense())

    with open(filename, 'wb') as f_data:
        return writer(f_data, a, **params)


def load_array(filename: str, sparse: bool = False, **params):
    """Read an array from `filename`.

    Uses `scipy.sparse.load_npz` when `sparse=True` and `numpy.load`
    otherwise; extra keyword arguments are forwarded to the chosen loader.
    """
    reader = sp.load_npz if sparse else np.load

    with open(filename, 'rb') as f_data:
        return reader(f_data, **params)

In [None]:
def parse_specializations(s):
    """Parse a bracketed, comma-separated string such as "[242, 256]" into a list of ints.

    The first and last characters (the brackets) are dropped after trimming
    surrounding whitespace. Empty items are skipped, so "[]" yields an empty
    list instead of failing on `int('')`.
    """
    inner = s.strip()[1:-1]
    return [int(item) for item in inner.split(',') if item.strip()]

# Cached table combining train labels, test ids and per-vacancy attributes.
vacancies_file = os.path.join(WORKDIR, 'data/vacancies_info.csv.gz')

if not os.path.isfile(vacancies_file):
    # Load the specializations of the training vacancies.
    df_train_ids = pd.read_csv(
        os.path.join(WORKDIR, 'train_labels.csv.gz'),
        index_col='vacancy_id',
        compression='gzip',
    )

    df_train_ids['specializations'] = df_train_ids['specializations'].map(parse_specializations)
    df_train_ids['is_train'] = True

    # Load the ids of the test vacancies.
    df_test_ids = pd.read_csv(
        os.path.join(WORKDIR, 'test_vacancy_ids.csv.gz'),
        index_col='vacancy_id',
        compression='gzip',
    )

    # Stack train and test ids into one frame.
    df_all_ids = pd.concat([df_train_ids, df_test_ids], axis=0)
    # Test rows have no `is_train` value after the concat. Assign the filled
    # column back (an in-place fillna on the selected column is not guaranteed
    # to update the frame) and cast to bool so that `~df_all_ids['is_train']`
    # is a real boolean negation rather than an object-dtype operation.
    df_all_ids['is_train'] = df_all_ids['is_train'].fillna(False).astype(bool)
    df_all_ids = df_all_ids.sort_index()

    # Load the attributes of every vacancy.
    df_vacancies_info = pd.read_csv(
        os.path.join(WORKDIR, 'vacancies_info.csv.gz'),
        index_col='vacancy_id',
        compression='gzip',
    )

    # Attach the attributes to the id table (left join on vacancy_id).
    df_all_ids = pd.merge(df_all_ids, df_vacancies_info, left_index=True, right_index=True, how='left')

    df_all_ids.to_csv(vacancies_file, index=True, compression='gzip')
else:
    df_all_ids = pd.read_csv(
        vacancies_file,
        index_col='vacancy_id',
        compression='gzip',
    )
    # The CSV stores the label lists as text; parse them back for the train rows.
    df_all_ids.loc[df_all_ids['is_train'], 'specializations'] = \
        df_all_ids.loc[df_all_ids['is_train'], 'specializations'].map(parse_specializations)

df_all_ids.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,specializations,is_train,area_id,compensation_from,compensation_to,creation_date,currency,employer,employment,work_experience,work_schedule
vacancy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,"[242, 256, 302, 324, 358, 440]",True,26,22000.0,24000.0,2019-01-24,RUR,0ce23382345c,full,between1And3,fullDay
2,,False,160,,,2019-07-26,,b9aa259f8724,full,between1And3,fullDay
3,[211],True,1002,,,2019-04-15,,11ecc72a7a76,project,between1And3,fullDay
4,"[389, 412, 437]",True,22,,36000.0,2019-07-12,RUR,e1e424ceb5e4,full,noExperience,fullDay
5,,False,1002,600.0,,2019-01-17,BYR,943fd4a3770a,full,between1And3,fullDay


In [None]:
def _join_columns(frame, columns, sep='_'):
    """Concatenate the given string columns of `frame` left to right, separated by `sep`."""
    joined = frame[columns[0]]
    for column in columns[1:]:
        joined = joined + sep + frame[column]
    return joined


# Combined employment / experience / schedule label, e.g. "full_between1And3_fullDay".
df_all_ids['joined_work'] = _join_columns(
    df_all_ids, ['employment', 'work_experience', 'work_schedule'])

df_all_ids.head()

Unnamed: 0_level_0,specializations,is_train,area_id,compensation_from,compensation_to,creation_date,currency,employer,employment,work_experience,work_schedule,joined_work
vacancy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,"[242, 256, 302, 324, 358, 440]",True,26,22000.0,24000.0,2019-01-24,RUR,0ce23382345c,full,between1And3,fullDay,full_between1And3_fullDay
2,,False,160,,,2019-07-26,,b9aa259f8724,full,between1And3,fullDay,full_between1And3_fullDay
3,[211],True,1002,,,2019-04-15,,11ecc72a7a76,project,between1And3,fullDay,project_between1And3_fullDay
4,"[389, 412, 437]",True,22,,36000.0,2019-07-12,RUR,e1e424ceb5e4,full,noExperience,fullDay,full_noExperience_fullDay
5,,False,1002,600.0,,2019-01-17,BYR,943fd4a3770a,full,between1And3,fullDay,full_between1And3_fullDay


In [None]:
# One-hot encode the three categorical work attributes and keep the result as a CSR matrix.
# (Unused alternative: encode the combined `joined_work` column instead.)
vacancy_info_columns = ['employment', 'work_experience', 'work_schedule']

features_vacancy_info = (
    pd.get_dummies(df_all_ids[vacancy_info_columns], sparse=True)
    .sparse.to_coo()
    .tocsr()
)
features_vacancy_info.shape

(2912650, 14)

In [None]:
# Employers kept as separate categories: those with at least 5 vacancies overall,
# plus every employer that appears among the non-train rows.
vacancies_per_employer = df_all_ids.groupby(by='employer')['employer'].count()
frequent_employers = vacancies_per_employer[vacancies_per_employer >= 5].index
test_employers = df_all_ids.loc[~df_all_ids['is_train'], 'employer']

employer_chosen = set(frequent_employers) | set(test_employers)

len(employer_chosen)

257994

In [None]:
# Collapse every employer outside `employer_chosen` into a single 'UNKNOWN' category.
is_other_employer = ~df_all_ids['employer'].isin(employer_chosen)
df_all_ids.loc[is_other_employer, 'employer'] = 'UNKNOWN'

df_all_ids.head()

Unnamed: 0_level_0,specializations,is_train,area_id,compensation_from,compensation_to,creation_date,currency,employer,employment,work_experience,work_schedule,joined_work
vacancy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,"[242, 256, 302, 324, 358, 440]",True,26,22000.0,24000.0,2019-01-24,RUR,0ce23382345c,full,between1And3,fullDay,full_between1And3_fullDay
2,,False,160,,,2019-07-26,,b9aa259f8724,full,between1And3,fullDay,full_between1And3_fullDay
3,[211],True,1002,,,2019-04-15,,11ecc72a7a76,project,between1And3,fullDay,project_between1And3_fullDay
4,"[389, 412, 437]",True,22,,36000.0,2019-07-12,RUR,UNKNOWN,full,noExperience,fullDay,full_noExperience_fullDay
5,,False,1002,600.0,,2019-01-17,BYR,943fd4a3770a,full,between1And3,fullDay,full_between1And3_fullDay


In [None]:
# One-hot encode the (collapsed) employer column and keep the result as a CSR matrix.
features_employer = (
    pd.get_dummies(df_all_ids['employer'], sparse=True)
    .sparse.to_coo()
    .tocsr()
)
features_employer.shape

(2912650, 257995)

In [None]:
def make_onehot_csr_matrix(s: pd.Series):
    """One-hot encode a series of single labels into a CSR matrix.

    Columns are numbered in order of first appearance of each label.

    Returns:
        mapping: label -> column index (default factory disabled, so unknown
            labels raise KeyError afterwards).
        mapping_inv: labels ordered by column index.
        X: CSR matrix with one row per element of `s` and a single 1.0 per row.
    """
    mapping = defaultdict(lambda: len(mapping))

    n_rows = s.shape[0]
    values = np.ones(shape=(n_rows, ))

    # Looking a label up assigns it the next free column on first sight.
    column_ids = []
    for label in s:
        column_ids.append(mapping[label])

    # Exactly one stored value per row.
    row_pointers = np.arange(0, n_rows + 1)

    X = sp.csr_matrix((values, column_ids, row_pointers), shape=(n_rows, len(mapping)))

    mapping.default_factory = None
    mapping_inv = sorted(mapping, key=mapping.get)

    return mapping, mapping_inv, X

In [None]:
def make_onehot_multiple_csr_matrix(s: pd.Series):
    """Multi-hot encode a series whose elements are iterables of labels.

    Columns are numbered in order of first appearance of each label.

    Returns:
        mapping: label -> column index (default factory disabled afterwards).
        mapping_inv: labels ordered by column index.
        X: CSR matrix with one row per element of `s` and a 1 for each label
            listed in that element.
    """
    mapping = defaultdict(lambda: len(mapping))

    values, column_ids, row_pointers = [], [], [0]

    for labels in tqdm(s):
        row_columns = [mapping[label] for label in labels]

        column_ids += row_columns
        values += [1] * len(row_columns)
        row_pointers.append(len(values))

    n_rows = len(row_pointers) - 1
    X = sp.csr_matrix((values, column_ids, row_pointers), shape=(n_rows, len(mapping)))

    mapping.default_factory = None
    mapping_inv = sorted(mapping, key=mapping.get)

    return mapping, mapping_inv, X

In [None]:
# Multi-hot target matrix built from the specializations of the train rows only.
train_specializations = df_all_ids.loc[df_all_ids['is_train'], 'specializations']

mapping_spec, mapping_spec_inv, y_spec = make_onehot_multiple_csr_matrix(train_specializations)

y_spec.shape

100%|██████████| 1456325/1456325 [00:02<00:00, 546319.19it/s]


(1456325, 620)

In [None]:
# File names in WORKDIR that start with 'vacancies-', in sorted order.
vacancies_parts = [name for name in os.listdir(WORKDIR) if name.startswith('vacancies-')]
vacancies_parts.sort()
vacancies_parts

['vacancies-01.json.gz',
 'vacancies-02.json.gz',
 'vacancies-03.json.gz',
 'vacancies-04.json.gz',
 'vacancies-05.json.gz',
 'vacancies-06.json.gz',
 'vacancies-07.json.gz',
 'vacancies-08.json.gz',
 'vacancies-09.json.gz',
 'vacancies-10.json.gz']

In [None]:
def read_vacancies_part(filename):
    """Load one gzip-compressed JSON object and return it with its keys converted to int."""
    with gzip.open(filename, mode='r') as f_gz:
        raw_records = json.load(f_gz)

    return {int(vacancy_id): info for vacancy_id, info in raw_records.items()}

In [None]:
from functools import lru_cache

from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer


# One analyzer instance shared by every morph_process call.
ru_morph = MorphAnalyzer()


@lru_cache(maxsize=15000)
def morph_process(token):
    """Return the normal form of the analyzer's first parse of `token` (memoized)."""
    first_parse = ru_morph.parse(token)[0]
    return first_parse.normal_form

@lru_cache(maxsize=5000)
def preprocess_skill(s):
    """Turn a free-text skill into one token: its normalized words joined by '_'.

    The skill is lower-cased and split on whitespace, each word is passed
    through `morph_process`, and the results are joined with underscores.
    Results are memoized.
    """
    # str.split() without a separator already trims the ends and collapses
    # whitespace runs, so no regex (and no '\s' escape in a non-raw string)
    # is needed.
    words = s.lower().split()
    return '_'.join(map(morph_process, words))

# Russian stop words followed by their morph_process-normalized forms.
russian_stop_words = stopwords.words('russian')
stop_words = russian_stop_words + [morph_process(word) for word in russian_stop_words]

In [None]:
def content_names_reader(vacancies_it, index):
    """Yield the lower-cased `name` of each vacancy.

    `vacancies_it` yields `(vacancy_id, vacancy_info)` pairs. As a side effect,
    each vacancy id is appended to `index` just before its name is yielded, so
    `index` records the order of the yielded documents.
    """
    for vacancy_id, vacancy_info in vacancies_it:
        lowered_name = vacancy_info['name'].lower()
        index.append(vacancy_id)
        yield lowered_name

def content_skills_reader(vacancies_it, index):
    """Yield one space-separated string of preprocessed key skills per vacancy.

    `vacancies_it` yields `(vacancy_id, vacancy_info)` pairs. As a side effect,
    each vacancy id is appended to `index` just before its document is yielded.
    """
    for vacancy_id, vacancy_info in vacancies_it:
        skill_tokens = [preprocess_skill(skill) for skill in vacancy_info['key_skills']]
        document = ' '.join(skill_tokens)
        index.append(vacancy_id)
        yield document

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_tfidf_vectorizer(mode, **params):
    """Build the TfidfVectorizer configuration for the given content `mode`.

    Both modes share the stop-word list, the two-or-more-word-characters token
    pattern and `min_df=5`. Mode 'names' additionally uses `morph_process` as
    the preprocessor and extracts uni- and bigrams. Extra keyword arguments
    are forwarded to TfidfVectorizer.

    NOTE(review): scikit-learn applies `preprocessor` to the whole document
    before tokenization, so in 'names' mode `morph_process` appears to receive
    the full vacancy name rather than single tokens — confirm this is intended.

    Raises:
        ValueError: if `mode` is neither 'names' nor 'skills'.
    """
    if mode not in ('names', 'skills'):
        raise ValueError(mode)

    options = dict(
        stop_words=stop_words,
        token_pattern=r"(?u)\b\w\w+\b",
        min_df=5,
    )
    if mode == 'names':
        options.update(preprocessor=morph_process, ngram_range=(1, 2))

    return TfidfVectorizer(**options, **params)

In [None]:
def create_tfdif_vectors(mode,
                         content_array_file,
                         content_terms_idfs,
                         content_vacancies_mapping, ):
    """Fit a TF-IDF vectorizer over all vacancy parts and persist the results.

    Args:
        mode: 'names' or 'skills'; selects the content reader and the
            vectorizer configuration.
        content_array_file: output path for the sparse TF-IDF matrix.
        content_terms_idfs: output path for the tab-separated "term, idf" list.
        content_vacancies_mapping: output path for the vacancy ids, one per
            line, in matrix row order.

    Returns:
        (vec, features, index): the fitted vectorizer, the TF-IDF matrix and
        the list of vacancy ids in row order.

    Raises:
        ValueError: if `mode` is not 'names' or 'skills'.
    """
    # Lazy pipeline: part file name -> full path -> parsed dict -> (id, info) pairs.
    vacancies_it = map(lambda p: os.path.join(WORKDIR, p), tqdm(vacancies_parts))
    vacancies_it = map(read_vacancies_part, vacancies_it)
    vacancies_it = ((k, v) for p in vacancies_it for k, v in p.items())

    # Filled by the reader generators while fit_transform consumes `content`.
    index = []

    if mode == 'names':
        content = tqdm(content_names_reader(vacancies_it, index), position=0)
    elif mode == 'skills':
        content = tqdm(content_skills_reader(vacancies_it, index), position=0)
    else:
        raise ValueError(mode)

    vec = create_tfidf_vectorizer(mode)

    # Compute the tf-idf vectors and save them
    features = vec.fit_transform(content)
    save_array(features, content_array_file, sparse=True)

    # Save the vocabulary together with the idf values (terms ordered by column index)
    vocabulary_inv = sorted(vec.vocabulary_, key=lambda e: vec.vocabulary_[e])
    with open(content_terms_idfs, mode='w', encoding='utf8') as f_data:
        for word, idf in zip(vocabulary_inv, vec.idf_):
            print(word, "%.16f" % idf, sep='\t', file=f_data)

    # Save the order of the vacancies in the matrix
    with open(content_vacancies_mapping, mode='w') as f_data:
        print(*index, sep='\n', file=f_data)

    return vec, features, index


def load_tfidf_vectors(mode,
                       content_array_file,
                       content_terms_idfs,
                       content_vacancies_mapping, ):
    """Restore the TF-IDF matrix, vectorizer and row order from saved files.

    Args:
        mode: 'names' or 'skills'; selects the vectorizer configuration.
        content_array_file: path of the saved sparse TF-IDF matrix.
        content_terms_idfs: path of the tab-separated "term, idf" list, one
            term per line in column order.
        content_vacancies_mapping: path of the vacancy ids, one per line, in
            matrix row order.

    Returns:
        (vec, features, index): a vectorizer with the saved vocabulary and idf
        values, the TF-IDF matrix and the list of vacancy ids in row order.
    """
    # Load the tf-idf vectors.
    features = load_array(content_array_file, sparse=True)

    # Rebuild the TfidfVectorizer from the saved vocabulary and idf values.
    # The file is written as UTF-8, so read it as UTF-8 explicitly instead of
    # relying on the platform default encoding.
    vocabulary_inv, vocabulary_idf = [], []
    with open(content_terms_idfs, mode='r', encoding='utf8') as f_data:
        for line in f_data:
            word, idf = line.rstrip().split('\t')
            vocabulary_inv.append(word)
            vocabulary_idf.append(float(idf))

    vec = create_tfidf_vectorizer(mode=mode, vocabulary=vocabulary_inv)
    vec.idf_ = np.asarray(vocabulary_idf, dtype=float)

    # Load the order of the documents.
    with open(content_vacancies_mapping, mode='r') as f_data:
        index = list(map(int, f_data))

    return vec, features, index

In [None]:
# Cached artifacts of the vacancy-name TF-IDF features.
content_names_array_file = os.path.join(WORKDIR, 'data/content_names_2.npz')
content_names_terms_idfs = os.path.join(WORKDIR, 'data/content_names_2.idf')
content_names_vacancies_mapping = os.path.join(WORKDIR, 'data/content_names_2.mapping')

# Compute the features once; later runs load them from the cache.
if not os.path.isfile(content_names_array_file):
    vec, features_content_names, index = create_tfdif_vectors(
        'names',
        content_names_array_file,
        content_names_terms_idfs,
        content_names_vacancies_mapping,
    )
else:
    vec, features_content_names, index = load_tfidf_vectors(
        'names',
        content_names_array_file,
        content_names_terms_idfs,
        content_names_vacancies_mapping,
    )

# Sanity checks: rows follow the order of df_all_ids and columns match the vocabulary.
# np.array_equal also fails cleanly when the two id arrays differ in length.
assert np.array_equal(np.asarray(index), df_all_ids.index.values), \
    'row order of the name features does not match df_all_ids'
assert features_content_names.shape == (df_all_ids.shape[0], len(vec.idf_))

features_content_names.shape

(2912650, 139435)

In [None]:
# Cached artifacts of the key-skills TF-IDF features.
content_skills_array_file = os.path.join(WORKDIR, 'data/content_skills_2.npz')
content_skills_terms_idfs = os.path.join(WORKDIR, 'data/content_skills_2.idf')
content_skills_vacancies_mapping = os.path.join(WORKDIR, 'data/content_skills_2.mapping')

# Compute the features once; later runs load them from the cache.
if not os.path.isfile(content_skills_array_file):
    vec, features_content_skills, index = create_tfdif_vectors(
        'skills',
        content_skills_array_file,
        content_skills_terms_idfs,
        content_skills_vacancies_mapping,
    )
else:
    vec, features_content_skills, index = load_tfidf_vectors(
        'skills',
        content_skills_array_file,
        content_skills_terms_idfs,
        content_skills_vacancies_mapping,
    )

# Sanity checks: rows follow the order of df_all_ids and columns match the vocabulary.
# np.array_equal also fails cleanly when the two id arrays differ in length.
assert np.array_equal(np.asarray(index), df_all_ids.index.values), \
    'row order of the skill features does not match df_all_ids'
assert features_content_skills.shape == (df_all_ids.shape[0], len(vec.idf_))

features_content_skills.shape

(2912650, 19402)

In [None]:
import math

from torch.utils.data import Dataset


class HHDataset(Dataset):
    """Batch-level dataset over row-aligned sparse feature matrices.

    Item `i` is the i-th *batch* (not a single sample): a tuple of four dense
    arrays (names, vacancy info, employer, skills) plus the dense target batch,
    or `None` as target when the dataset was built without one.

    Args:
        content_names, vacancy_info, employer_info, skills_info: matrices with
            the same number of rows that support row indexing by an integer
            array and `.toarray()`.
        target: target matrix with the same number of rows, or None for inference.
        batch_size: number of rows per batch.
        shuffle: if True, rows are re-permuted on every `on_epoch_end()` call.
        random_state: a `numpy.random.RandomState`, or a seed (or None) used to
            create one.
    """

    def __init__(self, content_names, vacancy_info, employer_info, skills_info, target,
                 batch_size=100, shuffle=True, random_state=None):
        self.check_shapes(
            content_names,
            vacancy_info,
            employer_info,
            skills_info,
            target,
        )

        self.content_names = content_names
        self.vacancy_info = vacancy_info
        self.employer_info = employer_info
        self.skills_info = skills_info
        self.target = target

        self.batch_size = batch_size
        self.shuffle = shuffle

        if isinstance(random_state, np.random.RandomState):
            self.random_state = random_state
        else:
            self.random_state = np.random.RandomState(random_state)

        # Build the initial row order.
        self.on_epoch_end()

    def check_shapes(self, *args):
        """Assert that all given matrices have the same number of rows (a trailing None is ignored)."""
        if args[-1] is None:
            args = args[:-1]

        row_counts = [matrix.shape[0] for matrix in args]

        assert row_counts.count(row_counts[0]) == len(row_counts), \
            'all inputs must have the same number of rows, got {}'.format(row_counts)

    def __len__(self):
        """Number of batches; the last batch may be smaller than `batch_size`."""
        return int(math.ceil(self.content_names.shape[0] / self.batch_size))

    def __getitem__(self, idx):
        """Return `(batch_x, batch_y)` for batch number `idx`.

        Raises:
            IndexError: if `idx` is outside `[0, len(self))`. Without this an
                out-of-range index would silently produce an empty batch and
                plain iteration over the dataset would never terminate.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if not 0 <= idx < len(self):
            raise IndexError('batch index {} is out of range for {} batches'.format(idx, len(self)))

        index = self.index[idx * self.batch_size:(idx + 1) * self.batch_size]

        batch_content_names = self.content_names[index]
        batch_vacancy_info = self.vacancy_info[index]
        batch_employer_info = self.employer_info[index]
        batch_skills_info = self.skills_info[index]

        if self.target is not None:
            batch_y = self.target[index]
            batch_y = batch_y.toarray()
        else:
            # inference mode
            batch_y = None

        batch_x = (
            batch_content_names.toarray(),
            batch_vacancy_info.toarray(),
            batch_employer_info.toarray(),
            batch_skills_info.toarray(),
        )

        return batch_x, batch_y

    def on_epoch_end(self):
        """Rebuild the row order: a fresh permutation when shuffling, else the natural order."""
        if self.shuffle:
            self.index = self.random_state.permutation(self.content_names.shape[0])
        else:
            self.index = np.arange(self.content_names.shape[0])


In [None]:
from torch import nn

class HHNetwork(nn.Module):
    """Single linear layer followed by a sigmoid over the concatenated feature groups.

    The four `num_*` input sizes are summed to give the width of the linear
    layer; `num_target` is the number of outputs.
    """

    def __init__(self, num_content_names, num_vacancy_info, num_employer_info, num_skills_info, num_target):
        super().__init__()

        total_inputs = sum((num_content_names, num_vacancy_info, num_employer_info, num_skills_info))
        linear = nn.Linear(in_features=total_inputs, out_features=num_target, bias=True)

        self.network = nn.Sequential(linear, nn.Sigmoid())

    def forward(self, content_names, vacancy_info, employer_info, skills_info):
        """Concatenate the four inputs along dim 1 and return the sigmoid outputs."""
        joined = torch.cat([content_names, vacancy_info, employer_info, skills_info], dim=1)
        return self.network(joined)

# Input widths come from the four feature matrices, the output width from the label mapping.
feature_blocks = (features_content_names, features_vacancy_info, features_employer, features_content_skills)
n_names, n_vacancy_info, n_employer, n_skills = (block.shape[1] for block in feature_blocks)

clf = HHNetwork(
    num_content_names=n_names,
    num_vacancy_info=n_vacancy_info,
    num_employer_info=n_employer,
    num_skills_info=n_skills,
    num_target=len(mapping_spec),
)
clf = clf.cuda()

criteria = nn.BCELoss()
optimizer = torch.optim.Adam(clf.parameters(), lr=0.01)

# The learning rate is multiplied by 0.1 at each milestone.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3, 7, ], gamma=0.1)

clf

HHNetwork(
  (network): Sequential(
    (0): Linear(in_features=416846, out_features=620, bias=True)
    (1): Sigmoid()
  )
)

In [None]:
def to_tensor(X, use_cuda=True):
    """Return `X` as a floating-point torch tensor, on CUDA when `use_cuda` is set.

    Non-tensor input is converted to a float32 tensor on the selected device.
    An existing tensor is moved to CUDA only if `use_cuda` is True and it is
    not already there, and is cast to float only if it is not floating point.
    """
    if torch.is_tensor(X):
        tensor = X
    else:
        tensor = torch.tensor(X, device='cuda' if use_cuda else 'cpu', dtype=torch.float32)

    if use_cuda and not tensor.is_cuda:
        tensor = tensor.cuda()

    return tensor if torch.is_floating_point(tensor) else tensor.float()

In [None]:
def get_best_ranks(ranks: np.ndarray, top: int, axis: int = 0, return_ranks: bool = False):
    """Return the indices of the `top` largest values along `axis`, best first.

    When `top` is smaller than the axis length, a partial partition selects
    the candidates and only those are sorted; otherwise the whole axis is
    sorted. With `return_ranks=True` the result is the tuple
    `(indices, values)`, where `values` are the ranks at those indices;
    otherwise only `indices` is returned.
    """
    leading = (slice(None), ) * axis
    keep_last = leading + (slice(-top, None), )
    reverse = leading + (slice(None, None, -1), )

    if top >= ranks.shape[axis]:
        # Nothing to prune: full ascending sort, then flip to descending.
        indices = np.argsort(ranks, axis=axis)[keep_last][reverse]
    else:
        # Select the `top` largest entries (unordered), then order just those.
        candidates = np.argpartition(ranks, -top, axis=axis)[keep_last]
        candidate_ranks = np.take_along_axis(ranks, candidates, axis=axis)
        descending = candidate_ranks.argsort(axis=axis)[reverse]
        indices = np.take_along_axis(candidates, descending, axis=axis)

    if not return_ranks:
        return indices

    return indices, np.take_along_axis(ranks, indices, axis=axis)

In [None]:
def f1score(y_true, y_pred):
    """F1 score between two sets of labels.

    Args:
        y_true: set of true labels.
        y_pred: set of predicted labels.

    Returns:
        The harmonic mean of precision and recall, or 0.0 when either set is
        empty or the sets do not intersect.
    """
    # An empty set on either side would otherwise divide by zero.
    if not y_true or not y_pred:
        return 0.0

    tp = len(y_true & y_pred)
    if tp == 0:
        return 0.0

    precision = tp / len(y_pred)
    recall = tp / len(y_true)
    return 2 * precision * recall / (precision + recall)

In [None]:
def make_predict(model, seq, top=3, return_ranks=False):
    """Run `model` over every batch of `seq` and collect the top-ranked outputs.

    Args:
        model: callable taking the four feature tensors of a batch.
        seq: indexable sequence of `(batch_x, batch_y)` items with a length;
            `batch_y` may be None (inference).
        top: number of best-scoring outputs kept per row.
        return_ranks: if True, also return the scores of the kept outputs.

    Returns:
        `(y_true, y_pred)` or `(y_true, y_pred, ranks_pred)`: `y_true` is the
        stacked sparse targets (None if no batch had targets), `y_pred` the
        stacked indices of the best outputs (best first) and `ranks_pred`
        their scores.
    """
    y_pred, y_true, ranks_pred_all = [], [], []

    # Inference only: skip building the autograd graph for every batch.
    with torch.no_grad():
        for i in tqdm(range(len(seq)), position=0, leave=False):
            X_test, y_test = seq[i]

            if y_test is not None:
                y_true.append(sp.csr_matrix(y_test))

            y_batch_pred = model(*map(to_tensor, X_test))
            y_batch_pred = y_batch_pred.cpu().detach().numpy()

            res = get_best_ranks(y_batch_pred, top=top, axis=1, return_ranks=return_ranks)
            if return_ranks:
                indices_pred, ranks_pred = res
                ranks_pred_all.append(ranks_pred)
            else:
                indices_pred = res

            y_pred.append(indices_pred)

    y_true = sp.vstack(y_true) if y_true else None
    y_pred = np.vstack(y_pred)

    res = (y_true, y_pred, )

    if return_ranks:
        res += (np.vstack(ranks_pred_all), )

    return res

In [None]:
def validate(model, seq, top=3):
    """Mean per-row F1 between the true label sets and the `top` predicted labels."""
    y_true, y_pred = make_predict(model, seq, top=top, return_ranks=False)

    # Each row becomes a set of column indices: the stored columns for the
    # targets, the predicted indices for the model output.
    true_sets = [set(columns) for columns in y_true.tolil().rows]
    pred_sets = [set(columns) for columns in y_pred]

    assert len(true_sets) == len(pred_sets)

    pairs = tqdm(zip(true_sets, pred_sets), total=len(true_sets), position=0, leave=False)
    scores = np.asarray([f1score(true_set, pred_set) for true_set, pred_set in pairs])
    return np.mean(scores)

In [None]:
from sklearn.model_selection import train_test_split

# Boolean row mask: True for train vacancies, False for test vacancies.
mask = df_all_ids['is_train'].values

all_feature_blocks = (
    features_content_names,
    features_vacancy_info,
    features_employer,
    features_content_skills,
)

# Split the train rows of every feature block (and the targets) 70/30 with a fixed seed.
(features_content_names_train, features_content_names_valid,
 features_vacancy_info_train, features_vacancy_info_valid,
 features_employer_train, features_employer_valid,
 features_content_skills_train, features_content_skills_valid,
 y_train, y_valid) = train_test_split(
    *(block[mask] for block in all_feature_blocks),
    y_spec,
    test_size=0.3,
    random_state=9872,
)

# The remaining (non-train) rows form the test features.
(features_content_names_test,
 features_vacancy_info_test,
 features_employer_test,
 features_content_skills_test) = (block[~mask] for block in all_feature_blocks)

In [None]:
%%time

# Batch providers: shuffled batches for training, fixed order for validation.
input_train = HHDataset(features_content_names_train, features_vacancy_info_train,
                        features_employer_train, features_content_skills_train, y_train,
                        batch_size=256, shuffle=True, random_state=42, )

input_valid = HHDataset(features_content_names_valid, features_vacancy_info_valid,
                        features_employer_valid, features_content_skills_valid, y_valid,
                        batch_size=256, shuffle=False, random_state=42, )

num_epochs = 10

for epoch in range(num_epochs):
    # Training phase of the epoch.
    clf.train()

    desc = 'Epoch: {}'.format(epoch + 1)
    pbar = tqdm(range(len(input_train)), desc=desc, position=0, leave=True)

    for i in pbar:
        # Forward pass on one batch and loss against the dense targets.
        X_batch, y_batch_true = input_train[i]
        y_batch_pred = clf(*map(to_tensor, X_batch))
        loss = criteria(y_batch_pred, to_tensor(y_batch_true))

        # Show the current batch loss in the progress bar.
        pbar.set_postfix(loss=loss.item())

        # Standard update: clear old gradients, backpropagate, step the optimizer.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    pbar.close()

    # Validation phase: mean F1 of the top-3 predictions on the held-out split.
    clf.eval()
    
    score = validate(clf, input_valid, top=3)

    print('\n', end='')
    print('f-measure = {:.6f}'.format(score))

    # Rebuild the row order of both datasets (reshuffles only the training one)
    # and advance the learning-rate scheduler once per epoch.
    input_train.on_epoch_end()
    input_valid.on_epoch_end()
    scheduler.step()

Epoch: 1: 100%|██████████| 3983/3983 [23:06<00:00,  2.87it/s, loss=0.0112]
Epoch: 2:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.518119


Epoch: 2: 100%|██████████| 3983/3983 [23:08<00:00,  2.87it/s, loss=0.0126]
Epoch: 3:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.527128


Epoch: 3: 100%|██████████| 3983/3983 [23:08<00:00,  2.87it/s, loss=0.00875]
Epoch: 4:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.527840


Epoch: 4: 100%|██████████| 3983/3983 [23:08<00:00,  2.87it/s, loss=0.00738]
Epoch: 5:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.529217


Epoch: 5: 100%|██████████| 3983/3983 [23:07<00:00,  2.87it/s, loss=0.00853]
Epoch: 6:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.529304


Epoch: 6: 100%|██████████| 3983/3983 [23:07<00:00,  2.87it/s, loss=0.0109]
Epoch: 7:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.529437


Epoch: 7: 100%|██████████| 3983/3983 [23:08<00:00,  2.87it/s, loss=0.00618]
Epoch: 8:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.529274


Epoch: 8: 100%|██████████| 3983/3983 [23:07<00:00,  2.87it/s, loss=0.00683]
Epoch: 9:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.529283


Epoch: 9: 100%|██████████| 3983/3983 [23:10<00:00,  2.87it/s, loss=0.0083]
Epoch: 10:   0%|          | 0/3983 [00:00<?, ?it/s]


f-measure = 0.529323


Epoch: 10: 100%|██████████| 3983/3983 [23:07<00:00,  2.87it/s, loss=0.00934]



f-measure = 0.529286
CPU times: user 5h 7min 2s, sys: 23min 17s, total: 5h 30min 19s
Wall time: 5h 30min 26s


In [None]:
# Persist the trained weights and, next to them, the label of each output column (one per line).
model_file = os.path.join(WORKDIR, 'models/model_logreg_017_empl_skills.pt')
mapping_file = os.path.join(WORKDIR, 'models/model_logreg_017_empl_skills_spec.mapping')

torch.save(clf.state_dict(), model_file)

with open(mapping_file, mode='w') as f_mapping:
    f_mapping.write('\n'.join(map(str, mapping_spec_inv)) + '\n')

In [None]:
from hyperopt import fmin, tpe, hp, Trials

# Search space: a single probability threshold.
space = hp.uniform('threshold', 0.01, 0.99)

# Top-15 predictions (indices and scores) for the validation split.
y_true, y_pred, ranks_valid = make_predict(clf, input_valid, top=15, return_ranks=True)

# The true label sets do not depend on the threshold, so build them once
# instead of converting the sparse targets on every objective evaluation.
indices_true_valid = list(map(set, y_true.tolil().rows))


def objective(threshold):
    """Negative mean F1 on the validation split for the given score threshold.

    For each row, predictions with a score above `threshold` are kept among
    the six best-ranked ones; the best-ranked prediction is always kept.
    """
    mask = ranks_valid > threshold
    mask[:, 0] = True

    # Mark rejected predictions with -1, keep at most the first six columns.
    indices_pred = np.where(mask, y_pred, -1)[:, :6]
    pred_sets = ({label for label in row if label >= 0} for row in indices_pred)

    scores = [f1score(true_set, pred_set) for true_set, pred_set in zip(indices_true_valid, pred_sets)]
    scores = np.asarray(scores)

    return -np.mean(scores)


trials = Trials()
# NOTE(review): newer hyperopt releases appear to expect a numpy Generator for
# `rstate` — confirm against the installed version.
res = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials,
           rstate=np.random.RandomState(4325), verbose=1)
res

                                                   


  0%|          | 0/100 [00:00<?, ?it/s, best loss: ?][A




  1%|          | 1/100 [00:05<08:22,  5.07s/it, best loss: -0.4708372194919398][A
  2%|▏         | 2/100 [00:09<08:12,  5.03s/it, best loss: -0.4708372194919398][A
  3%|▎         | 3/100 [00:15<08:21,  5.17s/it, best loss: -0.5167949196699968][A
  4%|▍         | 4/100 [00:21<08:26,  5.28s/it, best loss: -0.5254766103024118][A
  5%|▌         | 5/100 [00:26<08:21,  5.28s/it, best loss: -0.5535222304384738][A
  6%|▌         | 6/100 [00:31<08:10,  5.22s/it, best loss: -0.5638864744109628][A
  7%|▋         | 7/100 [00:36<08:02,  5.19s/it, best loss: -0.5638864744109628][A
  8%|▊         | 8/100 [00:41<07:52,  5.13s/it, best loss: -0.5725466949109503][A
  9%|▉         | 9/100 [00:46<07:45,  5.12s/it, best loss: -0.5725466949109503][A
 10%|█         | 10/100 [00:51<07:32,  5.02s/it, best loss: -0.5725466949109503][A
 11%|█         | 11/100 [00:56<07:39,  5.17s/it, best loss: -0.5725466949109503][A
 12%|█▏        | 12/100 [01:02<07:34,  5.17s/it, best loss: -0.5732526028644546][A


{'threshold': 0.2753375301431183}

In [None]:
# Threshold selected by the hyperopt search above; reused when building the submission.
threshold_best = res['threshold']
threshold_best

0.2753375301431183

In [None]:
# Store the validation predictions (label indices and their scores) as dense arrays.
# NOTE(review): with sparse=False the data is written by numpy.save although the file names end in .npz.
indices_valid_file, ranks_valid_file = (
    os.path.join(WORKDIR, relative_path)
    for relative_path in (
        'data/logreg-employer-valid.indices.npz',
        'data/logreg-employer-valid.ranks.npz',
    )
)

save_array(ranks_valid, ranks_valid_file, sparse=False)
save_array(y_pred, indices_valid_file, sparse=False)

In [None]:
# Test batches in their original order, without targets.
input_test = HHDataset(features_content_names_test, features_vacancy_info_test,
                       features_employer_test, features_content_skills_test, None,
                       batch_size=2048, shuffle=False, random_state=42, )

indices_test, ranks_test = [], []

# Inference only: disable autograd so no graph is kept for these large dense batches.
with torch.no_grad():
    for i in tqdm(range(len(input_test)), position=0, leave=True):
        X_batch, _ = input_test[i]

        y_batch = clf(*map(to_tensor, X_batch))
        y_batch = y_batch.cpu().detach().numpy()

        # Keep the six best-scoring labels (and their scores) per vacancy.
        indices_pred, ranks_pred = get_best_ranks(y_batch, top=6, axis=1, return_ranks=True)

        indices_test.append(indices_pred)
        ranks_test.append(ranks_pred)

indices_test = np.vstack(indices_test)
ranks_test = np.vstack(ranks_test)

100%|██████████| 712/712 [54:40<00:00,  4.61s/it]


In [None]:
# Store the test predictions (label indices and their scores) as dense arrays.
# NOTE(review): with sparse=False the data is written by numpy.save although the file names end in .npz.
indices_test_file, ranks_test_file = (
    os.path.join(WORKDIR, relative_path)
    for relative_path in (
        'data/logreg-employer-test.indices.npz',
        'data/logreg-employer-test.ranks.npz',
    )
)

save_array(ranks_test, ranks_test_file, sparse=False)
save_array(indices_test, indices_test_file, sparse=False)

In [None]:
%%time

def convert_specializations(s):
    """Map an iterable of output-column indices to a sorted list of their labels.

    Duplicate indices are dropped first; labels are looked up in
    `mapping_spec_inv`.
    """
    unique_columns = set(s)
    return sorted(mapping_spec_inv[column] for column in unique_columns)

# True: keep every prediction scoring above the tuned threshold (always at
# least the best one). False: always keep the three best predictions.
use_smart = True

if use_smart:
    mask = ranks_test > threshold_best
    mask[:, 0] = True

    # Rejected predictions are marked with -1 and filtered out row by row.
    indices_pred = np.where(mask, indices_test, -1)
    indices_pred = map(lambda e: filter(lambda x: x >= 0, e), indices_pred)
else:
    indices_pred = indices_test[:,:3]

y_pred_all = list(map(convert_specializations, indices_pred))

# Copy the selection so the assignment below writes to an independent frame
# rather than to a slice of df_all_ids.
df_submission = df_all_ids.loc[~df_all_ids['is_train'], ['specializations']].copy()
df_submission['specializations'] = y_pred_all

submission_id = 17
# Format only the file name, so braces in WORKDIR can never be treated as format fields.
submission_file = os.path.join(WORKDIR, 'submission_{:03d}.csv.gz'.format(submission_id))

df_submission.to_csv(submission_file, index=True, compression='gzip')
df_submission.head()

CPU times: user 21.7 s, sys: 174 ms, total: 21.8 s
Wall time: 23.6 s


In [None]:
# Show the first lines of the generated submission file.
!zcat "{submission_file}" | head -n5

vacancy_id,specializations
2,[211]
5,"[494, 541]"
7,[495]
8,"[70, 287]"


In [None]:
# Show the first lines of the provided sample submission for a format comparison.
!zcat "{WORKDIR}/sample_submission.csv.gz" | head -n5

vacancy_id,specializations
2,"[25, 324, 42]"
5,"[491, 193, 313]"
7,[256]
8,"[287, 70, 83]"
