In [2]:
import warnings
warnings.filterwarnings("ignore")

import json
import pickle
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

from my_parsers import download, parse

# Let pandas show up to 200 rows and 200 columns before truncating rendered output.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

# Load the pickled stop-word collection. The file is opened in a `with` block so
# the handle is closed as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
# NOTE(review): this path is rooted at "solution/", while the model files loaded
# further down are rooted at "merito/" — confirm the intended working directory.
with open("solution/merito/utils/stopwords.pkl", "rb") as stopwords_file:
    russian_stopwords = pickle.load(stopwords_file)

In [6]:
import pymorphy2
# Shared morphological analyzer; a later cell calls morph.parse(token) and takes
# the normal form of the first parse to lemmatise job-title tokens.
morph = pymorphy2.MorphAnalyzer()


In [18]:
# Load a saved gensim Word2Vec model from disk; it is later handed to
# get_emb_by_modele together with the list of job titles.
model_demand = Word2Vec.load("merito/models/demand_word2vec_russian.model")

In [20]:
# Load the pickled classifier whose predict() is used later to label batches of
# `embeds`. The file is opened in a `with` block so the handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open("merito/models/sphere_clf.pickle", "rb") as clf_file:
    spheres_clf = pickle.load(clf_file)

In [38]:
# Read the job list table from CSV; the following cells clean and normalise its
# "job_title" column.
df = pd.read_csv("JOB_LIST.csv")

In [197]:
# Keep only the rows that actually have a job title.
df = df[df["job_title"].notna()]

In [195]:
# Keep the first occurrence of every job title and discard later repeats.
df = df[~df.duplicated(subset=["job_title"])]

In [None]:
# Run base_preproc over every job title (with a progress bar) and collect the
# results in a list, in the same order as the column.
preproc_jobs = [base_preproc(title) for title in tqdm(df["job_title"])]

In [199]:
# Replace every job title with the output of base_preproc (a helper defined
# outside this section), overwriting the column in place.
df["job_title"] = df["job_title"].map(base_preproc)

In [None]:
# Gather the distinct whitespace-separated tokens found across all job titles.
c = set()
for title in tqdm(df["job_title"]):
    c.update(title.split())

In [None]:
# Map each distinct token to the normal form of its first pymorphy2 parse, so
# every token is analysed once rather than once per occurrence.
lemm_dict = {token: morph.parse(token)[0].normal_form for token in tqdm(c)}

In [202]:
def _lemmatize_title(title):
    """Return ``title`` with each whitespace-separated token replaced by its ``lemm_dict`` entry."""
    return " ".join(lemm_dict[token] for token in title.split())


# Rewrite every job title as the space-joined sequence of its tokens' lemmas.
df["job_title"] = df["job_title"].map(_lemmatize_title)

In [54]:
# Predict a label for every row of `embeds`, feeding the classifier slices of
# 2000 rows at a time and appending each batch's labels in order.
preds = []
for start in tqdm(range(0, len(embeds), 2000)):
    batch = embeds.iloc[start:start + 2000]
    preds += spheres_clf.predict(batch).tolist()

100%|█████████████████████████████████████████| 119/119 [03:01<00:00,  1.52s/it]


In [58]:
# Pair the titles in `for_pred` with the predicted labels in `preds`, using the
# same column names ("job_title", "marking") that the clustering loop reads later.
# NOTE(review): `for_pred` is defined outside this section — presumably the titles
# that `embeds` was built from, in the same order; confirm before relying on it.
new_pred = pd.DataFrame({"job_title": for_pred.tolist(), "marking": preds})

In [61]:
# Keep only the rows in which every column has a value.
df = df[df.notna().all(axis=1)]

In [63]:
# Stack the rows of new_pred on top of the remaining rows of df.
# ignore_index is not set, so both frames keep their own index values and the
# combined index may contain repeats.
df = pd.concat([new_pred, df], axis=0)

In [65]:
# Compute embeddings for every job title by passing the Word2Vec model and the
# list of titles to get_emb_by_modele (a helper defined outside this section).
# NOTE(review): the result is later used with .reset_index() and pd.concat, so it
# is presumably a DataFrame with one row per title — confirm.
embeds_full = get_emb_by_modele(model_demand, df["job_title"].tolist())

100%|███████████████████████████████| 764228/764228 [00:04<00:00, 154013.42it/s]


In [74]:
# Place the job rows and their embedding rows side by side (column-wise concat).
# Both indexes are reset first so the two frames line up by position.
# NOTE(review): reset_index() is called without drop=True, so each frame's old
# index is presumably kept as an extra column and the result carries two of
# them — confirm this is intended.
df_featured = pd.concat([df.reset_index(), embeds_full.reset_index()], axis=1)

In [None]:
# Assign a "subspheres" label to every row of df_featured, one sphere
# ("marking" value) at a time:
#   1. fit TF-IDF and DBSCAN on a capped random sample of the sphere's titles,
#   2. propagate the DBSCAN labels to all titles of the sphere with k-NN,
#   3. name each resulting cluster after its most frequent job title.
MAX_TFIDF_FEATURES = 300    # upper bound on the TF-IDF vocabulary size
MAX_CLUSTER_SAMPLE = 75000  # cap on the number of rows clustered per sphere
SAMPLE_SEED = 42            # fixed seed so the down-sample is reproducible
KNN_NEIGHBORS = 5           # scikit-learn's default n_neighbors

for sphere_name in tqdm(df_featured["marking"].unique()):
    cat1 = df_featured[df_featured["marking"] == sphere_name]
    if cat1.empty:
        # A NaN marking never compares equal to itself, so it selects no rows;
        # skip it instead of letting the vectorizer fail on an empty corpus.
        continue

    cat1_downsample = cat1.sample(
        n=min(len(cat1), MAX_CLUSTER_SAMPLE), random_state=SAMPLE_SEED
    )
    vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES).fit(
        cat1_downsample["job_title"]
    )
    # max_features is only an upper bound: a small sphere can produce fewer
    # terms, so the column names follow the fitted vocabulary size rather than
    # a hard-coded 300.
    emb_columns = [f"emb_{i}" for i in range(len(vectorizer.vocabulary_))]

    vectors = vectorizer.transform(cat1_downsample["job_title"])
    emb = pd.DataFrame(vectors.toarray(), columns=emb_columns)
    clusterizer = DBSCAN(eps=0.3).fit(emb)
    tmp_classes = clusterizer.labels_

    # k-NN trained on the DBSCAN labels extends the clustering from the sample
    # to every row of the sphere. n_neighbors is capped by the sample size so
    # spheres with fewer rows than KNN_NEIGHBORS still work.
    knn = KNeighborsClassifier(
        n_neighbors=min(KNN_NEIGHBORS, len(cat1_downsample))
    ).fit(emb, tmp_classes)

    emb = pd.DataFrame(
        vectorizer.transform(cat1["job_title"]).toarray(), columns=emb_columns
    )
    clusters = knn.predict(emb)

    for clust in set(clusters):
        names = cat1.loc[clusters == clust, "job_title"]
        name_stat = Counter(names)
        # Pick the title with the highest count. The previous
        # max(..., key=lambda x: x[1]) compared the second character of each
        # title instead of its frequency.
        clust_name = name_stat.most_common(1)[0][0]
        df_featured.loc[names.index, "subspheres"] = clust_name

In [None]:
# Write the featured table, including the "subspheres" column filled in by the
# clustering loop, to a CSV file in the current working directory.
# NOTE(review): no index argument is passed, so pandas' default also writes the
# row index as a column — confirm that is wanted.
df_featured.to_csv("featured_jobs_titles.csv")