In [1]:
%config Completer.use_jedi = False
import gc
import re
import unicodedata

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from matplotlib import pyplot as plt

# Input locations: DS_PATH is the dataset root, BASE_PATH the folder holding
# the original train/test CSV files.
DS_PATH = '../input/uethackathon/'
BASE_PATH = '../input/uethackathon/original'

# Notebook-wide constants.
SEED = 42
N_FOLD = 5
NOT_FOUND_RANK = 35.5

# Raw tables: train split first, then test split.
info_train_df: pd.DataFrame = pd.read_csv(f'{BASE_PATH}/info_train.csv')
label_train_df: pd.DataFrame = pd.read_csv(f'{BASE_PATH}/label_train.csv')
work_train_df: pd.DataFrame = pd.read_csv(f'{BASE_PATH}/work_train.csv')

info_test_df: pd.DataFrame = pd.read_csv(f'{BASE_PATH}/info_test.csv')
label_test_df: pd.DataFrame = pd.read_csv(f'{BASE_PATH}/label_test.csv')
work_test_df: pd.DataFrame = pd.read_csv(f'{BASE_PATH}/work_test.csv')

rank_df = pd.read_csv(DS_PATH + 'city_rank.csv')

# One wide frame per split, joined on `id_bh`.  Columns that clash with the
# info table get a `_work` / `_label` suffix.  Only the train split has labels
# joined in.
all_train_df = (
    info_train_df
    .join(work_train_df.set_index('id_bh'), on='id_bh', rsuffix='_work')
    .join(label_train_df.set_index('id_bh'), on='id_bh', rsuffix='_label')
)
all_test_df = info_test_df.join(
    work_test_df.set_index('id_bh'), on='id_bh', rsuffix='_work'
)

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

# Token-id length every sentence is padded / truncated to in `get_embeddings`.
SEQ_LENGTH = 20

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Slow (pure-Python) tokenizer and the pretrained encoder, moved to `device`.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
phobert = AutoModel.from_pretrained("vinai/phobert-base")
phobert = phobert.to(device)

def get_embeddings(sentences):
    """Encode a batch of sentences with PhoBERT and return one vector each.

    INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!

    Every sentence is padded / truncated to ``SEQ_LENGTH`` token ids.  The
    tokenizer's attention mask is forwarded to the model so that padding
    positions are ignored; without it the model treats the padding tokens as
    real input and the resulting vector depends on how much padding was added.

    Args:
        sentences: iterable of strings to encode.

    Returns:
        A list with one entry per sentence; each entry is the model's second
        output (index 1, the pooled output for BERT/RoBERTa-style models in
        Hugging Face Transformers) converted to a plain list of floats.
        An empty input yields an empty list.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to encode; avoid sending an empty tensor through the model.
        return []

    encoded = tokenizer(
        sentences,
        max_length=SEQ_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = phobert(input_ids, attention_mask=attention_mask)

    return features[1].tolist()
    
## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# phobert = TFAutoModel.from_pretrained("vinai/phobert-base")

In [3]:
# Abbreviations seen in job titles and their expansions.  Keys and values are
# NFC-normalised so they compare equal to the normalised input text.
_JOB_ABBREVIATIONS = {
    unicodedata.normalize("NFC", short): unicodedata.normalize("NFC", full)
    for short, full in {
        "nv": "nhân viên",
        "pgđ": "phó giám đốc",
        "gđ": "giám đốc",
        "gv": "giáo viên",
        "ct": "chủ tịch",
        "cn": "công nhân",
        "cv": "chuyên viên",
        "tgđ": "tổng giám đốc",
        "cb": "cán bộ",
        "sc": "sửa chữa",
        "bqt": "ban quản trị",
        "bv": "bảo vệ",
        "p.": "phó",
        "qt": "quản trị",
        "kd": "kinh doanh",
        "xd": "xây dựng",
        "sx": "sản xuất",
        "qhkh": "quan hệ khách hàng",
    }.items()
}


def _abbreviation_regex(short: str) -> str:
    """Return a regex that matches `short` only as a stand-alone token."""
    # A boundary is only required on a side that ends in a letter/digit, so
    # "p." may be glued to the following word ("p.gđ") but not to a preceding
    # letter ("tp.").
    head = r"(?<!\w)" if short[0].isalnum() else ""
    tail = r"(?!\w)" if short[-1].isalnum() else ""
    return head + re.escape(short) + tail


# Longest abbreviations first so "tgđ" / "pgđ" are tried before "gđ".
_JOB_ABBREVIATION_PATTERN = re.compile(
    "|".join(
        _abbreviation_regex(short)
        for short in sorted(_JOB_ABBREVIATIONS, key=len, reverse=True)
    )
)


def clean_job_role(job: str) -> str:
    """Normalise a raw job title into a lower-case, abbreviation-free string.

    Steps:
      1. Stringify the value (so non-string inputs such as NaN become text),
         lower-case it and normalise it to Unicode NFC.  NFC matters because
         decomposed (NFD) Vietnamese text stores tone marks as separate
         combining characters, which the special-character filter below
         would otherwise delete.
      2. Expand known abbreviations.  An abbreviation is only expanded when it
         stands alone as a token, so ordinary words that merely contain the
         letters (e.g. "director" contains "ct") are left intact, and longer
         abbreviations are not broken by shorter ones ("tgđ" vs "gđ").
      3. Delete every character that is neither alphanumeric nor a space.
      4. Collapse runs of spaces and strip leading/trailing spaces.

    Args:
        job: raw job title; any value is accepted and converted with ``str``.

    Returns:
        The cleaned job title.
    """
    job = unicodedata.normalize("NFC", str(job).lower())

    # Pad each expansion with spaces so it never fuses with neighbouring
    # text; the surplus spaces are collapsed at the end.
    job = _JOB_ABBREVIATION_PATTERN.sub(
        lambda match: f" {_JOB_ABBREVIATIONS[match.group(0)]} ", job
    )

    # Remove special characters
    job = ''.join(ch for ch in job if ch.isalnum() or ch == ' ')

    return ' '.join(job.split())

In [4]:
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Number of distinct raw titles, for comparison with the cleaned count below.
all_jobs = pd.concat([work_train_df['job/role'], work_test_df['job/role']])
print("Before", all_jobs.nunique())

# NOTE: 'job/role' is overwritten in place with the cleaned title; later cells
# join on the cleaned values, so this cell is meant to run once per kernel.
work_train_df['job/role'] = work_train_df['job/role'].map(clean_job_role)
work_test_df['job/role'] = work_test_df['job/role'].map(clean_job_role)
all_jobs = pd.concat([work_train_df['job/role'], work_test_df['job/role']])
display(all_jobs.head())

# Unique cleaned titles, most frequent first.  `Series.iteritems()` was
# removed in pandas 2.0, so read the titles straight from the index.
all_jobs = all_jobs.value_counts().index.tolist()
print("After", len(all_jobs))

# Embed the titles in fixed-size batches.
embeddings = []
batch = 1000
for i in range(0, len(all_jobs), batch):
    # Slicing already clamps at the end of the list.
    embeddings.extend(get_embeddings(all_jobs[i:i + batch]))
print(np.shape(embeddings))

In [5]:
# Lookup table: one row per unique job title, one `job_embedding_<k>` column
# per embedding dimension.
embeddings = np.array(embeddings)
df_config = {
    'job/role': all_jobs,
    **{
        f'job_embedding_{dim}': embeddings[:, dim]
        for dim in range(embeddings.shape[1])
    },
}

df = pd.DataFrame(df_config)
display(df.head())

# Persist the table so the embeddings can be reused without re-running the model.
df.to_csv('./job_embeddings.csv', index=False)

In [6]:
def plot(features, labels=None):
    """Scatter-plot `features` projected onto their first two principal components.

    The features are reduced to two dimensions with PCA, each component is
    rescaled with ``MinMaxScaler`` and the points are drawn with seaborn.

    Args:
        features: 2-D array-like of shape (n_samples, n_features).
        labels: optional per-sample values used to colour the points; any
            array-like (pandas Series, list, ndarray) of length n_samples.
            When omitted the points are drawn without a hue.
    """
    pca = PCA(n_components=2)
    projected = MinMaxScaler().fit_transform(pca.fit_transform(features))

    df = pd.DataFrame({'x': projected[:, 0], 'y': projected[:, 1]})

    has_labels = labels is not None
    if has_labels:
        label_values = np.asarray(labels)
        print(projected.shape, label_values.shape)
        df['labels'] = label_values
    else:
        # `labels` is optional: do not touch its attributes when it is None.
        print(projected.shape)

    display(df.head())
    plt.figure()
    sns.scatterplot(data=df, x='x', y='y', hue='labels' if has_labels else None)
    plt.show()

display(work_train_df.head())

# Attach to every training work record the embedding of its job title
# (matched on 'job/role') and then the record's label (matched on 'id_bh').
job_embedding_lookup = df.set_index('job/role')
label_lookup = label_train_df.set_index('id_bh')

tmp = work_train_df.join(job_embedding_lookup, on='job/role', rsuffix='_embedding')
tmp = tmp.join(label_lookup, on='id_bh', rsuffix='_label')

In [7]:
# Columns that are neither grouping keys nor plotted features.
del_columns = [
    'id', 'id_bh', 'id_management',
    'id_office',
    'company_type', 'from_date', 'to_date', 'employee_lv',
    'address'
]

display(tmp.head())

# Group on every remaining column (job title + embedding dimensions) and
# average the rest, which gives the mean label per distinct job title.
excluded = set(del_columns) | {'label'}
groupby_columns = [col for col in tmp.columns if col not in excluded]

# `numeric_only=True` skips text columns explicitly.  Older pandas dropped
# them silently from `aggregate('mean')`; pandas >= 2.0 raises TypeError.
aggregated = tmp.groupby(groupby_columns, as_index=False).mean(numeric_only=True)
display(aggregated.head())
print(aggregated.columns)

# Keep only the embedding columns as features.  `errors='ignore'` tolerates
# columns that were already skipped by the numeric-only mean, so the list no
# longer has to be pruned by hand to match whatever survived aggregation.
dropped = ['job/role', 'label', *del_columns]
plot(aggregated.drop(columns=dropped, errors='ignore').to_numpy(), aggregated['label'])