In [6]:
import os
import re
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.utils import compute_class_weight, resample

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_squared_error,
    r2_score,
    silhouette_score
)

from sklearn.cluster import KMeans
from kneed import KneeLocator


In [7]:
df = pd.read_csv('../data/job_en_with_skills.csv') 
group_skill_list = pd.read_csv('../data/skill.csv')['group_skill'].tolist()

In [8]:
df.head()

Unnamed: 0,company_name,city_text,exp,item_tag,company_scale,company_field,position,education,min_salary,max_salary,average_salary,platform,job_des,job_require,skills,group_skill
0,Navigos Search's Client,Hà Nội,3.0,Product Owner,,Information technology,Staff,bachelor,75000000.0,125000000.0,100000000.0,vietnamworks,Mô tả công việc Navigos Search's Client is loo...,Yêu cầu công việc - A Bachelor’s degree in a r...,"agile, blockchain, data, go, performance, qa, ...","Data Systems, Languages, OS & Infrastructure, ..."
1,Navigos Search's Client,"Hà Nội, Đà Nẵng",6.0,Technical Lead (Nextjs & Nestjs) Remote,,Information technology,Team leader,unknown,75000000.0,125000000.0,100000000.0,vietnamworks,Mô tả công việc What You'll Be Doing • Lead ar...,Yêu cầu công việc • 6+ years of full-stack dev...,"api, architecture, design, docker, graphql, pe...","Data Systems, Languages, Libs & Frameworks, OS..."
2,Navigos Search's Client,"Hà Nội, Đà Nẵng",6.0,Technical Lead (Nextjs & Nestjs)_Remote,,Information technology,Team leader,unknown,75000000.0,125000000.0,100000000.0,vietnamworks,Mô tả công việc What You'll Be Doing • Lead ar...,Yêu cầu công việc • 6+ years of full-stack dev...,"api, architecture, design, docker, graphql, pe...","Data Systems, Languages, Libs & Frameworks, OS..."
3,Navigos Search's Client,Hà Nội,3.5,Editorial - Retouch Specialist,,Information technology,Staff,unknown,75000000.0,125000000.0,100000000.0,vietnamworks,Mô tả công việcTOP 4 JOB RESPONSIBILITIES Qual...,Yêu cầu công việcRequirements & Experience • 3...,,
4,"RGF HR Agent Vietnam Co., LTD",Hồ Chí Minh,8.0,116935 - Principal Backend Engineer - IT softw...,,Information technology,Staff,,97000000.0,100000000.0,98500000.0,careerviet,Mô tả Công việc - Work Location: Phu N...,YÊU CẦU CÔNG VIỆC <Necessary Skill / Experienc...,"agile, architecture, design, docker, express, ...","Languages, Libs & Frameworks, OS & Infrastruct..."


In [9]:
from sklearn.metrics import (
    accuracy_score, r2_score, f1_score, precision_score, recall_score
)

In [None]:
SEED = 83
random.seed(SEED)
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

def remove_outliers_percentile(df, col, lower_pct=0.01, upper_pct=0.99):
    lower = df[col].quantile(lower_pct)
    upper = df[col].quantile(upper_pct)
    return df[(df[col] >= lower) & (df[col] <= upper)]

df_work = remove_outliers_percentile(df.copy(), 'average_salary')

bins = [0, 8.5e6, 17e6, df['average_salary'].max()]
labels = [0, 1, 2]
df_work['salary_label'] = pd.cut(df_work['average_salary'], bins=bins, labels=labels).astype(int)

df_work['num_skills'] = df_work['skills'].apply(lambda x: len(str(x).split(',')) if pd.notnull(x) else 0)
df_work['num_group_skills'] = df_work['group_skill'].apply(lambda x: len(str(x).split(',')) if pd.notnull(x) else 0)
df_work['has_data_skill'] = df_work['group_skill'].apply(lambda x: int('data' in str(x).lower()) if pd.notnull(x) else 0)

def augment_text(text, n_aug=2):
    words = text.split()
    for _ in range(n_aug):
        if len(words) > 2:
            idx1, idx2 = sorted(random.sample(range(len(words)), 2))
            words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

augmented_rows = []
for i, row in df_work.iterrows():
    if random.random() < 0.1: 
        new_row = row.copy()
        new_row['job_des'] = augment_text(str(row['job_des']))
        new_row['job_require'] = augment_text(str(row['job_require'])) 
        augmented_rows.append(new_row)

df_aug = pd.DataFrame(augmented_rows)
df_work = pd.concat([df_work, df_aug], ignore_index=True)

# 4. Encode features
def target_encode(df, col_name, target='average_salary'):
    return df.groupby(col_name)[target].transform('mean')

categorical_cols = ['city_text', 'company_field', 'position', 'platform']
for col in categorical_cols:
    df_work[col + '_enc'] = target_encode(df_work, col)

skill_columns = [col for col in df.columns if col.startswith('skill_')]
correlations = df_work[skill_columns + ['average_salary']].corr(numeric_only=True)['average_salary'].drop('average_salary').abs()
top_group_skill = correlations.sort_values(ascending=False).head(10).index.tolist()


feature_cols = top_group_skill + ['exp', 'num_skills', 'num_group_skills', 'has_data_skill'] + [col + '_enc' for col in categorical_cols]

df_work.columns = [col.replace(" ", "_").replace(".", "_").replace("-", "_") for col in df_work.columns]
X = df_work[feature_cols].fillna(0)
y = df_work['salary_label']

df_balanced = pd.concat([
    resample(df_work[df_work.salary_label == label], 
             replace=True, 
             n_samples=df_work['salary_label'].value_counts().max(), 
             random_state=SEED)
    for label in df_work['salary_label'].unique()
], ignore_index=True)

X = df_balanced[feature_cols].fillna(0)
y = df_balanced['salary_label']

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print("✅ Finished processing features, balancing labels, and applying data augmentation.")
print(f"Number of samples after augmentation and balancing: {len(df_balanced)}")
print(f"Label distribution: \n{y.value_counts(normalize=True)}")

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=SEED
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='mlogloss', random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=SEED),
}

print("🎯 Result in classification (test):")
for name, model in models.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        precision = precision_score(y_test, y_pred, average='macro')

        print(f"{name:<20} 🎯 Accuracy = {acc:.4f} | R² = {r2:.4f} | F1 = {f1:.4f} | Recall = {recall:.4f} | Precision = {precision:.4f}")
    except Exception as e:
        print(f"{name:<20} ❌ Error: {e}")

✅ Finished processing features, balancing labels, and applying data augmentation.
Number of samples after augmentation and balancing: 5799
Label distribution: 
salary_label
2    0.333333
1    0.333333
0    0.333333
Name: proportion, dtype: float64
🎯 Kết quả phân loại (trên tập test):
Logistic Regression  🎯 Accuracy = 0.6612 | R² = 0.3364 | F1 = 0.6647 | Recall = 0.6611 | Precision = 0.6912
KNN                  🎯 Accuracy = 0.8181 | R² = 0.6261 | F1 = 0.8139 | Recall = 0.8183 | Precision = 0.8161
Random Forest        🎯 Accuracy = 0.8802 | R² = 0.7814 | F1 = 0.8790 | Recall = 0.8803 | Precision = 0.8813
Decision Tree        🎯 Accuracy = 0.8716 | R² = 0.7607 | F1 = 0.8700 | Recall = 0.8717 | Precision = 0.8743
XGBoost              🎯 Accuracy = 0.8707 | R² = 0.7633 | F1 = 0.8694 | Recall = 0.8708 | Precision = 0.8712


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Gradient Boosting    🎯 Accuracy = 0.7560 | R² = 0.5136 | F1 = 0.7576 | Recall = 0.7561 | Precision = 0.7615
