# IMPORT LIBRARIES AND LOAD DATA 

In [1]:
# import libraries
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor


In [2]:
# load data
df = pd.read_csv('../data/job_en_final.csv')

In [3]:
df.head()

Unnamed: 0,city_text,company_field,position,platform,exp_log_z,average_salary_log_z,skill_.net,skill_3d,skill_Not specified,skill_agile,...,skill_wpf,skill_xamarin,skill_xcode,skill_xml,group_skill_Data Systems,group_skill_Development Tools,group_skill_Languages,group_skill_Libs & Frameworks,group_skill_OS & Infrastructure,group_skill_Process & Methods
0,20,10,2,2,0.85056,2.90382,0,0,0,1,...,0,0,0,0,1,0,1,0,1,1
1,37,10,3,2,2.041137,2.90382,0,0,0,0,...,0,0,0,0,1,0,1,1,1,1
2,37,10,3,2,2.041137,2.90382,0,0,0,0,...,0,0,0,0,1,0,1,1,1,1
3,20,10,2,2,1.101143,2.90382,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,54,10,2,0,2.575805,2.877022,0,0,0,1,...,0,0,0,0,0,0,1,1,1,1


# LINEAR REGRESSION

In [4]:
X = df.drop(columns=['average_salary_log_z'])
y = df['average_salary_log_z']

In [5]:
from sklearn.model_selection import train_test_split

# Test 15%
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.15,
    random_state=42
)

# Train / Validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp,
    test_size=0.1765,  # ~15% tổng
    random_state=42
)


In [6]:
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [7]:
y_test_pred = model.predict(X_test)

print("TEST")
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print("R² :", r2_score(y_test, y_test_pred))


TEST
MAE: 0.5919954682784252
MSE: 0.6771955236182674
R² : 0.3312876718937052


# Lọc biến

In [8]:
corr = pd.concat([X_train, y_train], axis=1).corr()['average_salary_log_z']
corr = corr.drop('average_salary_log_z')

selected = corr[abs(corr) > 0.05].index


In [9]:
import statsmodels.api as sm

X_sm = sm.add_constant(X_train)
model = sm.OLS(y_train, X_sm).fit()

model.summary()


0,1,2,3
Dep. Variable:,average_salary_log_z,R-squared:,0.577
Model:,OLS,Adj. R-squared:,0.53
Method:,Least Squares,F-statistic:,12.26
Date:,"Fri, 16 Jan 2026",Prob (F-statistic):,2.03e-234
Time:,15:30:33,Log-Likelihood:,-2102.9
No. Observations:,2116,AIC:,4632.0
Df Residuals:,1903,BIC:,5837.0
Df Model:,212,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.4422,0.105,-13.779,0.000,-1.647,-1.237
city_text,-0.0003,0.001,-0.302,0.763,-0.002,0.002
company_field,-0.0107,0.004,-3.031,0.002,-0.018,-0.004
position,0.5690,0.031,18.596,0.000,0.509,0.629
platform,0.1793,0.032,5.584,0.000,0.116,0.242
exp_log_z,0.3293,0.020,16.817,0.000,0.291,0.368
skill_.net,0.1729,0.174,0.991,0.322,-0.169,0.515
skill_3d,-0.0513,0.075,-0.681,0.496,-0.199,0.096
skill_Not specified,0.0722,0.067,1.070,0.285,-0.060,0.205

0,1,2,3
Omnibus:,118.82,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,241.824
Skew:,0.381,Prob(JB):,3.08e-53
Kurtosis:,4.47,Cond. No.,6.32e+17


In [10]:
# delete if p_value > 0.05
p_values = model.pvalues
columns_to_delete = p_values[p_values > 0.05].index.tolist()
columns_to_delete = [col for col in columns_to_delete if col != 'const']
X_train.drop(columns=columns_to_delete, inplace=True, errors='ignore')
X_valid.drop(columns=columns_to_delete, inplace=True, errors='ignore')
X_test.drop(columns=columns_to_delete, inplace=True, errors='ignore')


In [11]:
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

(2116, 35)
(454, 35)
(454, 35)


# VIF

In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

X_vif = X_train.copy()

vif_df = pd.DataFrame()
vif_df['Feature'] = X_vif.columns
vif_df['VIF'] = [
    variance_inflation_factor(X_vif.values, i)
    for i in range(X_vif.shape[1])
]

vif_df.sort_values('VIF', ascending=False)


Unnamed: 0,Feature,VIF
1,position,7.682633
0,company_field,5.87584
2,platform,4.239874
20,skill_linux,1.772064
8,skill_bash,1.629783
33,skill_windows,1.627184
27,skill_scripting,1.519757
22,skill_performance,1.460471
12,skill_cloud,1.390176
29,skill_spark,1.340765


In [13]:
# delete if vif_value > 5
columns_to_delete = vif_df[vif_df['VIF'] > 5]['Feature'].tolist()
X_vif.drop(columns=columns_to_delete, inplace=True, errors='ignore')
X_train = X_vif.copy()
X_valid.drop(columns=columns_to_delete, inplace=True, errors='ignore')
X_test.drop(columns=columns_to_delete, inplace=True, errors='ignore')

# Linear – Ridge – Lasso

In [14]:


models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01)
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append({
        'Model': name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,MAE,MSE,R²
0,Linear,0.604491,0.659275,0.348984
1,Ridge,0.599048,0.644856,0.363222
2,Lasso,0.598135,0.635186,0.372771


# MACHINE LEARNING MODELS

In [15]:
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
    "Decision Tree": DecisionTreeRegressor(max_depth=6, random_state=42),
    "Random Forest": RandomForestRegressor(
        n_estimators=200, max_depth=8, random_state=42
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42
    ),
    "SVR": SVR(kernel='rbf', C=10, epsilon=0.1)
}


In [16]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append({
        "Model": name,
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R²": r2_score(y_test, y_pred)
    })

results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
results_df


Unnamed: 0,Model,MAE,MSE,R²
6,SVR,0.566755,0.623791,0.384024
4,Random Forest,0.577649,0.624196,0.383623
5,Gradient Boosting,0.580703,0.634573,0.373376
2,Lasso,0.598135,0.635186,0.372771
1,Ridge,0.599048,0.644856,0.363222
3,Decision Tree,0.593509,0.648449,0.359674
0,Linear,0.604491,0.659275,0.348984


# FINE TUNE MODEL

In [17]:
param_grid = {
    "C": [0.1, 1, 10, 50],
    "epsilon": [0.01, 0.05, 0.1],
    "gamma": ['scale', 0.1, 0.01]
}


In [18]:
best_score = -1
best_model = None
results = []

for C in param_grid["C"]:
    for eps in param_grid["epsilon"]:
        for gamma in param_grid["gamma"]:
            model = SVR(kernel='rbf', C=C, epsilon=eps, gamma=gamma)
            model.fit(X_train, y_train)

            y_val_pred = model.predict(X_valid)
            r2 = r2_score(y_valid, y_val_pred)

            results.append({
                "C": C,
                "epsilon": eps,
                "gamma": gamma,
                "R2_valid": r2
            })

            if r2 > best_score:
                best_score = r2
                best_model = model


In [19]:
tune_results = pd.DataFrame(results).sort_values("R2_valid", ascending=False)
tune_results.head()


Unnamed: 0,C,epsilon,gamma,R2_valid
13,1.0,0.05,0.1,0.46524
16,1.0,0.1,0.1,0.464925
29,50.0,0.01,0.01,0.464343
35,50.0,0.1,0.01,0.463913
32,50.0,0.05,0.01,0.461456


In [20]:
y_test_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("SVR (Best)")
print("MAE:", mae)
print("MSE:", mse)
print("R²:", r2)


SVR (Best)
MAE: 0.5601657325681532
MSE: 0.5850367850584712
R²: 0.42229194240096257


In [21]:
print("Train R²:", r2_score(y_train, best_model.predict(X_train)))
print("Valid R²:", r2_score(y_valid, best_model.predict(X_valid)))
print("Test  R²:", r2_score(y_test, y_test_pred))


Train R²: 0.4798504794248384
Valid R²: 0.4652396828244725
Test  R²: 0.42229194240096257


# DEEP LEARNING

In [22]:
mlp = MLPRegressor(
    hidden_layer_sizes=(32, 16),
    activation='relu',
    solver='adam',
    alpha=0.0005,         
    learning_rate_init=0.001,
    max_iter=500,
    early_stopping=True,   
    validation_fraction=0.2,
    n_iter_no_change=20,
    random_state=42
)


In [23]:
mlp.fit(X_train, y_train)

y_test_pred = mlp.predict(X_test)

mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r2  = r2_score(y_test, y_test_pred)

print("MLP Regression")
print("MAE:", mae)
print("MSE:", mse)
print("R²:", r2)


MLP Regression
MAE: 0.572012182766964
MSE: 0.6008336292146671
R²: 0.40669298454610137


In [24]:
print("Train R²:", r2_score(y_train, mlp.predict(X_train)))
print("Valid R²:", r2_score(y_valid, mlp.predict(X_valid)))
print("Test  R²:", r2)


Train R²: 0.502749848572732
Valid R²: 0.4286663995281529
Test  R²: 0.40669298454610137


# TUNE DEEP LEARNING

In [25]:
architectures = [
    (32,),
    (64,),
    (32, 16),
    (64, 32)
]

alphas = [0.0005, 0.001, 0.005, 0.01]
lrs = [0.001, 0.0005]


In [26]:
results = []
best_r2 = -1
best_model = None

for arch in architectures:
    for alpha in alphas:
        for lr in lrs:
            mlp = MLPRegressor(
                hidden_layer_sizes=arch,
                activation='relu',
                solver='adam',
                alpha=alpha,
                learning_rate_init=lr,
                max_iter=400,
                early_stopping=True,
                validation_fraction=0.2,
                n_iter_no_change=20,
                random_state=42
            )

            mlp.fit(X_train, y_train)
            r2_val = r2_score(y_valid, mlp.predict(X_valid))

            results.append({
                "Architecture": arch,
                "alpha": alpha,
                "lr": lr,
                "R2_valid": r2_val
            })

            if r2_val > best_r2:
                best_r2 = r2_val
                best_model = mlp


In [27]:
tune_df = pd.DataFrame(results).sort_values("R2_valid", ascending=False)
tune_df.head()


Unnamed: 0,Architecture,alpha,lr,R2_valid
15,"(64,)",0.01,0.0005,0.436131
8,"(64,)",0.0005,0.001,0.435838
12,"(64,)",0.005,0.001,0.435417
5,"(32,)",0.005,0.0005,0.435213
10,"(64,)",0.001,0.001,0.435179


In [28]:

y_test_pred = best_model.predict(X_test)

print("Best Deep Learning Model")
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print("R² :", r2_score(y_test, y_test_pred))


Best Deep Learning Model
MAE: 0.5733267495797119
MSE: 0.610901880923082
R² : 0.3967508573389973


In [29]:
print("Train R²:", r2_score(y_train, best_model.predict(X_train)))
print("Valid R²:", r2_score(y_valid, best_model.predict(X_valid)))
print("Test  R²:", r2_score(y_test, y_test_pred))


Train R²: 0.5109474739308335
Valid R²: 0.4361312084836869
Test  R²: 0.3967508573389973


# SAVE MODEL

In [30]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import joblib


In [31]:
X_train.head()

Unnamed: 0,platform,exp_log_z,skill_algorithm,skill_appium,skill_architecture,skill_arm,skill_bash,skill_boost,skill_c++,skill_cisco,...,skill_sap,skill_scala,skill_scripting,skill_scrum,skill_spark,skill_symfony,skill_testing,skill_unit-testing,skill_windows,skill_xcode
1426,1,-0.624102,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2760,2,-2.098765,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1986,2,-0.624102,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1547,2,0.85056,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1455,1,-0.624102,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
import joblib
import pandas as pd
import os
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

num_features = X_train.shape[1]
print(f"📊 Dữ liệu X_train hiện tại có: {num_features} cột.")

print("🔄 Đang train lại SVR pipeline...")
model_pipeline = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
model_pipeline.fit(X_train, y_train)
print("✅ Train xong!")

if hasattr(X_train, 'columns'):
    real_columns = list(X_train.columns)
else:
    # Nếu X_train là numpy array (mất tên), ta phải lấy từ X gốc
    print("⚠️ X_train đang là array (mất tên cột). Lấy tên từ X gốc...")
    real_columns = list(X.columns)
    if len(real_columns) > num_features:
        print(f"⚠️ Cảnh báo: X gốc có {len(real_columns)} cột nhưng X_train chỉ có {num_features}.")
        print("👉 Đang giữ lại các cột khớp nhất...")
        real_columns = real_columns[:num_features]

print(f"📝 Danh sách cột sẽ lưu ({len(real_columns)} cột): {real_columns}")

save_dir = 'pipeline' 
if not os.path.exists(save_dir) and os.path.exists('../pipeline'):
    save_dir = '../pipeline'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Lưu Model
joblib.dump(model_pipeline, os.path.join(save_dir, 'model.pkl'))
# Lưu danh sách cột
joblib.dump(real_columns, os.path.join(save_dir, 'model_columns.pkl'))

print("------------------------------------------------------")
print(f"💾 ĐÃ LƯU THÀNH CÔNG VÀO: {save_dir}")
print(f"✅ Model và Danh sách cột đã đồng bộ: Cùng là {len(real_columns)} cột.")

📊 Dữ liệu X_train hiện tại có: 33 cột.
🔄 Đang train lại SVR pipeline...
✅ Train xong!
📝 Danh sách cột sẽ lưu (33 cột): ['platform', 'exp_log_z', 'skill_algorithm', 'skill_appium', 'skill_architecture', 'skill_arm', 'skill_bash', 'skill_boost', 'skill_c++', 'skill_cisco', 'skill_cloud', 'skill_confluence', 'skill_debian', 'skill_etl', 'skill_firewall', 'skill_hardware', 'skill_java', 'skill_jenkins', 'skill_linux', 'skill_matlab', 'skill_performance', 'skill_php', 'skill_saas', 'skill_sap', 'skill_scala', 'skill_scripting', 'skill_scrum', 'skill_spark', 'skill_symfony', 'skill_testing', 'skill_unit-testing', 'skill_windows', 'skill_xcode']
------------------------------------------------------
💾 ĐÃ LƯU THÀNH CÔNG VÀO: ../pipeline
✅ Model và Danh sách cột đã đồng bộ: Cùng là 33 cột.
