# Salary Predictor Model (10k Dataset)
Trains a salary prediction model using experience, geolocation, qualifications, work type, gender preference, and PCA-reduced job embeddings. Models: Ridge baseline and LightGBM with compact tuning; optional quantile regressors. Artifacts saved locally.

In [3]:
import os, pickle, json, itertools
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import lightgbm as lgb
from joblib import dump
np.random.seed(42)
# Robust path resolution for dataset and embeddings
bases = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent, Path(r'd:/ai courses/JobHunt')]
def find_file(rel, fallback):
    for b in bases:
        p = (b/rel).resolve()
        if p.exists(): return str(p)
    return fallback
data_path = find_file(Path('dataset/complete_dataset10k.csv'), r'd:\ai courses\JobHunt\dataset\complete_dataset10k.csv')
emb_path = find_file(Path('embeddings/job_embeddings.npy'), r'd:\ai courses\JobHunt\embeddings\job_embeddings.npy')
ids_path = find_file(Path('embeddings/job_ids.pkl'), r'd:\ai courses\JobHunt\embeddings\job_ids.pkl')
artifacts_dir='.'; os.makedirs(artifacts_dir, exist_ok=True)

## Load dataset and engineer features

In [4]:
df = pd.read_csv(data_path)
# Gender Preference feature engineering
if 'Preference' not in df.columns: df['Preference'] = np.nan
def _normalize_pref(x):
    if pd.isna(x): return 'none'
    s=str(x).strip().lower()
    if s in ('male','man','m','men'): return 'male'
    if s in ('female','woman','w','f','women'): return 'female'
    if s in ('any','either','both','no preference','none',''): return 'none'
    if 'male' in s or 'man' in s: return 'male'
    if 'female' in s or 'woman' in s or 'lady' in s: return 'female'
    return 'none'
df['gender_preference']=df['Preference'].apply(_normalize_pref)
df['prefers_male']=(df['gender_preference']=='male').astype(int)
df['prefers_female']=(df['gender_preference']=='female').astype(int)
df['has_preference']=((df['prefers_male']|df['prefers_female']).astype(int))
# Filter rows with valid target and basic imputations
df=df[df['salary_mid'].notna()].copy()
for c in ['experience_mid','latitude','longitude']:
    if df[c].isna().any(): df[c]=df[c].fillna(df[c].median())
for c in ['is_bachelor','is_master','is_phd','is_contract','is_part_time','is_internship','is_full_time']:
    df[c]=df[c].fillna(0).astype(int)
print('Data shape:', df.shape)

Data shape: (10000, 53)


## Load embeddings and fit PCA (train-only)

In [5]:
embeddings=np.load(emb_path)
with open(ids_path,'rb') as f: emb_ids=pickle.load(f)
id_to_idx={jid:i for i,jid in enumerate(emb_ids)}
rows=[]; miss=0
for jid in df['Job_Id'].tolist():
    i=id_to_idx.get(jid)
    rows.append(embeddings[i] if i is not None else np.zeros(embeddings.shape[1]))
    miss += (i is None)
emb_arr=np.vstack(rows); print('Embeddings aligned. Missing IDs:', miss)
idx_all=np.arange(len(df))
tr_idx, te_idx=train_test_split(idx_all, test_size=0.15, random_state=42)
tr_idx, va_idx=train_test_split(tr_idx, test_size=0.1765, random_state=42)
pca=PCA(n_components=50, random_state=42).fit(emb_arr[tr_idx])
emb_pca=pca.transform(emb_arr)
for k in range(emb_pca.shape[1]): df[f'emb_pca_{k+1}']=emb_pca[:,k]
print('Added PCA components:', emb_pca.shape[1])

Embeddings aligned. Missing IDs: 0
Added PCA components: 50


## Build features and splits

In [10]:
# Define feature groups
numeric_features = ['experience_mid', 'latitude', 'longitude']
qualification_features = ['is_bachelor', 'is_master', 'is_phd']
work_type_features = ['is_contract', 'is_part_time', 'is_internship', 'is_full_time']
preference_features = ['prefers_male', 'prefers_female', 'has_preference']
embedding_features = [c for c in df.columns if c.startswith('emb_pca_')]

# Combine all features
feature_cols = numeric_features + qualification_features + work_type_features + preference_features + embedding_features

# Prepare input and target arrays
X = df[feature_cols].values
y = df['salary_mid'].values
idx = np.arange(len(df))  # keep track of indices (optional)

# Split dataset: Train / Validation / Test
# 15% for test, remaining 85% for train+val
X_train_val, X_test, y_train_val, y_test, idx_train_val, idx_test = train_test_split(
    X, y, idx, test_size=0.15, random_state=42
)

# Further split train+val into Train (70%) and Validation (~15% of total)
# 0.1765 ≈ 15% / 85%
X_train, X_val, y_train, y_val, idx_train, idx_val = train_test_split(
    X_train_val, y_train_val, idx_train_val, test_size=0.1765, random_state=42
)

# Print shapes for sanity check
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")


Train shape: (6999, 63), Validation shape: (1501, 63), Test shape: (1500, 63)


## Baseline: Ridge Regression

In [11]:
# -------------------------------
# Define Ridge model pipeline
# -------------------------------
ridge = Pipeline([
    ('scaler', StandardScaler(with_mean=True)),
    ('model', Ridge(alpha=1.0, random_state=42))
])

# -------------------------------
# Train Ridge model
# -------------------------------
ridge.fit(X_train, y_train)

# -------------------------------
# Evaluation function
# -------------------------------
def eval_metrics(model, X_train, y_train, X_val, y_val, name="Model"):
    """
    Evaluates a regression model and prints MAE, RMSE, R², and Median AE for Train and Validation sets.
    """
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    
    # Evaluate for both splits
    for split_name, y_true, y_pred in [('Train', y_train, y_pred_train), 
                                       ('Validation', y_val, y_pred_val)]:
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # RMSE
        r2 = r2_score(y_true, y_pred)
        medae = median_absolute_error(y_true, y_pred)
        
        print(f"{name} {split_name} Metrics:")
        print(f"  MAE   : {mae:.2f}")
        print(f"  RMSE  : {rmse:.2f}")
        print(f"  R²    : {r2:.3f}")
        print(f"  MedAE : {medae:.2f}\n")

# -------------------------------
# Evaluate Ridge model
# -------------------------------
eval_metrics(ridge, X_train, y_train, X_val, y_val, name='Ridge')

Ridge Train Metrics:
  MAE   : 6437.36
  RMSE  : 7491.98
  R²    : 0.007
  MedAE : 6278.49

Ridge Validation Metrics:
  MAE   : 6432.81
  RMSE  : 7505.91
  R²    : -0.012
  MedAE : 6316.28



## Preferred: LightGBM with compact tuning

In [13]:
# -------------------------------
# Hyperparameter search
# -------------------------------
best_mae = float('inf')
best_params = None

learning_rates = [0.03, 0.05, 0.1]
n_estimators_list = [300, 500, 800]
num_leaves_list = [31, 63]

for lr, n_est, n_leaf in itertools.product(learning_rates, n_estimators_list, num_leaves_list):
    model = lgb.LGBMRegressor(
        random_state=42,
        learning_rate=lr,
        n_estimators=n_est,
        num_leaves=n_leaf,
        subsample=0.9,
        colsample_bytree=0.9
    )
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='l2')
    
    val_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    
    if mae < best_mae:
        best_mae = mae
        best_params = {'learning_rate': lr, 'n_estimators': n_est, 'num_leaves': n_leaf}

print(f"Best params: {best_params}, Validation MAE: {best_mae:.2f}")

# -------------------------------
# Refit best model on Train + Validation
# -------------------------------
X_train_val = np.vstack([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

lgbm = lgb.LGBMRegressor(
    random_state=42,
    **best_params,
    subsample=0.9,
    colsample_bytree=0.9
)
lgbm.fit(X_train_val, y_train_val)

# -------------------------------
# Evaluate on Test set
# -------------------------------
y_test_pred = lgbm.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2 = r2_score(y_test, y_test_pred)
medae = median_absolute_error(y_test, y_test_pred)

print(f"Test Metrics:\n  MAE: {mae:.2f}\n  RMSE: {rmse:.2f}\n  R²: {r2:.3f}\n  MedAE: {medae:.2f}\n")

# -------------------------------
# Feature importance
# -------------------------------
importances = lgbm.feature_importances_
top_features = sorted(zip(feature_cols, importances), key=lambda x: x[1], reverse=True)[:15]

print("Top 15 feature importances:")
for name, value in top_features:
    print(f"  {name}: {value}")

# -------------------------------
# Example predictions for first 5 test rows
# -------------------------------
print("\nExample predictions (first 5 test samples):")
for i in idx_test[:5]:
    true_salary = float(df.iloc[i]['salary_mid'])
    pred_salary = float(lgbm.predict(df.loc[[i], feature_cols].values)[0])
    print({
        'Job_Id': int(df.iloc[i]['Job_Id']),
        'true_salary_mid': true_salary,
        'pred_salary_mid': pred_salary
    })

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001910 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000801 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001770 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001262 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 6999, number of used features: 58
[LightGBM] [Info] Start training from score 82463.351907




Best params: {'learning_rate': 0.03, 'n_estimators': 300, 'num_leaves': 31}, Validation MAE: 6554.44
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13180
[LightGBM] [Info] Number of data points in the train set: 8500, number of used features: 58
[LightGBM] [Info] Start training from score 82455.705882
Test Metrics:
  MAE: 6631.88
  RMSE: 7743.04
  R²: -0.024
  MedAE: 6586.43

Top 15 feature importances:
  longitude: 1116
  latitude: 1055
  experience_mid: 500
  emb_pca_44: 169
  emb_pca_50: 164
  emb_pca_26: 156
  emb_pca_45: 156
  emb_pca_19: 144
  is_master: 143
  emb_pca_24: 142
  emb_pca_29: 142
  emb_pca_17: 141
  emb_pca_25: 137
  emb_pca_48: 136
  emb_pca_16: 134

Example predictions (first 5 test samples):
{'Job_Id': 557420268913803, 'true_salary_mid': 76500.0, 'pred_salary_mid': 80390.88488705443}
{'Job_Id': 2362353092278900, 'true_sal



## Save artifacts

In [14]:
dump(lgbm, os.path.join(artifacts_dir,'model_salary_mid.pkl'))
dump(pca, os.path.join(artifacts_dir,'pca_embeddings.pkl'))
with open(os.path.join(artifacts_dir,'feature_mapper.json'),'w') as f:
    json.dump({'feature_cols': feature_cols, 'pca_cols': pca_cols}, f)
print('Artifacts saved to:', os.path.abspath(artifacts_dir))

Artifacts saved to: /mnt/d/ai courses/JobHunt
