In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from xgboost import XGBRegressor

import sys
from pathlib import Path
sys.path.append('..')
from paths import DATA_DIR


In [4]:

df = pd.read_csv(DATA_DIR / "prepared_data.csv")
df


Unnamed: 0,salary_usd,title,category,position_grouped,english_level,it_experience_years
0,2000.0,Middle,Marketing,Marketing Manager,Upper-Intermediate,4.00
1,3000.0,Senior,Software Engineer,"Software Engineer / Developer (frontend, backe...",Pre-Intermediate,8.00
2,282.0,Junior,Software Engineer,"Software Engineer / Developer (frontend, backe...",Pre-Intermediate,0.25
3,2700.0,Middle,Quality Assurance,Manual QA,Upper-Intermediate,4.00
4,2500.0,Middle,Software Engineer,"Software Engineer / Developer (frontend, backe...",Intermediate,4.00
...,...,...,...,...,...,...
11680,3000.0,Middle,Software Engineer,"Software Engineer / Developer (frontend, backe...",Intermediate,4.00
11681,2500.0,Middle,Analyst,System Analyst,Upper-Intermediate,1.50
11682,4500.0,Senior,Software Engineer,"Software Engineer / Developer (frontend, backe...",Upper-Intermediate,9.00
11683,1000.0,Middle,Software Engineer,"Software Engineer / Developer (frontend, backe...",Upper-Intermediate,3.00


In [5]:
df.dtypes


salary_usd             float64
title                   object
category                object
position_grouped        object
english_level           object
it_experience_years    float64
dtype: object

In [6]:
X = df.drop(columns=['salary_usd'])
y = df['salary_usd']
print(f'X = {X.shape}, y = {y.shape}')


X = (11685, 5), y = (11685,)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.8,
                                                    random_state=25,
                                                    shuffle=True)


In [8]:
# Скидаємо індекси після train_test_split, оскільки інакше виникає помилка при fit у sklearn pipeline (індекси не співпадають)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


In [9]:
for name, arr in zip(['X_train', 'X_test', 'y_train', 'y_test'], [X_train, X_test, y_train, y_test]):
    print(f'{name} = {arr.shape}')


X_train = (9348, 5)
X_test = (2337, 5)
y_train = (9348,)
y_test = (2337,)


In [22]:
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
ordinal_features = ['english_level']
frequency_features = ['category']
target_features = ['title']


In [11]:
# labeling categories for OrdinalEncoder
level_order = [
    'Не знаю взагалі',
    'Elementary',
    'Pre-Intermediate',
    'Intermediate',
    'Upper-Intermediate',
    'Advanced'
]


In [12]:
import sys
import os
sys.path.append(os.path.abspath(".."))
from src.encoders import TargetEncoder, FrequencyEncoder


In [13]:
preprocessor = ColumnTransformer([
    ('num_scaled', StandardScaler(), num_cols), # scaling numeric cols

    # pipeline for processing under TargetEncoder
    ('target_scaled', Pipeline([
        ('encoder', TargetEncoder()),
        ('scaler', StandardScaler())
    ]), target_features),

    # pipeline for processing under FrequencyEncoder
    ('frequency_scaled', Pipeline([
        ('encoder', FrequencyEncoder()),
        ('scaler', StandardScaler())
    ]), frequency_features),

    # pipeline for processing under OrdinalEncoder
    ('ordinal_scaled', Pipeline([
        ('encoder', OrdinalEncoder(categories=[level_order])),
        ('scaler', StandardScaler())
    ]), ordinal_features)
])

preprocessor


In [None]:
models = {
    "LinearRegression": (
        LinearRegression(),
        {"regressor__fit_intercept": [True, False]}  # додавати константу чи ні
    ),
    "Ridge": (
        Ridge(),
        {
            "regressor__alpha": [0.1, 1.0, 10.0],        # сила L2 регуляризації
            "regressor__fit_intercept": [True, False]    # додавати константу чи ні
        }
    ),
    "Lasso": (
        Lasso(max_iter=5000),
        {
            "regressor__alpha": [0.001, 0.01, 0.1, 1.0] # сила L1 регуляризації (занулює коефіцієнти)
        }
    ),
    "ElasticNet": (
        ElasticNet(max_iter=5000),
        {
            "regressor__alpha": [0.001, 0.01, 0.1, 1.0], # сила регуляризації (L1+L2)
            "regressor__l1_ratio": [0.2, 0.5, 0.8]       # баланс L1/L2 (0=L2,1=L1)
        }
    ),
    "SVR": (
        SVR(),
        {
            "regressor__kernel": ["linear", "rbf"],  # тип ядра
            "regressor__C": [0.1, 1, 10]            # регуляризація (чим більше → складніше)
        }
    ),
    "KNN": (
        KNeighborsRegressor(),
        {
            "regressor__n_neighbors": [3, 5, 7],         # кількість сусідів
            "regressor__weights": ["uniform", "distance"] # вага сусідів
        }
    ),
    "DecisionTree": (
        DecisionTreeRegressor(random_state=25),
        {
            "regressor__max_depth": [3, 5, 10, None] # максимальна глибина дерева
        }
    ),
    "RandomForest": (
        RandomForestRegressor(random_state=25),
        {
            "regressor__n_estimators": [50, 100],   # кількість дерев
            "regressor__max_depth": [None, 5, 10]   # глибина дерев
        }
    ),
    "XGBoost": (
        XGBRegressor(objective="reg:squarederror", random_state=25, n_jobs=-1),
        {
            "regressor__n_estimators": [100, 300],        # кількість дерев
            "regressor__learning_rate": [0.01, 0.05, 0.1],# швидкість навчання
            "regressor__max_depth": [3, 5, 7],            # максимальна глибина дерева
            "regressor__min_child_weight": [1, 3],        # мінімальна вага вузла
            "regressor__subsample": [0.8, 1.0],          # частка рядків для дерева
            "regressor__colsample_bytree": [0.8, 1.0]    # частка фіч для дерева
        })
}


In [None]:
def run_regressions(X_train, X_test, y_train, y_test):

    results = []

    for name, (model, params) in models.items():
        base_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])

        grid = GridSearchCV(estimator = base_pipeline,
                            param_grid = params,
                            scoring='r2',
                            n_jobs=-1)
        grid.fit(X_train, y_train)

        y_pred_test = grid.predict(X_test)
        y_pred_train = grid.predict(X_train)

        def calculate_metrics(var, ground_truth, predictions):
            R2 = r2_score(ground_truth, predictions)
            MAE = mean_absolute_error(ground_truth, predictions)
            RMSE = np.sqrt(mean_squared_error(ground_truth, predictions))
            REL_MAE = MAE / np.mean(predictions) * 100

            result = {
                'model': name,
                'dataset_var': var,
                'R2': round(R2, 2),
                'MAE': round(MAE, 1),
                'RMSE': round(RMSE, 1),
                'REL_MAE': round(REL_MAE, 1)
            }
            if var == 'test':
                result['best_params'] = grid.best_params_
            return result

        results.append(calculate_metrics(var='train', ground_truth=y_train, predictions=y_pred_train))
        results.append(calculate_metrics(var='test', ground_truth=y_test, predictions=y_pred_test))
        print(f'Training for {model} is ended.')

    df_results = pd.DataFrame(results)
    return df_results


In [None]:
regressions = run_regressions(X_train, X_test, y_train, y_test)


Training for LinearRegression() is ended.
Training for Ridge() is ended.
Training for Lasso(max_iter=5000) is ended.
Training for ElasticNet(max_iter=5000) is ended.
Training for SVR() is ended.
Training for KNeighborsRegressor() is ended.
Training for DecisionTreeRegressor(random_state=25) is ended.
Training for RandomForestRegressor(random_state=25) is ended.
Training for XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
        

In [None]:
regressions.sort_values(by=['model', 'dataset_var', 'R2'], ascending=False)


Unnamed: 0,model,dataset_var,R2,MAE,RMSE,REL_MAE,best_params
16,XGBoost,train,0.62,919.1,1422.0,28.0,
17,XGBoost,test,0.51,1015.0,1653.2,31.1,"{'regressor__colsample_bytree': 0.8, 'regresso..."
8,SVR,train,0.45,1098.8,1717.8,37.0,
9,SVR,test,0.43,1080.9,1780.9,36.9,"{'regressor__C': 10, 'regressor__kernel': 'lin..."
2,Ridge,train,0.47,1124.5,1686.5,34.7,
3,Ridge,test,0.45,1115.7,1750.8,34.9,"{'regressor__alpha': 10.0, 'regressor__fit_int..."
14,RandomForest,train,0.65,874.8,1370.5,27.1,
15,RandomForest,test,0.48,1025.9,1695.4,31.7,"{'regressor__max_depth': 10, 'regressor__n_est..."
0,LinearRegression,train,0.47,1124.6,1686.5,34.7,
1,LinearRegression,test,0.45,1115.7,1750.7,34.9,{'regressor__fit_intercept': True}


---


Після виконання інспекції, було виявлено та додано:
- некоректні значення (0) та екстремально високі значення у цільовій зміні, що ідентифікується, як викиди.
- дані нерівномірно розподілені (рекомендується використовувати Log-трансформацію - додано у пайплайн)
- використано RobustScaler(), який краще працює із екстремальними фічами. (upd: not helped (X).)

In [None]:
from src.encoders import TwoStageRegressor


In [20]:
models_updated = {
    "LinearRegression": (
        LinearRegression(),
        {"regressor__regressor__fit_intercept": [True, False]}
    ),
    "Ridge": (
        Ridge(),
        {
            "regressor__regressor__alpha": [0.1, 1.0, 10.0],
            "regressor__regressor__fit_intercept": [True, False]
        }
    ),
    "Lasso": (
        Lasso(max_iter=5000),
        {"regressor__regressor__alpha": [0.001, 0.01, 0.1, 1.0]}
    ),
    "ElasticNet": (
        ElasticNet(max_iter=5000),
        {
            "regressor__regressor__alpha": [0.001, 0.01, 0.1, 1.0],
            "regressor__regressor__l1_ratio": [0.2, 0.5, 0.8]
        }
    ),
    "SVR": (
        SVR(),
        {
            "regressor__regressor__kernel": ["linear", "rbf"],
            "regressor__regressor__C": [0.1, 1, 10]
        }
    ),
    "KNN": (
        KNeighborsRegressor(),
        {
            "regressor__regressor__n_neighbors": [3, 5, 7],
            "regressor__regressor__weights": ["uniform", "distance"]
        }
    ),
    "DecisionTree": (
        DecisionTreeRegressor(random_state=25),
        {
            "regressor__regressor__max_depth": [3, 5, 10, None]
        }
    ),
    "RandomForest": (
        RandomForestRegressor(random_state=25),
        {
            "regressor__regressor__n_estimators": [50, 100],
            "regressor__regressor__max_depth": [None, 5, 10]
        }
    ),
    "GradientBoosting": (
        GradientBoostingRegressor(random_state=25),
        {
            "regressor__regressor__n_estimators": [100, 300],
            "regressor__regressor__learning_rate": [0.01, 0.05, 0.1],
            "regressor__regressor__max_depth": [3, 5, 7],
            "regressor__regressor__min_samples_split": [2, 5, 10]
        }
    ),
    "XGBoost": (
        XGBRegressor(objective="reg:squarederror", random_state=25, n_jobs=-1),
        {
            "regressor__regressor__n_estimators": [100, 300],
            "regressor__regressor__learning_rate": [0.01, 0.05, 0.1],
            "regressor__regressor__max_depth": [3, 5, 7],
            "regressor__regressor__min_child_weight": [1, 3],
            "regressor__regressor__subsample": [0.8, 1.0],
            "regressor__regressor__colsample_bytree": [0.8, 1.0]
        }
    ),
    "Huber": (
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        {
            "regressor__regressor__epsilon": [1.15, 1.35, 1.5],
            "regressor__regressor__alpha": [0.0001, 0.001, 0.01]
        }
    ),
    "HistGBM": (
        HistGradientBoostingRegressor(random_state=25),
        {
            "regressor__regressor__max_iter": [100, 200, 300],
            "regressor__regressor__learning_rate": [0.01, 0.05, 0.1],
            "regressor__regressor__max_depth": [3, 5, None],
            "regressor__regressor__min_samples_leaf": [20, 30, 50],
            "regressor__regressor__l2_regularization": [0.0, 0.1, 0.5]
        }
    ),
    "SVR": (
        SVR(),
        {
            "regressor__regressor__kernel": ["linear", "rbf"],
            "regressor__regressor__C": [0.1, 1, 10],
            "regressor__regressor__epsilon": [0.01, 0.1, 0.2]
        }
    )

    #     ,
    # "TwoStageRegressor": (
    #     TwoStageRegressor(
    #         classifier=LogisticRegression(),
    #         reg_low=LinearRegression(),
    #         reg_high=RandomForestRegressor()
    #     ),
    #     {
    #         "regressor__regressor__threshold": [0.90, 0.95]
    #     }
    # )
}


In [14]:
df_new = pd.read_csv(DATA_DIR / "prep_data_v2.csv")
df_new.head(5)


Unnamed: 0,salary_usd,title,category,english_level,it_experience_years
0,2000.0,Middle,Analyst,Pre-Intermediate,2.0
1,4000.0,Senior,Quality Assurance,Intermediate,5.0
2,10000.0,Lead / Team Lead,DevOps,Upper-Intermediate,9.0
3,3500.0,Senior,Management,Advanced,5.0
4,3000.0,Lead / Team Lead,Management,Pre-Intermediate,18.0


In [None]:
def preparing_and_split(dataframe, target):
    X = dataframe.drop(columns = [target])
    y = dataframe[target]
    print(f'X = {X.shape}, y = {y.shape}')

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size= 0.8,
                                                        random_state= 25,
                                                        shuffle=True)

    for name, arr in zip(['X_train', 'X_test', 'y_train', 'y_test'], [X_train, X_test, y_train, y_test]):
        print(f'{name} = {arr.shape}')

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = preparing_and_split(dataframe = df_new, target = 'salary_usd')


X = (8029, 4), y = (8029,)
X_train = (6423, 4)
X_test = (1606, 4)
y_train = (6423,)
y_test = (1606,)


In [24]:
preprocessor_updated = ColumnTransformer([
    ('num_scaled', RobustScaler(), num_cols), # scaling numeric cols

    # pipeline for processing under TargetEncoder
    ('target_scaled', Pipeline([
        ('encoder', TargetEncoder()),
        ('scaler', StandardScaler())
    ]), target_features),

    # pipeline for processing under FrequencyEncoder
    ('frequency_scaled', Pipeline([
        ('encoder', FrequencyEncoder()),
        ('scaler', StandardScaler())
    ]), frequency_features),

    # pipeline for processing under OrdinalEncoder
    ('ordinal_scaled', Pipeline([
        ('encoder', OrdinalEncoder(categories=[level_order])),
        ('scaler', StandardScaler())
    ]), ordinal_features)
])

preprocessor_updated


In [25]:
def run_regressions_updated(X_train, X_test, y_train, y_test):

    results = []
    feature_importances = {}

    for name, (model, params) in models_updated.items():
        base_pipeline = Pipeline([
            ('preprocessor', preprocessor_updated),
            ('regressor', model)
        ])

        pipeline_log_target = TransformedTargetRegressor(
            regressor = base_pipeline,
            transformer = FunctionTransformer(func = np.log,
                                              inverse_func = np.exp))

        grid = GridSearchCV(estimator = pipeline_log_target,
                            param_grid = params,
                            scoring='r2',
                            n_jobs=-1)
        grid.fit(X_train, y_train)

        y_pred_test = grid.predict(X_test)
        y_pred_train = grid.predict(X_train)

        def calculate_metrics(var, ground_truth, predictions):
            R2 = r2_score(ground_truth, predictions)
            MAE = mean_absolute_error(ground_truth, predictions)
            RMSE = np.sqrt(mean_squared_error(ground_truth, predictions))
            REL_MAE = MAE / np.mean(predictions) * 100

            result = {
                'model': name,
                'dataset_var': var,
                'R2': round(R2, 2),
                'MAE': round(MAE, 1),
                'RMSE': round(RMSE, 1),
                'REL_MAE': round(REL_MAE, 1)
            }
            if var == 'test':
                result['best_params'] = grid.best_params_
            return result


        results.append(calculate_metrics(var='train', ground_truth=y_train, predictions=y_pred_train))
        results.append(calculate_metrics(var='test', ground_truth=y_test, predictions=y_pred_test))

    # Додаємо feature importance
        best_model = grid.best_estimator_.regressor_.named_steps['regressor']

        try:
            if hasattr(best_model, 'feature_importances_'):
                feature_importances[name] = best_model.feature_importances_
            elif hasattr(best_model, 'coef_'):
                feature_importances[name] = np.abs(best_model.coef_)
        except:
            feature_importances[name] = None

        print(f'Training for {name} has ended.')

    df_results = pd.DataFrame(results)
    return df_results, feature_importances


In [26]:
updated_experiments, importances = run_regressions_updated(X_train, X_test, y_train, y_test)


Training for LinearRegression has ended.
Training for Ridge has ended.
Training for Lasso has ended.
Training for ElasticNet has ended.
Training for SVR has ended.
Training for KNN has ended.
Training for DecisionTree has ended.
Training for RandomForest has ended.
Training for GradientBoosting has ended.


  _data = np.array(data, dtype=dtype, copy=copy,


Training for XGBoost has ended.
Training for Huber has ended.
Training for HistGBM has ended.


In [27]:
updated_experiments.sort_values(by=['model', 'dataset_var', 'R2'], ascending=False)


Unnamed: 0,model,dataset_var,R2,MAE,RMSE,REL_MAE,best_params
18,XGBoost,train,0.53,914.2,1508.3,33.5,
19,XGBoost,test,0.55,948.1,1492.8,33.9,{'regressor__regressor__colsample_bytree': 0.8...
8,SVR,train,0.48,978.0,1592.4,36.1,
9,SVR,test,0.46,1034.5,1643.6,37.2,"{'regressor__regressor__C': 10, 'regressor__re..."
2,Ridge,train,0.38,1089.6,1725.6,41.3,
3,Ridge,test,0.4,1109.7,1726.5,41.2,"{'regressor__regressor__alpha': 0.1, 'regresso..."
14,RandomForest,train,0.59,853.0,1414.6,31.4,
15,RandomForest,test,0.53,963.5,1523.1,34.7,"{'regressor__regressor__max_depth': 10, 'regre..."
0,LinearRegression,train,0.38,1089.6,1725.6,41.3,
1,LinearRegression,test,0.4,1109.7,1726.5,41.2,{'regressor__regressor__fit_intercept': True}


In [28]:
for model_name, imp in importances.items():
    if imp is not None:
        print(f"\n{model_name}:")
        for i, importance in enumerate(imp):
            print(f"Feature {i}: {importance:.3f}")



LinearRegression:
Feature 0: 0.120
Feature 1: 0.433
Feature 2: 0.054
Feature 3: 0.120

Ridge:
Feature 0: 0.120
Feature 1: 0.433
Feature 2: 0.054
Feature 3: 0.120

Lasso:
Feature 0: 0.120
Feature 1: 0.433
Feature 2: 0.053
Feature 3: 0.119

ElasticNet:
Feature 0: 0.120
Feature 1: 0.433
Feature 2: 0.054
Feature 3: 0.120

DecisionTree:
Feature 0: 0.057
Feature 1: 0.832
Feature 2: 0.090
Feature 3: 0.020

RandomForest:
Feature 0: 0.102
Feature 1: 0.691
Feature 2: 0.150
Feature 3: 0.057

GradientBoosting:
Feature 0: 0.141
Feature 1: 0.669
Feature 2: 0.143
Feature 3: 0.047

XGBoost:
Feature 0: 0.216
Feature 1: 0.565
Feature 2: 0.116
Feature 3: 0.103

Huber:
Feature 0: 0.119
Feature 1: 0.445
Feature 2: 0.046
Feature 3: 0.119


In [29]:
X_train.columns


Index(['title', 'category', 'english_level', 'it_experience_years'], dtype='object')

In [None]:
X_train['category'].value_counts()


category
Quality Assurance    1562
Software Engineer    1170
Management            889
Analyst               443
Design                431
DevOps                417
Marketing             330
Other                 289
Data Scientist        249
HR                    214
Customer Support      151
SysAdmin              109
Sales                 106
Security               42
Technical Writing      21
Name: count, dtype: int64