## Записник для експериментів під час тренування моделей

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, RobustScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from xgboost import XGBRegressor

import joblib

from src.utils.paths import DATA_DIR, MODELS_DIR


In [2]:
dataframe = pd.read_csv(DATA_DIR / "data_v4.csv")
dataframe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_category      3044 non-null   object 
 1   seniority_level   3044 non-null   object 
 2   english_level     3044 non-null   object 
 3   experience_years  3044 non-null   float64
 4   salary_usd        3044 non-null   float64
dtypes: float64(2), object(3)
memory usage: 119.0+ KB


In [3]:
def preparing_and_split(dataframe, target):
    X = dataframe.drop(columns = [target])
    y = dataframe[target]
    print(f'X = {X.shape}, y = {y.shape}')

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size= 0.8,
                                                        random_state= 25,
                                                        shuffle=True)

    for name, arr in zip(['X_train', 'X_test', 'y_train', 'y_test'], [X_train, X_test, y_train, y_test]):
        print(f'{name} = {arr.shape}')

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = preparing_and_split(dataframe = dataframe, target = 'salary_usd')


X = (3044, 4), y = (3044,)
X_train = (2435, 4)
X_test = (609, 4)
y_train = (2435,)
y_test = (609,)


In [4]:
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
ordinal_features = ['english_level']
frequency_features = ['job_category']
target_features = ['seniority_level']

# labeling categories for OrdinalEncoder
level_order = [
    'Elementary',
    'Pre-Intermediate',
    'Intermediate',
    'Upper-Intermediate',
    'Advanced'
]


In [5]:
from src.scripts.encoders import TargetEncoder, FrequencyEncoder


In [6]:
models_updated = {
    "LinearRegression": (
        LinearRegression(),
        {"regressor__fit_intercept": [True, False]}
    ),
    "Ridge": (
        Ridge(),
        {
            "regressor__alpha": [0.1, 1.0, 10.0],
            "regressor__fit_intercept": [True, False]
        }
    ),
    "Lasso": (
        Lasso(max_iter=5000),
        {"regressor__alpha": [0.001, 0.01, 0.1, 1.0]}
    ),
    "ElasticNet": (
        ElasticNet(max_iter=5000),
        {
            "regressor__alpha": [0.001, 0.01, 0.1, 1.0],
            "regressor__l1_ratio": [0.2, 0.5, 0.8]
        }
    ),
    "SVR": (
        SVR(),
        {
            "regressor__kernel": ["linear", "rbf"],
            "regressor__C": [0.1, 1, 10]
        }
    ),
    "KNN": (
        KNeighborsRegressor(),
        {
            "regressor__n_neighbors": [3, 5, 7],
            "regressor__weights": ["uniform", "distance"]
        }
    ),
    "DecisionTree": (
        DecisionTreeRegressor(random_state=25),
        {
            "regressor__max_depth": [3, 5, 10, None]
        }
    ),
    "RandomForest": (
        RandomForestRegressor(random_state=25),
        {
            "regressor__n_estimators": [50, 100],
            "regressor__max_depth": [None, 5, 10]
        }
    ),
    "GradientBoosting": (
        GradientBoostingRegressor(random_state=25),
        {
            "regressor__n_estimators": [100, 300],
            "regressor__learning_rate": [0.01, 0.05, 0.1],
            "regressor__max_depth": [3, 5, 7],
            "regressor__min_samples_split": [2, 5, 10]
        }
    ),
    "XGBoost": (
        XGBRegressor(objective="reg:squarederror", random_state=25, n_jobs=-1),
        {
            "regressor__n_estimators": [100, 300],
            "regressor__learning_rate": [0.01, 0.05, 0.1],
            "regressor__max_depth": [3, 5, 7],
            "regressor__min_child_weight": [1, 3],
            "regressor__subsample": [0.8, 1.0],
            "regressor__colsample_bytree": [0.8, 1.0]
        }
    ),
    "Huber": (
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        {
            "regressor__epsilon": [1.15, 1.35, 1.5],
            "regressor__alpha": [0.0001, 0.001, 0.01]
        }
    ),
    "HistGBM": (
        HistGradientBoostingRegressor(random_state=25),
        {
            "regressor__max_iter": [100, 200, 300],
            "regressor__learning_rate": [0.01, 0.05, 0.1],
            "regressor__max_depth": [3, 5, None],
            "regressor__min_samples_leaf": [20, 30, 50],
            "regressor__l2_regularization": [0.0, 0.1, 0.5]
        }
    ),
    "SVR": (
        SVR(),
        {
            "regressor__kernel": ["linear", "rbf"],
            "regressor__C": [0.1, 1, 10],
            "regressor__epsilon": [0.01, 0.1, 0.2]
        }
    )
}


In [7]:
preprocessor_updated = ColumnTransformer([
    ('num_scaled', StandardScaler(), num_cols), # scaling numeric cols

    # pipeline for processing under TargetEncoder
    ('target_scaled', Pipeline([
        ('encoder', TargetEncoder()),
        ('scaler', StandardScaler())
    ]), target_features),

    # pipeline for processing under FrequencyEncoder
    ('frequency_scaled', Pipeline([
        ('encoder', FrequencyEncoder()),
        ('scaler', StandardScaler())
    ]), frequency_features),

    # pipeline for processing under OrdinalEncoder
    ('ordinal_scaled', Pipeline([
        ('encoder', OrdinalEncoder()),
        ('scaler', StandardScaler())
    ]), ordinal_features)
])

preprocessor_updated


In [8]:
def run_regressions_updated(X_train, X_test, y_train, y_test, save_best_model=bool):

    results = []
    feature_importances = {}

    # best model
    best_test_r2 = -np.inf
    best_model_info = None

    for name, (model, params) in models_updated.items():
        base_pipeline = Pipeline([
            ('preprocessor', preprocessor_updated),
            ('regressor', model)
        ])

        grid = GridSearchCV(estimator = base_pipeline,
                            param_grid = params,
                            scoring='r2',
                            n_jobs=-1)
        grid.fit(X_train, y_train)

        y_pred_test = grid.predict(X_test)
        y_pred_train = grid.predict(X_train)

        # Metrics
        def calculate_metrics(var, ground_truth, predictions):
            R2 = r2_score(ground_truth, predictions)
            MAE = mean_absolute_error(ground_truth, predictions)
            RMSE = np.sqrt(mean_squared_error(ground_truth, predictions))
            REL_MAE = MAE / np.mean(predictions) * 100

            result = {
                'model': name,
                'dataset_var': var,
                'R2': round(R2, 2),
                'MAE': round(MAE, 1),
                'RMSE': round(RMSE, 1),
                'REL_MAE': round(REL_MAE, 1)
            }
            if var == 'test':
                result['best_params'] = grid.best_params_
            return result


        train_results = calculate_metrics(var='train', ground_truth=y_train, predictions=y_pred_train)
        test_results = calculate_metrics(var='test', ground_truth=y_test, predictions=y_pred_test)

        results.append(train_results)
        results.append(test_results)

        # best model
        current_test_r2 = test_results['R2']
        if current_test_r2 > best_test_r2:
            best_test_r2 = current_test_r2
            best_model_info = {
                'name': name,
                'pipeline': grid.best_estimator_,
                'preprocessor': grid.best_estimator_.named_steps['preprocessor'],
                'regressor': grid.best_estimator_.named_steps['regressor'],
                'test_r2': current_test_r2,
                'best_params': grid.best_params_
            }

        # Feature importances
        best_model = grid.best_estimator_.named_steps['regressor']

        try:
            if hasattr(best_model, 'feature_importances_'):
                feature_importances[name] = best_model.feature_importances_
            elif hasattr(best_model, 'coef_'):
                feature_importances[name] = np.abs(best_model.coef_)
        except:
            feature_importances[name] = None

        print(f'Training for {name} has ended.')

    # best model
    if save_best_model and best_model_info:
        joblib.dump(best_model_info['pipeline'], MODELS_DIR / 'best_model_v2.pkl')
        joblib.dump(best_model_info['preprocessor'], MODELS_DIR / 'preprocessor_v2.pkl')

        # Save data
        model_metadata = {
            'model_name': best_model_info['name'],
            'test_r2': best_model_info['test_r2'],
            'best_params': best_model_info['best_params']
        }
        joblib.dump(model_metadata, MODELS_DIR / "model_metadata_v2.pkl")

        print(f"Best model saved: {best_model_info['name']}, test R2: {best_model_info['test_r2']}")
        print(f"Files saved to {MODELS_DIR}")

    df_results = pd.DataFrame(results)
    return df_results, feature_importances, best_model_info


In [9]:
run_experiments, importances, best_model = run_regressions_updated(X_train, X_test, y_train, y_test)


Training for LinearRegression has ended.
Training for Ridge has ended.
Training for Lasso has ended.
Training for ElasticNet has ended.
Training for SVR has ended.
Training for KNN has ended.
Training for DecisionTree has ended.
Training for RandomForest has ended.
Training for GradientBoosting has ended.


  _data = np.array(data, dtype=dtype, copy=copy,


Training for XGBoost has ended.
Training for Huber has ended.


  _data = np.array(data, dtype=dtype, copy=copy,


Training for HistGBM has ended.
Best model saved: RandomForest, test R2: 0.79
Files saved to /Users/vladislavpleshko/Documents/VS Code/projects/salary-predictor/models


In [10]:
run_experiments.sort_values(by=['model', 'dataset_var', 'R2'], ascending=False)


Unnamed: 0,model,dataset_var,R2,MAE,RMSE,REL_MAE,best_params
18,XGBoost,train,0.81,611.6,850.3,20.9,
19,XGBoost,test,0.79,660.7,933.8,21.6,"{'regressor__colsample_bytree': 1.0, 'regresso..."
8,SVR,train,0.75,648.6,961.4,23.9,
9,SVR,test,0.74,713.2,1036.6,25.3,"{'regressor__C': 10, 'regressor__epsilon': 0.0..."
2,Ridge,train,0.78,664.2,916.6,22.7,
3,Ridge,test,0.77,699.7,965.7,23.0,"{'regressor__alpha': 10.0, 'regressor__fit_int..."
14,RandomForest,train,0.81,609.7,852.0,20.9,
15,RandomForest,test,0.79,658.2,933.3,21.7,"{'regressor__max_depth': 5, 'regressor__n_esti..."
0,LinearRegression,train,0.78,664.8,916.5,22.7,
1,LinearRegression,test,0.77,699.7,965.1,22.9,{'regressor__fit_intercept': True}


In [11]:
run_experiments[(run_experiments['dataset_var'] == 'test')].sort_values(by=['R2'], ascending=False)


Unnamed: 0,model,dataset_var,R2,MAE,RMSE,REL_MAE,best_params
15,RandomForest,test,0.79,658.2,933.3,21.7,"{'regressor__max_depth': 5, 'regressor__n_esti..."
19,XGBoost,test,0.79,660.7,933.8,21.6,"{'regressor__colsample_bytree': 1.0, 'regresso..."
17,GradientBoosting,test,0.78,666.8,949.1,21.9,"{'regressor__learning_rate': 0.05, 'regressor_..."
23,HistGBM,test,0.78,664.0,959.7,21.8,"{'regressor__l2_regularization': 0.5, 'regress..."
1,LinearRegression,test,0.77,699.7,965.1,22.9,{'regressor__fit_intercept': True}
3,Ridge,test,0.77,699.7,965.7,23.0,"{'regressor__alpha': 10.0, 'regressor__fit_int..."
5,Lasso,test,0.77,699.5,965.0,22.9,{'regressor__alpha': 1.0}
7,ElasticNet,test,0.77,699.7,965.8,23.0,"{'regressor__alpha': 0.01, 'regressor__l1_rati..."
13,DecisionTree,test,0.77,697.6,976.6,22.8,{'regressor__max_depth': 3}
11,KNN,test,0.76,682.3,993.8,22.6,"{'regressor__n_neighbors': 7, 'regressor__weig..."


In [12]:
cols = X_train.columns

for model_name, imp_it in importances.items():
    print(f'\n-- Regression "{model_name}" --')
    for i, (col_name, importance) in enumerate(zip(cols ,imp_it)):
        print(f'Feature {i}: "{col_name}": {np.round(importance, 2)}')



-- Regression "LinearRegression" --
Feature 0: "job_category": 178.13
Feature 1: "seniority_level": 1531.22
Feature 2: "english_level": 144.5
Feature 3: "experience_years": 50.86

-- Regression "Ridge" --
Feature 0: "job_category": 191.48
Feature 1: "seniority_level": 1514.0
Feature 2: "english_level": 144.08
Feature 3: "experience_years": 51.37

-- Regression "Lasso" --
Feature 0: "job_category": 177.86
Feature 1: "seniority_level": 1530.61
Feature 2: "english_level": 143.51
Feature 3: "experience_years": 49.89

-- Regression "ElasticNet" --
Feature 0: "job_category": 194.32
Feature 1: "seniority_level": 1510.32
Feature 2: "english_level": 143.98
Feature 3: "experience_years": 51.47

-- Regression "SVR" --
Feature 0: "job_category": [ 366.92 1221.61   86.09   21.09]

-- Regression "DecisionTree" --
Feature 0: "job_category": 0.0
Feature 1: "seniority_level": 0.99
Feature 2: "english_level": 0.01
Feature 3: "experience_years": 0.0

-- Regression "RandomForest" --
Feature 0: "job_categ