In [28]:
import pandas as pd
data = pd.read_csv('https://raw.githubusercontent.com/zzeiidann/Data/main/Salaries.csv')

In [29]:
data.head()

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   rank           397 non-null    object
 1   discipline     397 non-null    object
 2   yrs.since.phd  397 non-null    int64 
 3   yrs.service    397 non-null    int64 
 4   sex            397 non-null    object
 5   salary         397 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 18.7+ KB


In [31]:
data.describe()

Unnamed: 0,yrs.since.phd,yrs.service,salary
count,397.0,397.0,397.0
mean,22.314861,17.61461,113706.458438
std,12.887003,13.006024,30289.038695
min,1.0,0.0,57800.0
25%,12.0,7.0,91000.0
50%,21.0,16.0,107300.0
75%,32.0,27.0,134185.0
max,56.0,60.0,231545.0


In [32]:
data.isnull().sum()

rank             0
discipline       0
yrs.since.phd    0
yrs.service      0
sex              0
salary           0
dtype: int64

In [33]:
from sklearn.preprocessing import LabelEncoder

string_cols = ['rank', 'discipline', 'sex']

encoder = LabelEncoder()

for col in string_cols:
    data[col] = encoder.fit_transform(data[col])

In [34]:
data[string_cols]

Unnamed: 0,rank,discipline,sex
0,2,1,1
1,2,1,1
2,1,1,1
3,2,1,1
4,2,1,1
...,...,...,...
392,2,0,1
393,2,0,1
394,2,0,1
395,2,0,1


In [35]:
data

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
0,2,1,19,18,1,139750
1,2,1,20,16,1,173200
2,1,1,4,3,1,79750
3,2,1,45,39,1,115000
4,2,1,40,41,1,141500
...,...,...,...,...,...,...
392,2,0,33,30,1,103106
393,2,0,31,19,1,150564
394,2,0,42,25,1,101738
395,2,0,25,15,1,95329


In [36]:
import numpy as np

# 1. Tenure in Role
data['tenure_in_role'] = data['yrs.service'] + data['yrs.since.phd']

# 2. Experience Grouping
bins = [0, 5, 10, 20, 40, np.inf]  # Bins based on experience years
labels = ['Low Experience', 'Mid Experience', 'Experienced', 'Senior', 'Veteran']
data['experience_group'] = pd.cut(data['yrs.service'], bins=bins, labels=labels)


In [37]:
data

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary,tenure_in_role,experience_group
0,2,1,19,18,1,139750,37,Experienced
1,2,1,20,16,1,173200,36,Experienced
2,1,1,4,3,1,79750,7,Low Experience
3,2,1,45,39,1,115000,84,Senior
4,2,1,40,41,1,141500,81,Veteran
...,...,...,...,...,...,...,...,...
392,2,0,33,30,1,103106,63,Senior
393,2,0,31,19,1,150564,50,Experienced
394,2,0,42,25,1,101738,67,Senior
395,2,0,25,15,1,95329,40,Experienced


In [38]:
from sklearn.preprocessing import LabelEncoder

string_cols = ['rank', 'discipline', 'sex', 'experience_group']

encoder = LabelEncoder()

for col in string_cols:
    data[col] = encoder.fit_transform(data[col])

In [39]:
data

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary,tenure_in_role,experience_group
0,2,1,19,18,1,139750,37,0
1,2,1,20,16,1,173200,36,0
2,1,1,4,3,1,79750,7,1
3,2,1,45,39,1,115000,84,3
4,2,1,40,41,1,141500,81,4
...,...,...,...,...,...,...,...,...
392,2,0,33,30,1,103106,63,3
393,2,0,31,19,1,150564,50,0
394,2,0,42,25,1,101738,67,3
395,2,0,25,15,1,95329,40,0


In [42]:
# Install necessary libraries if not installed
# !pip install xgboost lightgbm scikit-learn

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

# Prepare features and target
X = data.drop(columns=['salary'])
y = data['salary']

# Define baseline models
models = {
    "Random Forest": RandomForestRegressor(),
    "XGBoost": xgb.XGBRegressor(),
    "LightGBM": lgb.LGBMRegressor()
}

# Function to calculate RMSE, MAE, and MAPE using cross-validation
def evaluate_model(model, X, y):
    # RMSE
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    
    # MAE
    mae = -cross_val_score(model, X, y, scoring="neg_mean_absolute_error", cv=5)
    
    # MAPE calculation
    model.fit(X, y)  # Fit model first for predictions
    y_pred = model.predict(X)
    mape = np.mean(np.abs((y - y_pred) / y)) * 100

    return np.mean(rmse), np.mean(mae), mape

# Store results in a DataFrame
results = []

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    rmse, mae, mape = evaluate_model(model, X, y)
    results.append([model_name, rmse, mae, mape])

# Create DataFrame for easy viewing
results_df = pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "MAPE"])

# Display the DataFrame
print(results_df)


Evaluating Random Forest...
Evaluating XGBoost...
Evaluating LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000103 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 173
[LightGBM] [Info] Number of data points in the train set: 317, number of used features: 7
[LightGBM] [Info] Start training from score 114842.908517
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 317, number of used features: 7
[LightGBM] [Info] Start training from score 114770.981073
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Inf

In [43]:
results_df

Unnamed: 0,Model,RMSE,MAE,MAPE
0,Random Forest,26172.230608,18513.45704,6.353357
1,XGBoost,30026.571409,21392.995703,3.111166
2,LightGBM,24355.588537,17635.585038,10.486557


In [46]:
# Install necessary libraries if not installed
# !pip install optuna lightgbm scikit-learn

import optuna
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Prepare features and target
X = data.drop(columns=['salary'])
y = data['salary']

# Objective function for Optuna to optimize the hyperparameters
def objective(trial):
    # Hyperparameters to tune
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0)
    }

    # Create the LGBM model with the parameters
    model = lgb.LGBMRegressor(**param, verbose=-1)

    # Cross-validation with RMSE
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5).mean())
    
    return rmse  # Return RMSE as the objective to minimize

# Create the Optuna study to optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)  # Run 100 trials

# Output the best parameters and best RMSE score
print(f"Best trial: {study.best_trial.params}")
print(f"Best RMSE: {study.best_trial.value}")


[I 2025-12-17 21:59:58,917] A new study created in memory with name: no-name-954270fd-0eba-4775-9ced-59284de0c045
[I 2025-12-17 21:59:59,358] Trial 0 finished with value: 29281.768597874852 and parameters: {'num_leaves': 123, 'learning_rate': 0.0007223725553862801, 'n_estimators': 165, 'max_depth': 8, 'min_child_samples': 42, 'subsample': 0.675205396986381, 'colsample_bytree': 0.6738197024149702}. Best is trial 0 with value: 29281.768597874852.
[I 2025-12-17 21:59:59,559] Trial 1 finished with value: 30493.793929206695 and parameters: {'num_leaves': 61, 'learning_rate': 1.9934562696653916e-05, 'n_estimators': 83, 'max_depth': 7, 'min_child_samples': 45, 'subsample': 0.9528458055332634, 'colsample_bytree': 0.7054865796195308}. Best is trial 0 with value: 29281.768597874852.
[I 2025-12-17 21:59:59,728] Trial 2 finished with value: 30500.40478251503 and parameters: {'num_leaves': 32, 'learning_rate': 1.549085070537969e-05, 'n_estimators': 65, 'max_depth': 4, 'min_child_samples': 44, 'subs

Best trial: {'num_leaves': 37, 'learning_rate': 0.00895083344259933, 'n_estimators': 305, 'max_depth': 4, 'min_child_samples': 31, 'subsample': 0.9502844505914497, 'colsample_bytree': 0.9488297947732}
Best RMSE: 23100.955876476793
