In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Random Forest

In [2]:
# Load the salary dataset from a CSV referenced by relative path into `df`.
df = pd.read_csv("Salary_Data.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Age,Gender,education_level,job_title,years_of_experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,29.0,Male,Bachelor's,Marketing Analyst,2.0,55000.0
3,42.0,Female,Master's,Product Manager,12.0,120000.0
4,26.0,Female,Bachelor's,Marketing Coordinator,1.0,45000.0


In [None]:
# Number of rows in the loaded frame.
df.shape[0]

4542

In [5]:
# Show each column's dtype to tell numeric columns from text columns.
df.dtypes

Age                    float64
Gender                  object
Education Level         object
Job Title               object
Years of Experience    float64
Salary                 float64
dtype: object

In [6]:
# Frequency of each distinct job title.
# NOTE(review): other cells in this notebook address this column as "job_title";
# confirm the label used here matches the header of the CSV actually on disk.
df["Job Title"].value_counts()

Job Title
Software Engineer            518
Data Scientist               453
Software Engineer Manager    376
Data Analyst                 363
Senior Project Engineer      318
Product Manager              313
Full Stack Engineer          308
Marketing Manager            255
Senior Software Engineer     244
Back end Developer           244
Front end Developer          241
Marketing Coordinator        158
Junior Sales Associate       142
Financial Manager            134
Marketing Analyst            132
Software Developer           125
Operations Manager           114
Human Resources Manager      104
Name: count, dtype: int64

In [7]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [8]:
# One-hot encode every non-numeric column (drop_first removes one level per
# feature to avoid a redundant dummy). The column list is derived from the
# dtypes instead of hard-coded labels, because cells in this notebook disagree
# on the header spelling ("Education Level" vs "education_level").
categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
df_with_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Stand-alone Random Forest run with fixed hyper-parameters.
# This cell encodes, splits, fits and evaluates on its own, so it does not
# depend on a grid search having been executed in another cell.

# One-hot encode every non-numeric column; deriving the list from dtypes avoids
# relying on a particular spelling of the column labels.
categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
df_with_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

X = df_with_dummies.drop(columns="Salary")
y = df_with_dummies["Salary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_params = {
    "n_estimators": 200,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
}
rf_model = RandomForestRegressor(random_state=42, **rf_params)

# The model must be fitted before it can predict.
rf_model.fit(X_train, y_train)

# Dedicated names keep these predictions separate from the grid-search cells.
rf_test_pred = rf_model.predict(X_test)
rf_train_pred = rf_model.predict(X_train)

print("Parameters:", rf_params)
print("Mean Absolute Error (MAE): ", mean_absolute_error(y_test, rf_test_pred))
print("Test R^2 Score:", r2_score(y_test, rf_test_pred))
print("Train R^2 Score:", r2_score(y_train, rf_train_pred))

In [9]:
# Separate the Salary target from the encoded feature matrix.
y = df_with_dummies["Salary"]
X = df_with_dummies.drop(columns=["Salary"])

In [10]:
# Random Forest search space: every combination of these values is tried.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Base estimator handed to the grid search; the seed fixes the forest's randomness.
rf = RandomForestRegressor(random_state=42)

In [13]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, ranked by R^2.
# n_jobs=-1 lets scikit-learn run the candidate fits in parallel.
rf_search_options = dict(cv=3, scoring='r2', n_jobs=-1)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, **rf_search_options)
grid_search.fit(X_train, y_train)

In [14]:
# Estimator exposed by the search as its best candidate.
best_model = grid_search.best_estimator_

In [15]:
# Predictions on the held-out test features.
y_pred = best_model.predict(X_test)

In [17]:
# Predictions on the training features, used to compare train vs. test fit.
y_train_pred = best_model.predict(X_train)

In [25]:
# Report the winning hyper-parameters and the train/test fit quality.
rf_report = [
    ("Best Parameters:", grid_search.best_params_),
    ("Mean Absolute Error (MAE): ", mean_absolute_error(y_test, y_pred)),
    ("Test R^2 Score:", r2_score(y_test, y_pred)),
    ("Train R^2 Score:", r2_score(y_train, y_train_pred)),
]
for label, value in rf_report:
    print(label, value)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Absolute Error (MAE):  2898.6780090586103
Test R^2 Score: 0.9787636726823509
Train R^2 Score: 0.993104013462885


# XGBoost

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [12]:
# Reload the salary dataset so the XGBoost section starts from the raw CSV.
df = pd.read_csv("Salary_Data.csv")

In [13]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [5]:
# One-hot encode every non-numeric column (drop_first removes one level per
# feature). The column list is derived from the dtypes instead of hard-coded
# labels, because cells in this notebook disagree on the header spelling
# ("education_level" vs "Education Level").
categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
df_with_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [6]:
# Separate the Salary target from the encoded feature matrix.
y = df_with_dummies["Salary"]
X = df_with_dummies.drop(columns=["Salary"])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Base XGBoost regressor handed to the grid search: squared-error objective, fixed seed.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

In [9]:
# XGBoost search space: every combination of these values is tried.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.8, 1.0],
)

In [10]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, ranked by R^2.
# n_jobs=-1 lets scikit-learn run the candidate fits in parallel.
xgb_search_options = dict(cv=3, scoring='r2', n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, **xgb_search_options)
grid_search.fit(X_train, y_train)

In [11]:
# Keep the search's best estimator and score the held-out features with it.
# GridSearchCV.predict delegates to best_estimator_ when the search was refit.
best_model = grid_search.best_estimator_
y_pred = grid_search.predict(X_test)

In [13]:
# Persist the grid-searched XGBoost model to disk.
# The model was fitted on the one-hot encoded matrix, so whoever loads this
# file must feed it inputs encoded the same way.
import os

import joblib

# joblib.dump does not create missing parent directories; make sure it exists.
os.makedirs("saved_models", exist_ok=True)
joblib.dump(best_model, "saved_models/xgboost_salary_prediction.pkl")

['saved_models/xgboost_salary_prediction.pkl']

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import joblib
from sklearn.model_selection import train_test_split


In [15]:
# Raw (un-encoded) features and target for the preprocessing pipeline:
# everything except Salary is a feature, Salary is the value to predict.
y = df["Salary"]
X = df.drop(columns="Salary")

In [16]:
# Hold out 20% of the raw rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Column groups for the preprocessor, derived from the dtypes of the raw
# feature frame: numeric columns get scaled, all others get one-hot encoded.
# Deriving them avoids hard-coding labels, which other cells in this notebook
# spell differently ("years_of_experience" vs "Years of Experience").
numerical_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

In [18]:
# Column-wise preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones. handle_unknown="ignore" keeps prediction from failing on
# categories that were not seen during fit.
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [19]:
# XGBoost regressor with fixed hyper-parameters for the deployable pipeline.
xgb_pipeline_params = {
    "learning_rate": 0.3,
    "max_depth": 7,
    "n_estimators": 200,
    "subsample": 0.8,
    "random_state": 42,
}
xgb_model = XGBRegressor(**xgb_pipeline_params)

In [20]:
# Chain preprocessing and the regressor so raw rows can be passed straight to
# fit/predict.
pipeline_steps = [("preprocessor", preprocessor), ("regressor", xgb_model)]
pipeline = Pipeline(steps=pipeline_steps)

In [21]:
# Fit the preprocessing steps and the regressor on the raw training rows.
pipeline.fit(X_train, y_train)

In [22]:
# Persist the fitted preprocessing + regressor pipeline to disk.
import os

# joblib.dump does not create missing parent directories; make sure it exists.
os.makedirs("saved_models", exist_ok=True)
joblib.dump(pipeline, "saved_models/xgb_pipeline.pkl")

['saved_models/xgb_pipeline.pkl']

In [36]:
# Report the grid-searched XGBoost model (`best_model`).
# `best_model` was fitted on the one-hot encoded matrix, but by this point
# X_train / X_test have been rebound to the raw frame for the pipeline. The
# encoded rows are therefore looked up again by index, so the predictions line
# up with y_train / y_test regardless of cell execution order.
X_encoded = df_with_dummies.drop(columns="Salary")
xgb_test_pred = best_model.predict(X_encoded.loc[y_test.index])
xgb_train_pred = best_model.predict(X_encoded.loc[y_train.index])

print("Best Parameters:", grid_search.best_params_)
# The metric is mean absolute error, so it is labelled MAE (it was mislabelled MSE).
print("MAE:", mean_absolute_error(y_test, xgb_test_pred))
print("R^2 (Test):", r2_score(y_test, xgb_test_pred))
print("R^2 (Train):", r2_score(y_train, xgb_train_pred))

Best Parameters: {'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
MSE: 2812.8794326698294
R^2 (Test): 0.9798075441624176
R^2 (Train): 0.9946708516286576


# LightGBM

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor

In [38]:
# Reload the salary dataset so the LightGBM section starts from the raw CSV.
df = pd.read_csv("Salary_Data.csv")

In [41]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [42]:
# One-hot encode every non-numeric column (drop_first removes one level per
# feature). The column list is derived from the dtypes instead of hard-coded
# labels, because cells in this notebook disagree on the header spelling
# ("Education Level" vs "education_level").
categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
df_with_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Stand-alone LightGBM setup with fixed hyper-parameters.

# One-hot encode every non-numeric column; deriving the list from dtypes avoids
# relying on a particular spelling of the column labels.
categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
df_with_dummies = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

X = df_with_dummies.drop(columns="Salary")
y = df_with_dummies["Salary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pass the tuned values explicitly; previously they were only mentioned in a
# comment and the regressor was built with library defaults.
lgbm = LGBMRegressor(
    learning_rate=0.1,
    max_depth=-1,
    n_estimators=200,
    num_leaves=50,
    subsample=0.8,
    random_state=42,
)




In [43]:
# Separate the Salary target from the encoded feature matrix.
y = df_with_dummies["Salary"]
X = df_with_dummies.drop(columns=["Salary"])

In [44]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Base LightGBM regressor handed to the grid search; only the seed is fixed here.
lgbm = LGBMRegressor(random_state=42)

In [46]:
# LightGBM search space: every combination of these values is tried.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[-1, 5, 10],
    num_leaves=[31, 50],
    subsample=[0.8, 1.0],
)

In [47]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, ranked by R^2.
# n_jobs=-1 lets scikit-learn run the candidate fits in parallel.
lgbm_search_options = dict(cv=3, scoring='r2', n_jobs=-1)
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, **lgbm_search_options)
grid_search.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 118
[LightGBM] [Info] Number of data points in the train set: 2422, number of used features: 25
[LightGBM] [Info] Number of data points in the train set: 2422, number of used features: 25
[LightGBM] [Info] Start training from score 124346.842692
[LightGBM] [Info] Start training from score 123982.575145
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 119
[LightGBM] [Info] Number of data points in the train set: 2422, number of used features: 25
[LightGBM] [Info] Start

  _data = np.array(data, dtype=dtype, copy=copy,


In [48]:
# Keep the search's best estimator and score the held-out features with it.
# GridSearchCV.predict delegates to best_estimator_ when the search was refit.
best_model = grid_search.best_estimator_
y_pred = grid_search.predict(X_test)

In [49]:
# Report the winning LightGBM hyper-parameters and the train/test fit quality.
print("Best Parameters:", grid_search.best_params_)
# The metric is mean absolute error, so it is labelled MAE (it was mislabelled MSE).
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R^2 (Test):", r2_score(y_test, y_pred))
print("R^2 (Train):", r2_score(y_train, best_model.predict(X_train)))

Best Parameters: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 200, 'num_leaves': 50, 'subsample': 0.8}
MSE: 3996.7440045993367
R^2 (Test): 0.9742552273454418
R^2 (Train): 0.9881714415655586
