## Scaling and transformations
Now before proceeding to building the models I'm going to do the X/y split and scale/transform the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/cleaned/6.jobs_in_data.csv')
df.head()

Unnamed: 0,work_year,job_title,job_category,employee_residence,experience_level,employment_type,work_setting,company_location,company_size,salary_in_euros,cost_of_living,purchasing_power,job_field
0,2023,Data DevOps Engineer,Data Engineering,Germany,2,4,1,Germany,L,87411,127.47,685.74,Data Engineering
1,2023,Data Architect,Data Architecture and Modeling,United States,3,4,3,United States,M,171120,143.34,1193.8,Data Engineering
2,2023,Data Architect,Data Architecture and Modeling,United States,3,4,3,United States,M,75256,143.34,525.02,Data Engineering
3,2023,Data Scientist,Data Science and Research,United States,3,4,3,United States,M,195040,143.34,1360.68,Data Science
4,2023,Data Scientist,Data Science and Research,United States,3,4,3,United States,M,85836,143.34,598.83,Data Science


## X/y Split
The target will be "salary_in_euros". I also want to drop the columns 'job_title' and 'job_category' because they are redundant for creating the model since I already added the column 'job_field' before with the categories that I want to work with.

In [3]:
X = df.drop(columns=['salary_in_euros', 'job_title', 'job_category', 'purchasing_power'], axis=1)
y = df['salary_in_euros']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
display(X_train.shape)
display(X_test.shape)
display(y_train.shape)
display(y_test.shape)

(4264, 9)

(1066, 9)

(4264,)

(1066,)

## Dividing X into numerical and categorical

In [6]:
X_train_num = X_train.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object)
X_test_num = X_test.select_dtypes(np.number)
X_test_cat = X_test.select_dtypes(object)

In [7]:
X_train_num.head()

Unnamed: 0,work_year,experience_level,employment_type,work_setting,cost_of_living
2804,2023,3,4,3,143.34
3858,2023,3,4,3,143.34
511,2023,2,4,3,143.34
62,2023,3,4,3,143.34
3034,2023,3,4,2,143.34


In [8]:
X_test_num.head()

Unnamed: 0,work_year,experience_level,employment_type,work_setting,cost_of_living
1323,2023,3,4,3,143.34
1839,2023,3,4,3,143.34
798,2023,3,4,2,143.34
3856,2023,4,4,3,143.34
4553,2022,3,4,3,106.46


In [9]:
X_train_num.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
work_year,4264.0,2022.685038,0.603377,2020.0,2022.0,2023.0,2023.0,2023.0
experience_level,4264.0,2.656895,0.678589,1.0,2.0,3.0,3.0,4.0
employment_type,4264.0,3.984522,0.183138,1.0,4.0,4.0,4.0,4.0
work_setting,4264.0,2.505629,0.565209,1.0,2.0,3.0,3.0,3.0
cost_of_living,4264.0,139.699977,15.292912,27.37,143.34,143.34,143.34,197.89


In [10]:
# from sklearn.preprocessing import PowerTransformer
# import pickle

# transformer = PowerTransformer()
# transformer.fit(X_train_num)

# # path = "../ml/transformers/"
# # scaler_file_name = "standard_scaler.pkl"

# # with open(path + scaler_file_name, "wb") as file:
# #     pickle.dump(scaler, file)

# X_train_num = scaler.transform(X_train_num)
# X_test_num = scaler.transform(X_test_num)

## Scaling numerical features
Since the numerical features have very different ranges I'm going to use the Standard Scaler.

In [11]:
# from sklearn.preprocessing import MinMaxScaler
# import pickle

# scaler = MinMaxScaler()
# scaler.fit(X_train_num)

# path = "../ml/scalers/"
# scaler_file_name = "MinMaxScaler.pkl"

# with open(path + scaler_file_name, "wb") as file:
#     pickle.dump(scaler, file)

# X_train_num_transformed = scaler.transform(X_train_num)
# X_test_num_transformed = scaler.transform(X_test_num)

In [12]:
from sklearn.preprocessing import StandardScaler
import pickle

scaler = StandardScaler()
scaler.fit(X_train_num)

path = "../ml/scalers/"
scaler_file_name = "standard_scaler.pkl"

with open(path + scaler_file_name, "wb") as file:
    pickle.dump(scaler, file)

X_train_num_transformed = scaler.transform(X_train_num)
X_test_num_transformed = scaler.transform(X_test_num)

In [13]:
X_train_num_transformed_df = pd.DataFrame(X_train_num_transformed, columns=X_train_num.columns , index=X_train_num.index)
X_test_num_transformed_df = pd.DataFrame(X_test_num_transformed, columns=X_test_num.columns , index=X_test_num.index)

In [14]:
X_train_num_transformed_df.head()

Unnamed: 0,work_year,experience_level,employment_type,work_setting,cost_of_living
2804,0.52206,0.505675,0.084528,0.874772,0.238048
3858,0.52206,0.505675,0.084528,0.874772,0.238048
511,0.52206,-0.968145,0.084528,0.874772,0.238048
62,0.52206,0.505675,0.084528,0.874772,0.238048
3034,0.52206,0.505675,0.084528,-0.894691,0.238048


In [15]:
X_test_num_transformed_df.head()

Unnamed: 0,work_year,experience_level,employment_type,work_setting,cost_of_living
1323,0.52206,0.505675,0.084528,0.874772,0.238048
1839,0.52206,0.505675,0.084528,0.874772,0.238048
798,0.52206,0.505675,0.084528,-0.894691,0.238048
3856,0.52206,1.979495,0.084528,0.874772,0.238048
4553,-1.135472,0.505675,0.084528,0.874772,-2.173809


## Encoding categorical features

In [16]:
X_train_cat.head()

Unnamed: 0,employee_residence,company_location,company_size,job_field
2804,United States,United States,M,Data Engineering
3858,United States,United States,M,Data Science
511,United States,United States,M,Data Analysis
62,United States,United States,M,Data Science
3034,United States,United States,M,Data Science


In [17]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train_cat)

# path = "../ml/encoders/"
# encoder_file_name = "one_hot_encoder.pkl"

# with open(path + encoder_file_name, "wb") as file:
#     pickle.dump(encoder, file)

X_train_cat_encoded = encoder.transform(X_train_cat).toarray()
X_test_cat_encoded = encoder.transform(X_test_cat).toarray()

In [18]:
encoded_feature_names = encoder.get_feature_names_out(X_train_cat.columns)

X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded, columns=encoded_feature_names, index=X_train_cat.index)
X_test_cat_encoded_df = pd.DataFrame(X_test_cat_encoded, columns=encoded_feature_names, index=X_test_cat.index)

In [19]:
X_train_cat_encoded_df.head()

Unnamed: 0,employee_residence_Algeria,employee_residence_Argentina,employee_residence_Armenia,employee_residence_Australia,employee_residence_Austria,employee_residence_Belgium,employee_residence_Bolivia,employee_residence_Bosnia and Herzegovina,employee_residence_Brazil,employee_residence_Canada,...,company_location_United Arab Emirates,company_location_United Kingdom,company_location_United States,company_size_L,company_size_M,company_size_S,job_field_Data Analysis,job_field_Data Engineering,job_field_Data Science,job_field_Other
2804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [20]:
X_test_cat_encoded_df.head()

Unnamed: 0,employee_residence_Algeria,employee_residence_Argentina,employee_residence_Armenia,employee_residence_Australia,employee_residence_Austria,employee_residence_Belgium,employee_residence_Bolivia,employee_residence_Bosnia and Herzegovina,employee_residence_Brazil,employee_residence_Canada,...,company_location_United Arab Emirates,company_location_United Kingdom,company_location_United States,company_size_L,company_size_M,company_size_S,job_field_Data Analysis,job_field_Data Engineering,job_field_Data Science,job_field_Other
1323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [21]:
X_train_concat = pd.concat([X_train_num_transformed_df, X_train_cat_encoded_df], axis=1)
X_train_concat

Unnamed: 0,work_year,experience_level,employment_type,work_setting,cost_of_living,employee_residence_Algeria,employee_residence_Argentina,employee_residence_Armenia,employee_residence_Australia,employee_residence_Austria,...,company_location_United Arab Emirates,company_location_United Kingdom,company_location_United States,company_size_L,company_size_M,company_size_S,job_field_Data Analysis,job_field_Data Engineering,job_field_Data Science,job_field_Other
2804,0.522060,0.505675,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3858,0.522060,0.505675,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
511,0.522060,-0.968145,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
62,0.522060,0.505675,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3034,0.522060,0.505675,0.084528,-0.894691,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,0.522060,0.505675,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3772,0.522060,0.505675,0.084528,-0.894691,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5191,-2.793004,0.505675,0.084528,-0.894691,-2.299372,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5226,-2.793004,-2.441964,0.084528,-2.664155,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
X_test_concat = pd.concat([X_test_num_transformed_df, X_test_cat_encoded_df], axis=1)
X_test_concat

Unnamed: 0,work_year,experience_level,employment_type,work_setting,cost_of_living,employee_residence_Algeria,employee_residence_Argentina,employee_residence_Armenia,employee_residence_Australia,employee_residence_Austria,...,company_location_United Arab Emirates,company_location_United Kingdom,company_location_United States,company_size_L,company_size_M,company_size_S,job_field_Data Analysis,job_field_Data Engineering,job_field_Data Science,job_field_Other
1323,0.522060,0.505675,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1839,0.522060,0.505675,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
798,0.522060,0.505675,0.084528,-0.894691,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3856,0.522060,1.979495,0.084528,0.874772,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4553,-1.135472,0.505675,0.084528,0.874772,-2.173809,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2841,0.522060,-0.968145,0.084528,-0.894691,-5.699385,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5208,-4.450536,-0.968145,0.084528,0.874772,-1.899794,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1965,0.522060,0.505675,0.084528,-0.894691,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4538,-1.135472,-0.968145,0.084528,-0.894691,0.238048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [155]:
from sklearn.model_selection import GridSearchCV

max_depth_choices = [50,60,70] # A list of the possible values of max_depth to try
criterion_choices = ['squared_error','absolute_error'] # A list of the possible values optimization metrics
min_samples_split_choices = [55, 60, 65] # A list of the possible values of min_samples_split to try
min_samples_leaf_choices = [4,5,6] # A list of the possible values of min_samples_leaf to try
max_features_choices = [90,100,110]

grid = {'max_depth': max_depth_choices,
        'criterion': criterion_choices,
        'min_samples_split': min_samples_split_choices,
        'min_samples_leaf': min_samples_leaf_choices,
        'max_features': max_features_choices
       }

In [156]:
# Instantiate the grid search model object
from sklearn.tree import DecisionTreeRegressor
# estimator -> model to optimize
dtr = DecisionTreeRegressor()
# param_grid -> state the dictionary of parameters to optimize
# cv = 5 -> number of cross validation folds.
grid_search = GridSearchCV(estimator = dtr, param_grid = grid, cv = 10)

In [157]:
grid_search.fit(X_train_concat, y_train)

In [158]:
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': 70,
 'max_features': 110,
 'min_samples_leaf': 5,
 'min_samples_split': 60}

In [159]:
print("The best R2 for the best hyperparameters is {:.2f}".format(grid_search.best_score_))

The best R2 for the best hyperparameters is 0.33


In [160]:
print("The performance of the DecisionTree using the best gridsearchcv hyperpameters is {:.2f}".format(grid_search.best_estimator_.score(X_test_concat, y_test)))

The performance of the DecisionTree using the best gridsearchcv hyperpameters is 0.35


In [29]:
# from sklearn.tree import DecisionTreeRegressor

# dtr = DecisionTreeRegressor(max_depth=3,
#                              criterion = 'squared_error',
#                              min_samples_split=2,
#                              min_samples_leaf = 3,
#                              max_features = 5)
# dtr.fit(X_train_concat, y_train)

In [30]:
dtr.score(X_test_concat, y_test), dtr.score(X_train_concat, y_train)

NotFittedError: This DecisionTreeRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
y_train_pred = dtr.predict(X_train_concat)
y_test_pred  = dtr.predict(X_test_concat)

In [None]:
import functions

functions.error_metrics_report(y_train, y_test, y_train_pred, y_test_pred)