In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix

In [2]:
# Load your dataset
df = pd.read_csv('ds_salaries.csv')
df = df.drop(['salary'], axis=1)

In [3]:
# Preprocessing
# Split the dataset into training and test data
df_target = df['salary_in_usd']

# Remove target variable from the DataFrame
df_features = df.drop('salary_in_usd', axis=1)

# Initialize the OneHotEncoder from sklearn
encoder = OneHotEncoder(sparse=False)

# Fit and transform the features DataFrame
df_features_encoded = encoder.fit_transform(df_features)

# Convert the array back to a DataFrame
df_features_encoded = pd.DataFrame(df_features_encoded, columns=encoder.get_feature_names_out(df_features.columns))

# Add the target variable back to the DataFrame
df_preprocessed = pd.concat([df_features_encoded, df_target], axis=1)

In [5]:
df_preprocessed

Unnamed: 0,work_year_2020,work_year_2021,work_year_2022,work_year_2023,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_CT,employment_type_FL,...,company_location_SK,company_location_TH,company_location_TR,company_location_UA,company_location_US,company_location_VN,company_size_L,company_size_M,company_size_S,salary_in_usd
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,85847
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,30000
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,25500
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,175000
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,120000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3750,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,412000
3751,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,151000
3752,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,105000
3753,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,100000


In [280]:
# Split the dataset
df_train, df_test = train_test_split(df_preprocessed, test_size=0.3, random_state=42)

In [281]:
# Copy train dataset for a comparison without oversampling
df_train_without_oversampling = df_train.copy()

In [282]:
# Oversample using RESAMPLE
def oversample_data(df_input):
    # Separate majority and minority classes
    df_majority = df_input[df_input.employment_type_CT==1.0]
    df_minority_large = df_input[df_input.employment_type_FL==1.0]
    df_minority_small = df_input[df_input.employment_type_FL==1.0]

    # Upsample minority classes
    df_minority_large_upsampled = resample(df_minority_large, 
                                     replace=True,    # sample with replacement
                                     n_samples=len(df_majority),  # to match majority class
                                     random_state=123) # reproducible results

    df_minority_small_upsampled = resample(df_minority_small, 
                                     replace=True,    # sample with replacement
                                     n_samples=len(df_majority),  # to match majority class
                                     random_state=123) # reproducible results

    # Combine majority class with upsampled minority classes
    df_upsampled = pd.concat([df_majority, df_minority_large_upsampled, df_minority_small_upsampled])
    
    return df_upsampled

df_train = oversample_data(df_train)

In [283]:
# Prepare the data for model training
X_train = df_train.drop(['salary_in_usd'], axis=1)
y_train = df_train['salary_in_usd']
X_test = df_test.drop(['salary_in_usd'], axis=1)
y_test = df_test['salary_in_usd']

X_train_without_oversampling = df_train_without_oversampling.drop(['salary_in_usd'], axis=1)
y_train_without_oversampling = df_train_without_oversampling['salary_in_usd']

X_train = csr_matrix(X_train.values)
X_test = csr_matrix(X_test.values)
X_train_without_oversampling = csr_matrix(X_train_without_oversampling.values)

In [284]:
# Define the model
svr = SVR()
svr_without_oversampling = SVR()

# Define the hyperparameters
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}
param_grind_without_oversampling = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}

# Use GridSearchCV to find the optimal hyperparameters
grid = GridSearchCV(svr, param_grid, verbose=3)
grid_without_oversampling = GridSearchCV(svr_without_oversampling, param_grind_without_oversampling, verbose=3)

# Fit the model on the training data
grid.fit(X_train, y_train)
grid_without_oversampling.fit(X_train_without_oversampling, y_train_without_oversampling)

# Get the optimal hyperparameters
print(grid.best_params_)
print(grid_without_oversampling.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.787 total time=   2.1s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.269 total time=   2.1s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.041 total time=   2.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.112 total time=   2.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.416 total time=   2.0s
[CV 1/5] END ......C=0.1, gamma=1, kernel=poly;, score=-0.273 total time=   1.9s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.042 total time=   1.9s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.106 total time=   1.9s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.177 total time=   1.8s
[CV 5/5] END ......C=0.1, gamma=1, kernel=poly;, score=-0.030 total time=   1.8s
[CV 1/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-0.787 total time=   2.1s
[CV 2/5] END ...C=0.1, gamma=1, kernel=sigmoid;

[CV 2/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.269 total time=   2.1s
[CV 3/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.041 total time=   2.1s
[CV 4/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.111 total time=   2.1s
[CV 5/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.415 total time=   2.0s
[CV 1/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.787 total time=   2.1s
[CV 2/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.269 total time=   2.3s
[CV 3/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.041 total time=   2.2s
[CV 4/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.112 total time=   2.1s
[CV 5/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.416 total time=   2.3s
[CV 1/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.787 total time=   2.1s
[CV 2/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.269 total time=   1.9s
[CV 3/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.041 total time=   1.8s
[CV 4/5] END ....C=1, gamma=

[CV 4/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.177 total time=   1.8s
[CV 5/5] END ....C=100, gamma=0.1, kernel=poly;, score=-0.030 total time=   1.9s
[CV 1/5] END .C=100, gamma=0.1, kernel=sigmoid;, score=-0.131 total time=   2.1s
[CV 2/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.098 total time=   2.0s
[CV 3/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.177 total time=   2.0s
[CV 4/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.252 total time=   2.0s
[CV 5/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.076 total time=   2.0s
[CV 1/5] END ....C=100, gamma=0.01, kernel=rbf;, score=-0.550 total time=   2.0s
[CV 2/5] END ....C=100, gamma=0.01, kernel=rbf;, score=-0.172 total time=   2.0s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.019 total time=   1.9s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.022 total time=   1.9s
[CV 5/5] END ....C=100, gamma=0.01, kernel=rbf;, score=-0.249 total time=   1.9s
[CV 1/5] END ...C=100, gamma

[CV 5/5] END .........C=1, gamma=1, kernel=rbf;, score=-0.002 total time=   0.3s
[CV 1/5] END .........C=1, gamma=1, kernel=poly;, score=0.233 total time=   0.3s
[CV 2/5] END .........C=1, gamma=1, kernel=poly;, score=0.248 total time=   0.3s
[CV 3/5] END .........C=1, gamma=1, kernel=poly;, score=0.230 total time=   0.3s
[CV 4/5] END .........C=1, gamma=1, kernel=poly;, score=0.251 total time=   0.3s
[CV 5/5] END .........C=1, gamma=1, kernel=poly;, score=0.325 total time=   0.3s
[CV 1/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.004 total time=   0.3s
[CV 2/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.004 total time=   0.4s
[CV 3/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.003 total time=   0.3s
[CV 4/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.005 total time=   0.3s
[CV 5/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.002 total time=   0.3s
[CV 1/5] END .......C=1, gamma=0.1, kernel=rbf;, score=-0.003 total time=   0.3s
[CV 2/5] END .......C=1, gam

[CV 2/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.004 total time=   0.3s
[CV 3/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.003 total time=   0.3s
[CV 4/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.005 total time=   0.3s
[CV 5/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.002 total time=   0.3s
[CV 1/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.004 total time=   0.3s
[CV 2/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.004 total time=   0.3s
[CV 3/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.003 total time=   0.3s
[CV 4/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.005 total time=   0.3s
[CV 5/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.002 total time=   0.3s
[CV 1/5] END C=10, gamma=0.001, kernel=sigmoid;, score=-0.004 total time=   0.3s
[CV 2/5] END C=10, gamma=0.001, kernel=sigmoid;, score=-0.004 total time=   0.3s
[CV 3/5] END C=10, gamma=0.001, kernel=sigmoid;, score=-0.003 total time=   0.3s
[CV 4/5] END C=10, gamma=0.0

In [285]:
# Make predictions using the test set
y_pred = grid.predict(X_test)
y_pred_without_oversampling = grid_without_oversampling.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mse_without_oversampling = mean_squared_error(y_test, y_pred_without_oversampling)
r2 = r2_score(y_test, y_pred)
r2_without_oversampling = r2_score(y_test, y_pred_without_oversampling)

print("With oversampling")
print('Mean Squared Error:', mse)
print('R^2 Score:', r2)
print('_____________')
print("Without oversampling")
print('Mean Squared Error:', mse_without_oversampling)
print('R^2 Score:', r2_without_oversampling)

With oversampling
Mean Squared Error: 2547108988.294539
R^2 Score: 0.35754937156004374
_____________
Without oversampling
Mean Squared Error: 2408758156.0082846
R^2 Score: 0.3924452396033694
