In [102]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

In [103]:
# Load your dataset
df = pd.read_csv('ds_salaries.csv')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [104]:
# Separate majority and minority classes
df_majority = df[df.company_size=='M']
df_minority_large = df[df.company_size=='L']
df_minority_small = df[df.company_size=='S']

# Upsample minority classes
df_minority_large_upsampled = resample(df_minority_large, 
                                 replace=True,    # sample with replacement
                                 n_samples=3153,  # to match majority class
                                 random_state=123) # reproducible results

df_minority_small_upsampled = resample(df_minority_small, 
                                 replace=True,    # sample with replacement
                                 n_samples=3153,  # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority classes
df_upsampled = pd.concat([df_majority, df_minority_large_upsampled, df_minority_small_upsampled])

# Display new class counts
df_upsampled.company_size.value_counts()


M    3153
L    3153
S    3153
Name: company_size, dtype: int64

In [106]:
# Split the dataset into training and test data
df_features = df_upsampled.copy(deep=True)
target = df_features["salary_in_usd"]
df_features.drop(["salary", "salary_in_usd"], axis=1, inplace=True)

In [107]:
# Encoding categorical features
oneHot_features = ["work_year", "employment_type", "remote_ratio", "job_title", "salary_currency", "employee_residence", "company_location","company_size", "experience_level"]
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), oneHot_features),
    ])
df_preprocessed = preprocessor.fit_transform(df_features)

In [111]:
# Split the data into training and test sets (30% of data will be used for testing)
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed, target, test_size=0.3, random_state=42)

# Check the shapes of the data
print("Training set - Features: ", X_train.shape, "Target: ", y_train.shape)
print("Testing set - Features: ", X_test.shape, "Target: ",y_test.shape)

Training set - Features:  (6621, 281) Target:  (6621,)
Testing set - Features:  (2838, 281) Target:  (2838,)


In [112]:
# Define the model
svr = SVR()

# Define the hyperparameters
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}

# Use GridSearchCV to find the optimal hyperparameters
grid = GridSearchCV(svr, param_grid, verbose=3)

# Fit the model on the training data
grid.fit(X_train, y_train)

# Get the optimal hyperparameters
print(grid.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.022 total time=   2.7s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.010 total time=   2.6s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.008 total time=   2.6s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.016 total time=   2.6s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.007 total time=   2.5s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.215 total time=   2.4s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.208 total time=   2.4s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.207 total time=   2.4s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.208 total time=   2.4s
[CV 5/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.215 total time=   2.4s
[CV 1/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-0.021 total time=   2.8s
[CV 2/5] END ...C=0.1, gamma=1, kernel=sigmoid;

[CV 2/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.009 total time=   2.3s
[CV 3/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.008 total time=   2.3s
[CV 4/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.016 total time=   2.3s
[CV 5/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.006 total time=   2.3s
[CV 1/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.022 total time=   2.5s
[CV 2/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.010 total time=   2.4s
[CV 3/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.008 total time=   2.4s
[CV 4/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.016 total time=   2.5s
[CV 5/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.007 total time=   2.6s
[CV 1/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.022 total time=   2.5s
[CV 2/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.010 total time=   2.3s
[CV 3/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.009 total time=   2.2s
[CV 4/5] END ....C=1, gamma=

[CV 4/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.208 total time=   2.3s
[CV 5/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.215 total time=   2.2s
[CV 1/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.261 total time=   2.4s
[CV 2/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.260 total time=   2.4s
[CV 3/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.254 total time=   2.4s
[CV 4/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.253 total time=   2.4s
[CV 5/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.262 total time=   2.4s
[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.071 total time=   2.4s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.080 total time=   2.4s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.083 total time=   2.4s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.075 total time=   2.4s
[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.087 total time=   2.4s
[CV 1/5] END ...C=100, gamma

In [113]:
# Make predictions using the test set
y_pred = grid.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error:', mse)
print('R^2 Score:', r2)

Mean Squared Error: 1275145038.3116114
R^2 Score: 0.7513297508270133
