In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix

In [8]:
# Load your dataset
df = pd.read_csv('ds_salaries.csv')
df = df.drop(['salary'], axis=1)

In [9]:
# Preprocessing
# Split the dataset into training and test data
df_target = df['salary_in_usd']

# Remove target variable from the DataFrame
df_features = df.drop('salary_in_usd', axis=1)

# Initialize the OneHotEncoder from sklearn
encoder = OneHotEncoder(sparse=False)

# Fit and transform the features DataFrame
df_features_encoded = encoder.fit_transform(df_features)

# Convert the array back to a DataFrame
df_features_encoded = pd.DataFrame(df_features_encoded, columns=encoder.get_feature_names_out(df_features.columns))

# Add the target variable back to the DataFrame
df_preprocessed = pd.concat([df_features_encoded, df_target], axis=1)

In [10]:
# Split the dataset
df_train, df_test = train_test_split(df_preprocessed, test_size=0.3, random_state=42)

In [11]:
# Copy train dataset for a comparison without oversampling
df_train_without_oversampling = df_train.copy()

In [12]:
# Oversample using RESAMPLE
def oversample_data(df_input):
    # Separate majority and minority classes
    df_majority = df_input[df_input.employment_type_FT==1.0]
    df_minority_FL = df_input[df_input.employment_type_FL==1.0]
    df_minority_CT = df_input[df_input.employment_type_CT==1.0]
    df_minority_PT = df_input[df_input.employment_type_PT==1.0]

    # Upsample minority classes
    df_minority_FL_upsampled = resample(df_minority_FL, 
                                     replace=True,    # sample with replacement
                                     n_samples=len(df_majority),  # to match majority class
                                     random_state=123) # reproducible results

    df_minority_CT_upsampled = resample(df_minority_CT, 
                                     replace=True,    # sample with replacement
                                     n_samples=len(df_majority),  # to match majority class
                                     random_state=123) # reproducible results
    
    df_minority_PT_upsampled = resample(df_minority_PT, 
                                     replace=True,    # sample with replacement
                                     n_samples=len(df_majority),  # to match majority class
                                     random_state=123) # reproducible results

    # Combine majority class with upsampled minority classes
    df_upsampled = pd.concat([df_majority, df_minority_FL_upsampled, df_minority_CT_upsampled, df_minority_PT_upsampled])
    
    return df_upsampled

df_train = oversample_data(df_train)

In [13]:
# Prepare the data for model training
X_train = df_train.drop(['salary_in_usd'], axis=1)
y_train = df_train['salary_in_usd']
X_test = df_test.drop(['salary_in_usd'], axis=1)
y_test = df_test['salary_in_usd']

X_train_without_oversampling = df_train_without_oversampling.drop(['salary_in_usd'], axis=1)
y_train_without_oversampling = df_train_without_oversampling['salary_in_usd']

X_train = csr_matrix(X_train.values)
X_test = csr_matrix(X_test.values)
X_train_without_oversampling = csr_matrix(X_train_without_oversampling.values)

In [14]:
# Define the model
svr = SVR()
svr_without_oversampling = SVR()

# Define the hyperparameters
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}
param_grind_without_oversampling = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}

# Use GridSearchCV to find the optimal hyperparameters
grid = GridSearchCV(svr, param_grid, verbose=3)
grid_without_oversampling = GridSearchCV(svr_without_oversampling, param_grind_without_oversampling, verbose=3)

# Fit the model on the training data
grid.fit(X_train, y_train)
grid_without_oversampling.fit(X_train_without_oversampling, y_train_without_oversampling)

# Get the optimal hyperparameters
print(grid.best_params_)
print(grid_without_oversampling.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-2.089 total time=   3.8s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.151 total time=   4.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.101 total time=   5.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.106 total time=   5.9s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-2.229 total time=   4.8s
[CV 1/5] END ......C=0.1, gamma=1, kernel=poly;, score=-1.335 total time=   3.7s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.403 total time=   4.1s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.035 total time=   4.2s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.009 total time=   4.0s
[CV 5/5] END ......C=0.1, gamma=1, kernel=poly;, score=-0.716 total time=   4.5s
[CV 1/5] END ...C=0.1, gamma=1, kernel=sigmoid;, score=-2.087 total time=   5.5s
[CV 2/5] END ...C=0.1, gamma=1, kernel=sigmoid;

[CV 2/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.150 total time=   4.2s
[CV 3/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.101 total time=   4.3s
[CV 4/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-0.106 total time=   4.2s
[CV 5/5] END ..C=1, gamma=0.01, kernel=sigmoid;, score=-2.223 total time=   4.0s
[CV 1/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-2.089 total time=   3.9s
[CV 2/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.151 total time=   4.3s
[CV 3/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.101 total time=   4.5s
[CV 4/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-0.107 total time=   4.3s
[CV 5/5] END .....C=1, gamma=0.001, kernel=rbf;, score=-2.227 total time=   4.2s
[CV 1/5] END ....C=1, gamma=0.001, kernel=poly;, score=-2.090 total time=   3.7s
[CV 2/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.151 total time=   4.0s
[CV 3/5] END ....C=1, gamma=0.001, kernel=poly;, score=-0.101 total time=   4.3s
[CV 4/5] END ....C=1, gamma=

[CV 4/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.009 total time=   3.7s
[CV 5/5] END ....C=100, gamma=0.1, kernel=poly;, score=-0.716 total time=   4.3s
[CV 1/5] END .C=100, gamma=0.1, kernel=sigmoid;, score=-1.044 total time=   3.9s
[CV 2/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.375 total time=   4.7s
[CV 3/5] END .C=100, gamma=0.1, kernel=sigmoid;, score=-0.018 total time=   4.8s
[CV 4/5] END .C=100, gamma=0.1, kernel=sigmoid;, score=-0.040 total time=   4.6s
[CV 5/5] END .C=100, gamma=0.1, kernel=sigmoid;, score=-0.412 total time=   4.6s
[CV 1/5] END ....C=100, gamma=0.01, kernel=rbf;, score=-1.786 total time=   4.0s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.081 total time=   4.7s
[CV 3/5] END ....C=100, gamma=0.01, kernel=rbf;, score=-0.061 total time=   4.6s
[CV 4/5] END ....C=100, gamma=0.01, kernel=rbf;, score=-0.069 total time=   4.7s
[CV 5/5] END ....C=100, gamma=0.01, kernel=rbf;, score=-1.385 total time=   4.8s
[CV 1/5] END ...C=100, gamma

[CV 5/5] END .........C=1, gamma=1, kernel=rbf;, score=-0.002 total time=   0.3s
[CV 1/5] END .........C=1, gamma=1, kernel=poly;, score=0.233 total time=   0.3s
[CV 2/5] END .........C=1, gamma=1, kernel=poly;, score=0.248 total time=   0.3s
[CV 3/5] END .........C=1, gamma=1, kernel=poly;, score=0.230 total time=   0.3s
[CV 4/5] END .........C=1, gamma=1, kernel=poly;, score=0.251 total time=   0.3s
[CV 5/5] END .........C=1, gamma=1, kernel=poly;, score=0.325 total time=   0.3s
[CV 1/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.004 total time=   0.3s
[CV 2/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.004 total time=   0.3s
[CV 3/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.003 total time=   0.3s
[CV 4/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.005 total time=   0.3s
[CV 5/5] END .....C=1, gamma=1, kernel=sigmoid;, score=-0.002 total time=   0.3s
[CV 1/5] END .......C=1, gamma=0.1, kernel=rbf;, score=-0.003 total time=   0.3s
[CV 2/5] END .......C=1, gam

[CV 2/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.004 total time=   0.3s
[CV 3/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.003 total time=   0.3s
[CV 4/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.005 total time=   0.3s
[CV 5/5] END ....C=10, gamma=0.001, kernel=rbf;, score=-0.002 total time=   0.3s
[CV 1/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.004 total time=   0.3s
[CV 2/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.004 total time=   0.3s
[CV 3/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.003 total time=   0.3s
[CV 4/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.005 total time=   0.3s
[CV 5/5] END ...C=10, gamma=0.001, kernel=poly;, score=-0.002 total time=   0.3s
[CV 1/5] END C=10, gamma=0.001, kernel=sigmoid;, score=-0.004 total time=   0.3s
[CV 2/5] END C=10, gamma=0.001, kernel=sigmoid;, score=-0.004 total time=   0.3s
[CV 3/5] END C=10, gamma=0.001, kernel=sigmoid;, score=-0.003 total time=   0.3s
[CV 4/5] END C=10, gamma=0.0

In [15]:
# Make predictions using the test set
y_pred = grid.predict(X_test)
y_pred_without_oversampling = grid_without_oversampling.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mse_without_oversampling = mean_squared_error(y_test, y_pred_without_oversampling)
r2 = r2_score(y_test, y_pred)
r2_without_oversampling = r2_score(y_test, y_pred_without_oversampling)

print("With oversampling")
print('Mean Squared Error:', mse)
print('R^2 Score:', r2)
print('_____________')
print("Without oversampling")
print('Mean Squared Error:', mse_without_oversampling)
print('R^2 Score:', r2_without_oversampling)

With oversampling
Mean Squared Error: 2410441170.7807093
R^2 Score: 0.39202073719566444
_____________
Without oversampling
Mean Squared Error: 2408758156.0082846
R^2 Score: 0.3924452396033694
