In [46]:
import pandas as pd
# Read the raw salaries dataset; the path is relative to the current working directory.
RAW_SALARIES_CSV = "./ds_salaries.csv"
df = pd.read_csv(RAW_SALARIES_CSV)

In [47]:
# De-duplicate into a new frame so the raw `df` is left untouched.
# The original row labels are kept (the index is not reset).
df_features = df.drop_duplicates()

# Target column, taken after de-duplication so it stays row-aligned with the features.
target = df_features["salary_in_usd"]

# Remove both salary columns and company_location from the feature frame.
df_features = df_features.drop(columns=["salary", "salary_in_usd", "company_location"])


# Create preprocessor

In [48]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Columns that get one-hot encoded. The order here fixes the order of the
# encoded output columns, so keep it stable.
one_hot_features = [
    "work_year",
    "employment_type",
    "remote_ratio",
    "job_title",
    "salary_currency",
    "employee_residence",
    "company_size",
    "experience_level",
]

# Single 'cat' step: one-hot encode every listed column. Columns not listed
# are dropped by ColumnTransformer's default `remainder`.
categorical_encoder = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_encoder, one_hot_features)],
)

# Fit on the full (de-duplicated) feature frame and encode it in one pass.
df_preprocessed = preprocessor.fit_transform(df_features)

# NOTE(review): looks like a leftover debug marker — consider removing.
print("TEST")

TEST


# Split into train/test sets and export (NPZ + CSV)

In [49]:
from sklearn.model_selection import train_test_split

# Bucket the target into equal-width bins; labels=False returns the integer
# bin code per row, which is used as the stratification label for the split.
n_salary_bins = 6
target_stratified = pd.cut(x=target, bins=n_salary_bins, labels=False)

print(target_stratified)

0       1
1       0
2       0
3       2
4       1
       ..
3750    5
3751    1
3752    1
3753    1
3754    1
Name: salary_in_usd, Length: 2584, dtype: int64


In [50]:
# Split into 80/20 Training / Testing
# Both splits share the same hold-out fraction and seed.
split_kwargs = {"test_size": 0.2, "random_state": 42}

# Plain random split.
X_train, X_holdout, target_train, target_holdout = train_test_split(
    df_preprocessed, target, **split_kwargs
)

# Same settings, but stratified on the binned target codes.
(
    X_train_stratified,
    X_holdout_stratified,
    target_train_stratified,
    target_holdout_stratified,
) = train_test_split(
    df_preprocessed, target, stratify=target_stratified, **split_kwargs
)

In [51]:
from scipy.sparse import save_npz

# Persist the sparse feature matrices (plain and stratified splits) as .npz files.
sparse_exports = {
    './ds_salaries_GeneralPreprocessing_train.npz': X_train,
    './ds_salaries_GeneralPreprocessing_train_stratified.npz': X_train_stratified,
    './ds_salaries_GeneralPreprocessing_test.npz': X_holdout,
    './ds_salaries_GeneralPreprocessing_test_stratified.npz': X_holdout_stratified,
}
for npz_path, sparse_split in sparse_exports.items():
    save_npz(npz_path, sparse_split)

# Persist the matching targets as CSV files without the index column.
target_exports = {
    './ds_salaries_target_train.csv': target_train,
    './ds_salaries_target_train_stratified.csv': target_train_stratified,
    './ds_salaries_target_test.csv': target_holdout,
    './ds_salaries_target_test_stratified.csv': target_holdout_stratified,
}
for csv_path, target_split in target_exports.items():
    target_split.to_csv(csv_path, index=False)

# Summary statistics of the non-stratified train / hold-out targets.
display(target_train.describe())
display(target_holdout.describe())

# Save as CSV (for oversampling notebook)
# Feature names after one-hot encoding
feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(one_hot_features)

# Densify the stratified training matrix and wrap it in a labelled DataFrame.
# NOTE(review): this CSV holds the *stratified* training split even though
# its filename does not say so.
X_train_dense = X_train_stratified.toarray()
X_train_dense_csv = pd.DataFrame(X_train_dense, columns=feature_names)
display(X_train_dense_csv)

X_train_dense_csv.to_csv('./ds_salaries_GeneralPreprocessing_train_csv.csv', index=False)

count      2067.000000
mean     133570.320271
std       67383.813058
min        5132.000000
25%       85000.000000
50%      130000.000000
75%      175000.000000
max      450000.000000
Name: salary_in_usd, dtype: float64

count       517.000000
mean     132765.431335
std       66200.821478
min        5409.000000
25%       80000.000000
50%      130000.000000
75%      180000.000000
max      430967.000000
Name: salary_in_usd, dtype: float64

Unnamed: 0,work_year_2020,work_year_2021,work_year_2022,work_year_2023,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,remote_ratio_0,remote_ratio_50,...,employee_residence_US,employee_residence_UZ,employee_residence_VN,company_size_L,company_size_M,company_size_S,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2063,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2064,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2065,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
