In [37]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz

In [38]:
# Read the preprocessed training features and the training targets from CSV.
df_feature = pd.read_csv(
    filepath_or_buffer='ds_salaries_GeneralPreprocessing_train_csv.csv',
)
df_target = pd.read_csv(
    filepath_or_buffer='ds_salaries_target_train.csv',
)

In [39]:
# Put features and target side by side in one frame (pandas aligns the two
# inputs on their row index) so rows can be resampled together.
df_train_complete = pd.concat(objs=[df_feature, df_target], axis='columns')

In [40]:
# Oversample the minority employment-type classes with sklearn's `resample`.
def oversample_data(df_input,
                    majority_col='employment_type_FT',
                    minority_cols=('employment_type_FL',
                                   'employment_type_CT',
                                   'employment_type_PT'),
                    random_state=123):
    """Upsample every minority class to the size of the majority class.

    A row belongs to a class when its indicator column equals ``1.0``.
    Each minority class is drawn with replacement until it has as many rows
    as the majority class; the majority rows are passed through unchanged.
    Rows that belong to none of the listed classes are not part of the result.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame holding one indicator column per class.
    majority_col : str, default 'employment_type_FT'
        Indicator column of the majority class.
    minority_cols : iterable of str, default FL, CT and PT indicator columns
        Indicator columns of the classes to upsample, in output order.
    random_state : int, default 123
        Seed handed to every ``resample`` call so results are reproducible.

    Returns
    -------
    pandas.DataFrame
        Majority rows followed by the upsampled rows of each non-empty
        minority class. The index is not reset.

    Raises
    ------
    KeyError
        If ``majority_col`` or one of ``minority_cols`` is not a column of
        ``df_input``.
    """
    df_majority = df_input[df_input[majority_col] == 1.0]
    n_target = len(df_majority)

    frames = [df_majority]
    for col in minority_cols:
        df_minority = df_input[df_input[col] == 1.0]
        if df_minority.empty:
            # Nothing to draw from: resampling an empty frame raises, and a
            # class with no rows cannot be upsampled anyway.
            continue
        frames.append(
            resample(df_minority,
                     replace=True,               # sample with replacement
                     n_samples=n_target,         # match the majority class
                     random_state=random_state)  # reproducible results
        )

    # Combine the majority class with the upsampled minority classes.
    return pd.concat(frames)

# Balance the employment-type classes of the combined training frame.
df_train_oversampled = oversample_data(df_input=df_train_complete)

In [41]:
# Split the oversampled frame into the target series and a sparse (CSR)
# matrix built from every column except 'salary_in_usd'.
df_train_oversampled_target = df_train_oversampled['salary_in_usd']
df_train_oversampled_features = csr_matrix(
    df_train_oversampled.drop(columns=['salary_in_usd']).values
)

In [42]:
# Persist the sparse feature matrix (.npz) and the target column (.csv,
# written without the index column).
save_npz(
    file='./ds_salaries_Oversampling_features_train.npz',
    matrix=df_train_oversampled_features,
)
df_train_oversampled_target.to_csv(
    path_or_buf='./ds_salaries_Oversampling_target_train.csv',
    index=False,
)