# In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
import pickle
import json

# Read the raw survey CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="dataset/csv/survey_results_public.csv")

def transform_categories(categories, cutoff):
    """Build a mapping that folds infrequent categories into ``'Other'``.

    Args:
        categories: Counts keyed by category label. Typically the pandas
            Series returned by ``value_counts()`` (label in the index, count
            as the value), but any object whose ``items()`` yields
            ``(label, count)`` pairs works, e.g. a dict or ``Counter``.
        cutoff: Minimum count (inclusive) a label needs to keep its own name.

    Returns:
        A dict mapping every label to itself when its count is at least
        ``cutoff`` and to the string ``'Other'`` otherwise.
    """
    # Iterate label/count pairs directly instead of indexing by position.
    return {
        label: (label if count >= cutoff else 'Other')
        for label, count in categories.items()
    }

# Keep only the columns used for modelling and expose the compensation
# column under the shorter name "Salary".
selected_columns = ["Country", "EdLevel", "YearsCodePro", "Employment", "ConvertedCompYearly"]
df = df[selected_columns].rename(columns={"ConvertedCompYearly": "Salary"})

# Discard rows without a salary, then rows missing any other kept field.
df = df.loc[df["Salary"].notnull()].dropna()

# Countries with fewer than 50 remaining rows are grouped under 'Other'.
country_map = transform_categories(df["Country"].value_counts(), 50)
df['Country'] = df['Country'].map(country_map)

# Employment answers that count as full-time work for this model.
full_time_variations = [
    'Employed, full-time',
    'Employed, full-time;Independent contractor, freelancer, or self-employed',
    'Employed, full-time;Employed, part-time',
    'Employed, full-time;Independent contractor, freelancer, or self-employed;Employed, part-time',
    'Employed, full-time;Retired'
]
is_full_time = df["Employment"].isin(full_time_variations)
# The column has served its purpose as a filter and is not a model feature.
df = df.loc[is_full_time].drop(columns="Employment")

# Fail early rather than letting later steps choke on an empty frame.
if df.shape[0] == 0:
    raise ValueError("No samples left in the dataset after preprocessing.")

def transform_experience(x):
    """Convert a professional-experience answer to a number of years.

    The two textual answers are mapped to fixed values
    (``'More than 50 years'`` -> ``50``, ``'Less than 1 year'`` -> ``0.5``);
    anything else is passed through ``float()``.
    """
    textual_answers = {
        'More than 50 years': 50,
        'Less than 1 year': 0.5,
    }
    if x in textual_answers:
        return textual_answers[x]
    return float(x)

# Replace the textual experience answers with numeric years, element by element.
df['YearsCodePro'] = df['YearsCodePro'].map(transform_experience)

def transform_education(x):
    """Collapse an education answer into one of four coarse levels.

    The answer is checked for known substrings in priority order; the first
    match decides the level. Answers matching none of them are reported as
    ``'Less than a Bachelors'``.
    """
    # (substrings to look for, resulting level) - order matters.
    rules = (
        (('Bachelor’s degree',), 'Bachelor’s degree'),
        (('Master’s degree',), 'Master’s degree'),
        (('Professional degree', 'Other doctoral'), 'Post grad'),
    )
    for needles, level in rules:
        if any(needle in x for needle in needles):
            return level
    return 'Less than a Bachelors'

# Collapse the education answers into coarse levels before encoding.
df['EdLevel'] = df['EdLevel'].map(transform_education)

# Fit one encoder per categorical column and swap the labels for their
# integer codes. The fitted encoders are kept for use at prediction time.
label_education = LabelEncoder().fit(df['EdLevel'])
df['EdLevel'] = label_education.transform(df['EdLevel'])

label_country = LabelEncoder().fit(df['Country'])
df['Country'] = label_country.transform(df['Country'])

# Separate the features from the target and hold out 20% of rows for testing.
X = df.drop(columns=["Salary"])
y = df.loc[:, "Salary"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Both partitions must contain at least one row.
if not (len(X_train) and len(X_test)):
    raise ValueError("No samples left for training or testing after splitting.")

# Search over tree depth for a random forest, scoring each candidate by
# negative mean squared error.
random_forest_reg = RandomForestRegressor(random_state=0)
max_depth = [None, 2, 4, 6, 8, 10, 12]
parameters = dict(max_depth=max_depth)
grid_search = GridSearchCV(
    estimator=random_forest_reg,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train.values)

# From here on only the winning estimator is used.
regressor = grid_search.best_estimator_

# Root-mean-squared error on the rows the model was fitted on
y_pred_train = regressor.predict(X_train)
train_error = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train))

# Root-mean-squared error on the held-out rows
y_pred_test = regressor.predict(X_test)
test_error = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))

# Report both errors as dollar amounts
print(f"Train RMSE: ${train_error:,.2f}")
print(f"Test RMSE: ${test_error:,.2f}")

# Bundle the fitted model with both encoders and write them to one pickle file
data = dict(model=regressor, label_country=label_country, label_education=label_education)
with open('saved_steps.pkl', mode='wb') as file:
    file.write(pickle.dumps(data))

# Read the bundle back so the prediction code below runs on the persisted
# objects rather than the in-memory ones
with open('saved_steps.pkl', mode='rb') as file:
    data = pickle.loads(file.read())

regressor_loaded, label_country, label_education = (
    data[key] for key in ("model", "label_country", "label_education")
)

# Function to transform JSON input into the format expected by the model
def transform_json_input(json_input):
    """Convert one client record into a single-row feature array.

    Args:
        json_input: Mapping with the keys ``'Country'``, ``'EdLevel'`` and
            ``'YearsCodePro'``. ``'EdLevel'`` may be either one of the
            labels the education encoder was fitted on or a raw answer that
            ``transform_education`` can collapse. ``'YearsCodePro'`` may be
            a number, a numeric string, or one of the textual answers
            handled by ``transform_experience``.

    Returns:
        ``numpy`` array of shape ``(1, 3)`` ordered as
        ``[country, ed_level, years_code_pro]``.

    Raises:
        KeyError: If one of the three keys is missing from ``json_input``.
        ValueError: If an encoder is asked to transform a label it was not
            fitted on, or the experience value cannot be parsed as a number.
    """
    country = json_input['Country']
    ed_level = json_input['EdLevel']
    years_code_pro = json_input['YearsCodePro']

    # Countries that were grouped away (or never seen) during training are
    # encoded as 'Other', mirroring the training-time mapping.
    country = country_map.get(country, 'Other')
    country = label_country.transform([country])[0]

    # Raw survey wording has to be collapsed the same way the training data
    # was. Labels the encoder already knows are passed through untouched:
    # re-running transform_education on 'Post grad' would demote it.
    if ed_level not in list(label_education.classes_):
        ed_level = transform_education(ed_level)
    ed_level = label_education.transform([ed_level])[0]

    # Accept the same textual experience answers the training data used so
    # the returned array is always numeric.
    years_code_pro = transform_experience(years_code_pro)

    return np.array([[country, ed_level, years_code_pro]])

# Example JSON input as it would arrive from the client. It must be defined
# here: the prediction below reads it, and leaving it commented out makes a
# fresh run fail with NameError.
json_input_from_client = {
    "Country": "Hong Kong (S.A.R.)",
    "EdLevel": "Bachelor’s degree",
    "YearsCodePro": 10
}

# Transform JSON input into the format expected by the model
X_new = transform_json_input(json_input_from_client)

# Make predictions on the new data point (reshape keeps the input as one
# row with one column per feature)
y_pred_new = regressor_loaded.predict(X_new.reshape(1, -1))
print(f"Predicted Salary: ${y_pred_new[0]:,.2f}")

# Output:
# Train RMSE: $357,794.42
# Test RMSE: $141,444.80
# Predicted Salary: $86,295.20


