In [2]:
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Data loading and feature encoding
# ---------------------------------------------------------------------------
# Read the salary dataset and discard 'work_year' when the file contains it.
df = pd.read_csv("resources/data_science_salaries.csv").drop(
    columns=["work_year"], errors="ignore"
)

# Columns holding categorical values; every one of them is label-encoded below
# and used as a model feature.
categorical_columns = [
    "job_title",
    "experience_level",
    "employment_type",
    "work_models",
    "company_size",
    "employee_residence",
    "company_location",
]

# One LabelEncoder per categorical column, kept so user input can be encoded
# the same way at prediction time.
encoders = {column_name: LabelEncoder() for column_name in categorical_columns}
for column_name, column_encoder in encoders.items():
    # Replace the raw values in the frame with their integer codes.
    df[column_name] = column_encoder.fit_transform(df[column_name])

# Original (decoded) values of each column, used to populate the UI dropdowns.
decoded_options = {
    column_name: column_encoder.classes_.tolist()
    for column_name, column_encoder in encoders.items()
}

# The feature matrix consists solely of the encoded categorical columns.
feature_columns = categorical_columns
X = df[feature_columns]

# ---------------------------------------------------------------------------
# Targets
# ---------------------------------------------------------------------------
# Regression target: natural log of the salary.
df["log_salary"] = np.log(df["salary"])

# Classification target: salary bucketed into four labelled ranges.
bins = [1500, 50000, 156000, 176000, 750000]  # Bin edges
group_names = ["Very Low", "Low", "Average", "High"]  # One label per bin
df["salary_category"] = pd.cut(
    df["salary"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)

# AdaBoost is trained on integer class codes; the encoder is kept so that
# predictions can be mapped back to the label text.
salary_encoder = LabelEncoder()
df["salary_category"] = salary_encoder.fit_transform(df["salary_category"])

y_salary = df["log_salary"]  # Ridge regression target
y_classification = df["salary_category"]  # AdaBoost classification target

# ---------------------------------------------------------------------------
# Train/test splits (same test size and seed for both targets)
# ---------------------------------------------------------------------------
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train_salary, y_test_salary = train_test_split(
    X, y_salary, **split_options
)
(
    X_train_class,
    X_test_class,
    y_train_classification,
    y_test_classification,
) = train_test_split(X, y_classification, **split_options)

# ---------------------------------------------------------------------------
# Model fitting
# ---------------------------------------------------------------------------
# Ridge regression on the log-salary target.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train_salary)

# AdaBoost classifier on the salary-category target.
best_params = dict(
    n_estimators=142,
    learning_rate=0.4832930238571752,
    algorithm="SAMME",  # Fixed invalid parameter
    random_state=61,
)
best_ada = AdaBoostClassifier(**best_params)
best_ada.fit(X_train_class, y_train_classification)

# Function to preprocess user input
def preprocess_input(job_title, experience_level, employment_type, work_models, company_size, employee_residence, company_location):
    """Convert user input into numerical format for model prediction.

    Each argument is the raw (un-encoded) value for the feature of the same
    name. Every value is encoded with the matching entry of the module-level
    ``encoders`` mapping.

    Returns:
        A single-row ``pandas.DataFrame`` with columns ``feature_columns``
        holding the encoded values in parameter order.

    Raises:
        ValueError: If a value is ``None`` (e.g. nothing was selected in the
            UI) or is rejected by its encoder. The message names the
            offending field.
    """
    # Keyed by encoder name; insertion order matches the parameter order,
    # which is the order the values are placed into the row.
    raw_values = {
        "job_title": job_title,
        "experience_level": experience_level,
        "employment_type": employment_type,
        "work_models": work_models,
        "company_size": company_size,
        "employee_residence": employee_residence,
        "company_location": company_location,
    }

    encoded_row = []
    for field_name, raw_value in raw_values.items():
        if raw_value is None:
            # Fail early with a message that says which input is missing
            # instead of the encoder's generic complaint.
            raise ValueError(f"No value provided for '{field_name}'.")
        try:
            encoded_row.append(encoders[field_name].transform([raw_value])[0])
        except ValueError as err:
            # Re-raise with the field name so the caller can tell which
            # input was rejected; the original error stays chained.
            raise ValueError(
                f"Unrecognized value {raw_value!r} for '{field_name}'."
            ) from err

    return pd.DataFrame([encoded_row], columns=feature_columns)

# Prediction function
def predict_salary_and_category(job_title, experience_level, employment_type, work_models, company_size, employee_residence, company_location):
    """Return display strings for the predicted salary and salary category.

    The raw inputs are encoded with ``preprocess_input``. The salary comes
    from ``ridge_model`` (whose output is exponentiated), and the category
    comes from ``best_ada`` (whose output is decoded with ``salary_encoder``).

    Returns:
        A 2-tuple ``(salary_text, category_text)`` of formatted strings.
    """
    features = preprocess_input(
        job_title,
        experience_level,
        employment_type,
        work_models,
        company_size,
        employee_residence,
        company_location,
    )

    # The Ridge model predicts log-salary; exponentiate to undo the log.
    predicted_salary = np.exp(ridge_model.predict(features)[0])

    # AdaBoost yields an encoded class; map it back to its label.
    category_code = best_ada.predict(features)[0]
    category_name = salary_encoder.inverse_transform([category_code])[0]

    salary_text = f"Predicted Salary: ${predicted_salary:,.2f}"
    category_text = f"Salary Category: {category_name}"
    return salary_text, category_text

# Define Gradio Interface
def _category_dropdown(column, label):
    """Create a dropdown listing the decoded values of one categorical column."""
    return gr.Dropdown(choices=decoded_options[column], label=label)


with gr.Blocks() as interface:
    gr.Markdown("# Salary Prediction and Classification")

    # Inputs: three rows of two dropdowns, then one dropdown on its own.
    with gr.Row():
        job_title = _category_dropdown("job_title", "Job Title")
        experience_level = _category_dropdown("experience_level", "Experience Level")

    with gr.Row():
        employment_type = _category_dropdown("employment_type", "Employment Type")
        work_models = _category_dropdown("work_models", "Work Models")

    with gr.Row():
        company_size = _category_dropdown("company_size", "Company Size")
        employee_residence = _category_dropdown("employee_residence", "Employee Residence")

    company_location = _category_dropdown("company_location", "Company Location")

    predict_button = gr.Button("Predict")

    # Read-only outputs, filled in when the button is clicked.
    salary_output = gr.Textbox(label="Predicted Salary", interactive=False)
    category_output = gr.Textbox(label="Salary Classification", interactive=False)

    # Component order here must match the parameter order of
    # predict_salary_and_category.
    prediction_inputs = [
        job_title,
        experience_level,
        employment_type,
        work_models,
        company_size,
        employee_residence,
        company_location,
    ]
    predict_button.click(
        fn=predict_salary_and_category,
        inputs=prediction_inputs,
        outputs=[salary_output, category_output],
    )

# Launch the Interface
interface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://ac292a8d96ec07cc5b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


