In [29]:

# Import necessary libraries
import pandas as pd
import gradio as gr
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Load dataset
file_path = "./resources/data_science_salaries.csv"  # Update path if needed
salary_df = pd.read_csv(file_path)

# Define features and target variable
categorical_features = ["job_title", "experience_level", "employment_type", "work_models",
                        "company_size", "employee_residence", "company_location"]
numerical_features = ["work_year"]

# Bin the salary target (salary_in_usd) into four ordered categories.
# The edges are hand-picked (originally from visual inspection of the salary
# distribution). With include_lowest=True the intervals are:
#   [1500, 50000], (50000, 156000], (156000, 176000], (176000, 750000]
bins = [1500, 50000, 156000, 176000, 750000]

# One label per interval, in ascending salary order
group_names = ["Very Low", "Low", "Average", "High"]

salary_data = salary_df["salary_in_usd"]
# Slice the data and place it into bins
binned_data = pd.cut(salary_data, bins, labels=group_names, include_lowest=True)

# pd.cut returns NaN for missing salaries and for values outside
# [bins[0], bins[-1]]. The classifier cannot be trained on NaN labels, so only
# rows that actually received a label are used. When every salary is inside
# the bin range this mask is all True and nothing is dropped.
labelled_rows = binned_data.notna()
dropped_count = int((~labelled_rows).sum())
if dropped_count:
    print(f"Dropped {dropped_count} rows whose salary falls outside the bin range.")

X = salary_df.loc[labelled_rows, categorical_features + numerical_features]
y = binned_data[labelled_rows]  # "Very Low", "Low", "Average" or "High"

# One-hot encode the categorical columns; work_year is passed through unchanged.
# handle_unknown="ignore" lets prediction-time categories that were never seen
# during fitting encode as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)],
    remainder="passthrough"
)

# Transform features and split dataset.
# NOTE: the encoder is fitted on all labelled rows before the split, so every
# category present anywhere in the data gets its own column.
X_encoded = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a shallow Decision Tree Classifier (depth capped at 4, fixed seed)
dtc_model = DecisionTreeClassifier(max_depth=4, random_state=42)
dtc_model.fit(X_train, y_train)

# Gradio Interface Function
def predict_salary(job_title, experience_level, employment_type, work_models, work_year, company_size, employee_residence, company_location):
    """Predict the salary bin for one profile using the trained Decision Tree.

    The parameters follow the order of the Gradio input components. All
    values except ``work_year`` are forwarded to the fitted preprocessor
    unchanged; ``work_year`` is converted to ``int`` first.

    Returns:
        ``"Predicted Salary Bin: <label>"`` on success, or a short prompt
        asking for a valid work year when ``work_year`` is empty (``None``),
        not numeric, NaN or infinite.
    """
    # A blank Number field arrives as None, and int(None) raises TypeError.
    # NaN / non-numeric text raise ValueError and infinity raises
    # OverflowError. Report these to the user instead of failing.
    try:
        year = int(work_year)
    except (TypeError, ValueError, OverflowError):
        return "Please enter a valid work year (e.g. 2023)."

    # Single-row frame using the same column names the preprocessor was fitted on
    user_input = pd.DataFrame([{
        "job_title": job_title,
        "experience_level": experience_level,
        "employment_type": employment_type,
        "work_models": work_models,
        "company_size": company_size,
        "employee_residence": employee_residence,
        "company_location": company_location,
        "work_year": year
    }])

    # Apply the already-fitted one-hot encoding (transform only, no refit)
    user_encoded = preprocessor.transform(user_input)

    # Predict salary bin; predict() returns one label per row, so take the first
    predicted_bin = dtc_model.predict(user_encoded)[0]

    return f"Predicted Salary Bin: {predicted_bin}"

# Extract unique values for dropdowns (one choice per distinct value in the data)
job_titles = list(salary_df["job_title"].unique())
experience_levels = list(salary_df["experience_level"].unique())
employment_types = list(salary_df["employment_type"].unique())
work_models = list(salary_df["work_models"].unique())
company_sizes = list(salary_df["company_size"].unique())
employee_residences = list(salary_df["employee_residence"].unique())
company_locations = list(salary_df["company_location"].unique())

# Pre-fill the year field with the most recent year in the data so that an
# untouched form does not submit an empty (None) work year.
latest_work_year = int(salary_df["work_year"].max())

# Create Gradio Interface.
# The input order below must match the parameter order of predict_salary.
gr_interface = gr.Interface(
    fn=predict_salary,
    inputs=[
        gr.Dropdown(choices=job_titles, label="Job Title"),
        gr.Dropdown(choices=experience_levels, label="Experience Level"),
        gr.Dropdown(choices=employment_types, label="Employment Type"),
        gr.Dropdown(choices=work_models, label="Work Model"),
        # precision=0 makes the component deliver whole numbers only
        gr.Number(label="Work Year", value=latest_work_year, precision=0),
        gr.Dropdown(choices=company_sizes, label="Company Size"),
        gr.Dropdown(choices=employee_residences, label="Employee Residence"),
        gr.Dropdown(choices=company_locations, label="Company Location")
    ],
    outputs=gr.Textbox(label="Predicted Salary Bin"),
    title="Salary Prediction App",
    description="Select your details and get an estimated salary category using a Decision Tree model."
)

# Launch Gradio App (starts a local web server)
gr_interface.launch()


* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


