In [4]:
!pip install gradio



In [5]:
# File Imports
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


import gradio as gr
# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

# Suppress Warnings
# Both filters below are installed for the whole kernel session, so they also
# affect every cell that runs after this one.
import warnings
from sklearn.exceptions import DataConversionWarning

# Ignore scikit-learn's DataConversionWarning regardless of message or module.
warnings.filterwarnings("ignore", category=DataConversionWarning)
# Ignore every FutureWarning. NOTE(review): this also hides deprecation notices
# from the libraries used below, so upcoming API removals will not be visible.
warnings.simplefilter("ignore", category=FutureWarning)

In [6]:
# ---------------------------------------------------------------------------
# 1. Read the raw salary table
# ---------------------------------------------------------------------------
file_path = "./resources/data_science_salaries.csv"  # adjust if the CSV lives elsewhere
salary_df = pd.read_csv(file_path)

# ---------------------------------------------------------------------------
# 2. Feature matrix: categorical columns only
# ---------------------------------------------------------------------------
categorical_features = [
    "job_title",
    "experience_level",
    "employment_type",
    "work_models",
    "company_size",
    "employee_residence",
    "company_location",
]
X = salary_df[categorical_features]

# ---------------------------------------------------------------------------
# 3. Target: salary_in_usd discretised into four ordered bins
# ---------------------------------------------------------------------------
# Five edges define four intervals; per the original author's note the edges
# were picked by visually inspecting the salary distribution.
bins = [1500, 50000, 156000, 176000, 750000]
group_names = ["Very Low", "Low", "Average", "High"]

salary_data = salary_df["salary_in_usd"]
# include_lowest=True closes the first interval on the left so a salary equal
# to the first edge is still binned.
# NOTE(review): pd.cut yields NaN for values outside the outer edges; confirm
# that no salary in the CSV falls outside [1500, 750000].
binned_data = pd.cut(
    salary_data,
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
y = binned_data  # categorical labels: "Very Low", "Low", "Average", "High"

# ---------------------------------------------------------------------------
# 4. One-hot encode the features, then hold out 20% of the rows
# ---------------------------------------------------------------------------
# handle_unknown="ignore" lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough",
)
X_encoded = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------------------
# 5. Fit AdaBoost with the previously selected hyper-parameters
# ---------------------------------------------------------------------------
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
# confirm the installed version still accepts it before upgrading.
best_params = dict(
    n_estimators=142,
    learning_rate=0.4832930238571752,
    algorithm='SAMME.R',
    random_state=61,
)

# fit() returns the estimator itself, so construction and training are chained.
best_ada = AdaBoostClassifier(**best_params).fit(X_train, y_train)

# Gradio Interface Function
def predict_salary(job_title, experience_level, employment_type, work_models, company_size, employee_residence, company_location):
    """Predict the salary bin for one profile using the fitted AdaBoost model.

    Each argument is the raw category value for the feature column of the same
    name. The values are assembled into a one-row DataFrame, encoded with the
    module-level ``preprocessor`` and classified by the module-level
    ``best_ada`` estimator.

    Args:
        job_title: Value for the ``job_title`` column.
        experience_level: Value for the ``experience_level`` column.
        employment_type: Value for the ``employment_type`` column.
        work_models: Value for the ``work_models`` column.
        company_size: Value for the ``company_size`` column.
        employee_residence: Value for the ``employee_residence`` column.
        company_location: Value for the ``company_location`` column.

    Returns:
        str: The predicted bin label, or a prompt listing the fields that were
        left empty when at least one argument is ``None`` or ``""``.
    """
    profile = {
        "job_title": job_title,
        "experience_level": experience_level,
        "employment_type": employment_type,
        "work_models": work_models,
        "company_size": company_size,
        "employee_residence": employee_residence,
        "company_location": company_location,
    }

    # An unset dropdown arrives as None. The encoder ignores unknown categories,
    # so it would encode that as an all-zero block and the model would return a
    # meaningless bin — ask the user to complete the form instead.
    unset_fields = [name for name, value in profile.items() if value in (None, "")]
    if unset_fields:
        return f"Please select a value for: {', '.join(unset_fields)}"

    # One-row frame whose column names match what the preprocessor was fitted on.
    user_input = pd.DataFrame([profile])

    # Apply the already-fitted one-hot encoding (no refit).
    user_encoded = preprocessor.transform(user_input)

    # predict() returns an array with one label per row; take the single result.
    predicted_bin = best_ada.predict(user_encoded)[0]

    return str(predicted_bin)

# Build the choice list for each dropdown from the values present in the data.
def _distinct_values(column_name):
    """Return the distinct values of one ``salary_df`` column as a list."""
    return list(salary_df[column_name].unique())


job_titles = _distinct_values("job_title")
experience_levels = _distinct_values("experience_level")
employment_types = _distinct_values("employment_type")
work_models = _distinct_values("work_models")
company_sizes = _distinct_values("company_size")
employee_residences = _distinct_values("employee_residence")
company_locations = _distinct_values("company_location")

# Create Gradio Interface
# Layout: a short description, two rows of dropdowns (one per feature), a
# button that triggers the prediction and a textbox that shows the result.
with gr.Blocks(title="Salary Prediction App") as demo:
    # NOTE(review): the text below says "Decision Tree" but the estimator used
    # by predict_salary is the AdaBoostClassifier `best_ada` — confirm wording.
    gr.Markdown("Select your details and get an estimated salary category using a Decision Tree model.")
    with gr.Row():
        job_title_input = gr.Dropdown(choices=job_titles, label="Job Title")
        experience_level_input = gr.Dropdown(choices=experience_levels, label="Experience Level")
        employment_type_input = gr.Dropdown(choices=employment_types, label="Employment Type")
        work_models_input = gr.Dropdown(choices=work_models, label="Work Model")
    with gr.Row():
        company_size_input = gr.Dropdown(choices=company_sizes, label="Company Size")
        employee_residence_input = gr.Dropdown(choices=employee_residences, label="Employee Residence")
        company_location_input = gr.Dropdown(choices=company_locations, label="Company Location")

    predict_button = gr.Button("Predict Salary")
    output_label = gr.Textbox(label="Predicted Salary")

    # Gradio passes `inputs` to `fn` positionally, so this list must follow the
    # parameter order of predict_salary exactly:
    #   job_title, experience_level, employment_type, work_models,
    #   company_size, employee_residence, company_location
    # (Previously company_size was listed last, which fed the residence,
    # location and size values into the wrong feature columns.)
    predict_button.click(
        fn=predict_salary,
        inputs=[
            job_title_input,
            experience_level_input,
            employment_type_input,
            work_models_input,
            company_size_input,
            employee_residence_input,
            company_location_input,
        ],
        outputs=output_label,
    )

# share=True additionally serves the app through a temporary public URL.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://13a9a1bf3f65b0fe07.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


