In [3]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load dataset
# The CSV location defaults to the original author's local path, but can be
# overridden with the SALARY_CSV_PATH environment variable so the notebook can
# be re-run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "SALARY_CSV_PATH",
    "C:/Users/Admin/Desktop/Employee Salary Prediction App/salary.csv",
)
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,job_title,experience_level,employment_type,work_models,work_year,employee_residence,salary,salary_currency,salary_in_usd,company_location,company_size
0,Data Engineer,Mid-level,Full-time,Remote,2024,United States,148100,USD,148100,United States,Medium
1,Data Engineer,Mid-level,Full-time,Remote,2024,United States,98700,USD,98700,United States,Medium
2,Data Scientist,Senior-level,Full-time,Remote,2024,United States,140032,USD,140032,United States,Medium
3,Data Scientist,Senior-level,Full-time,Remote,2024,United States,100022,USD,100022,United States,Medium
4,BI Developer,Mid-level,Full-time,On-site,2024,United States,120000,USD,120000,United States,Medium


In [6]:
# Element-wise missing-value mask (True where a cell is null)
df.isna()

Unnamed: 0,job_title,experience_level,employment_type,work_models,work_year,employee_residence,salary,salary_currency,salary_in_usd,company_location,company_size
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
6594,False,False,False,False,False,False,False,False,False,False,False
6595,False,False,False,False,False,False,False,False,False,False,False
6596,False,False,False,False,False,False,False,False,False,False,False
6597,False,False,False,False,False,False,False,False,False,False,False


In [7]:
# Fetch live exchange rates (Replace with a real API key if needed)
def get_exchange_rates(timeout=10):
    """Return exchange rates keyed by currency code, with USD as the base.

    Queries the exchangerate-api.com ``latest/USD`` endpoint and returns the
    ``"rates"`` mapping from its JSON payload. If the request fails, times
    out, returns an HTTP error status, or the payload is not the expected
    shape, a small hard-coded table is returned instead and a
    ``RuntimeWarning`` is emitted so the fallback is visible to the reader.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).
        Without a timeout the call could block the notebook indefinitely.

    Returns
    -------
    dict
        Mapping of currency code to rate (``"USD"`` maps to ``1.0`` in the
        fallback table).
    """
    fallback_rates = {"USD": 1.0, "EUR": 0.91, "GBP": 0.78, "INR": 83.5, "AUD": 1.52, "CAD": 1.36}
    try:
        response = requests.get(
            "https://api.exchangerate-api.com/v4/latest/USD",
            timeout=timeout,
        )
        # Treat 4xx/5xx responses as failures rather than parsing an error body.
        response.raise_for_status()
        return response.json()["rates"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Only expected network/parsing failures are handled here; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        warnings.warn(
            f"Live exchange-rate lookup failed ({exc!r}); using built-in fallback rates.",
            RuntimeWarning,
            stacklevel=2,
        )
        return fallback_rates

# Look the rates up once and keep them at module level.
# NOTE(review): currency_rates is not referenced by any of the cells shown here.
currency_rates = get_exchange_rates()

In [8]:
# Columns used as model inputs
features = [
    "job_title",
    "experience_level",
    "employment_type",
    "work_models",
    "employee_residence",
    "company_location",
    "company_size",
]
# Column the model is trained to predict
target = "salary_in_usd"

In [9]:
# Separate the predictor columns from the regression target
X, y = df[features], df[target]

In [10]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# One-hot encode every selected feature column.
# handle_unknown="ignore" keeps the encoder from raising on categories
# that were not present when it was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), features),
    ]
)


In [12]:
# Full pipeline: one-hot encoding followed by a gradient-boosted regressor
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            GradientBoostingRegressor(
                n_estimators=200,
                learning_rate=0.1,
                random_state=42,
            ),
        ),
    ]
)


In [13]:
# Fit the whole pipeline (encoder + regressor) on the training split only.
# NOTE(review): X_test / y_test are never used in the cells shown here, and the
# execution counter jumps from 13 to 15 — consider scoring on the held-out
# split before the model is saved.
model.fit(X_train, y_train)

In [15]:
# Persist the fitted pipeline to a file in the current working directory
joblib.dump(value=model, filename="salary_model.pkl")

['salary_model.pkl']