# Salary Prediction Model

This notebook trains a Random Forest regressor on synthetically generated data to predict salary from years of experience, education level, job type, city tier, and number of skills.

## Features Used
- Years of Experience
- Education Level (High School, Bachelor, Master, PhD)
- Job Type (IT, Healthcare, Finance, Engineering, Sales, Marketing)
- City Tier (1, 2, 3)
- Number of Skills

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

## Generate Synthetic Data

In [None]:
# Seed the global NumPy RNG so the synthetic dataset is reproducible.
np.random.seed(42)
n_samples = 3000

# Lookup tables that drive the salary formula.
job_base = {
    "it": 80000,
    "healthcare": 90000,
    "finance": 85000,
    "engineering": 75000,
    "sales": 60000,
    "marketing": 65000,
}
job_types = list(job_base)
edu_multiplier = {"high_school": 0.7, "bachelor": 1.0, "master": 1.3, "phd": 1.6}
city_tier_mult = {1: 1.3, 2: 1.0, 3: 0.7}


def _make_record():
    """Draw one synthetic employee and return it as a row dict.

    The random draws happen in a fixed order (experience, education, job,
    city tier, skills, noise) so the seeded sequence stays reproducible.
    """
    years = np.random.uniform(0, 20)
    level = np.random.choice(list(edu_multiplier))
    role = np.random.choice(job_types)
    tier = np.random.choice(list(city_tier_mult))
    skill_count = np.random.randint(0, 10)  # upper bound is exclusive: 0..9

    # Base pay scaled by education and city, grown 5% per year of experience,
    # plus a flat bonus of 3000 per skill.
    scaled_base = job_base[role] * edu_multiplier[level] * city_tier_mult[tier]
    pay = scaled_base * (1 + 0.05 * years) + skill_count * 3000
    # Gaussian noise whose standard deviation is 10% of the noiseless salary.
    pay = pay + np.random.normal(0, pay * 0.1)

    return {
        "experience": years,
        "education": level,
        "job_type": role,
        "city_tier": tier,
        "n_skills": skill_count,
        "salary": pay,
    }


salary_data = [_make_record() for _ in range(n_samples)]
df = pd.DataFrame(salary_data)

# Integer-encode the categorical columns: education follows its natural order,
# job type uses its position in job_types.
edu_map = {"high_school": 0, "bachelor": 1, "master": 2, "phd": 3}
job_map = {name: idx for idx, name in enumerate(job_types)}

df["edu_encoded"] = df["education"].map(edu_map)
df["job_encoded"] = df["job_type"].map(job_map)
df.head()

## Train Model

In [None]:
# Feature matrix (numeric and encoded columns) and the regression target.
feature_columns = ["experience", "edu_encoded", "job_encoded", "city_tier", "n_skills"]
X = df[feature_columns]
y = df["salary"]

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits so no test information leaks into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 200 trees, each limited to depth 10; seeded for reproducible fits.
model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train_scaled, y_train)

# Score the held-out split.
y_pred = model.predict(X_test_scaled)
test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_mae = mean_absolute_error(y_test, y_pred)

print("Model Performance:")
print(f"R² Score: {test_r2:.4f}")
print(f"RMSE: ₹{test_rmse:,.0f}")
print(f"MAE: ₹{test_mae:,.0f}")

## Feature Importance

In [None]:
# Display labels, listed in the same order as the columns of X that the model
# was fitted on, so each label lines up with its importance score.
feature_names = ["Experience", "Education", "Job Type", "City Tier", "Skills Count"]
importance = model.feature_importances_

# Sample one colour per bar from the palette explicitly. Passing `palette=`
# to sns.barplot without `hue=` is deprecated in seaborn 0.13 and scheduled
# for removal, while the `hue=..., legend=False` replacement is not accepted
# by older releases; drawing the bars with matplotlib works on every version.
bar_colors = sns.color_palette("coolwarm", n_colors=len(feature_names))

plt.figure(figsize=(10, 6))
plt.barh(feature_names, importance, color=bar_colors)
# barh stacks categories bottom-up; flip the axis so the first feature is on top.
plt.gca().invert_yaxis()
plt.title("Feature Importance - Salary Prediction")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()