In [None]:
# ========================================
# STEP 1: IMPORTS
# ========================================
import os
import pickle
import re

import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import spacy
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from xgboost import XGBRegressor




In [None]:
# ========================================
# STEP 2: LOAD AND PREPROCESS DATA
# ========================================
# Fixed seed so the train/test split (and every metric computed from it)
# is reproducible under Restart & Run All.
RANDOM_STATE = 42
NUMERIC_COLS = ["Age", "Years of Experience"]

df = pd.read_csv("data/salary_data.csv")
# Drop incomplete rows and exact duplicates without mutating in place.
df = df.dropna().drop_duplicates()

# Encode categorical
cat_cols = ["Gender", "Education", "Job Title"]
df = pd.get_dummies(df, columns=cat_cols)

# Features and Target
X = df.drop("Salary", axis=1)
y = df["Salary"]

# Save encoder columns (the exact feature order the model is trained on)
encoder_columns = X.columns.tolist()

# Split BEFORE scaling: fitting the scaler on the full dataset would leak
# test-set statistics (mean/std) into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
# Work on explicit copies so the column assignments below do not write
# into views of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numeric: fit on the training rows only, then apply the same
# transform to the held-out rows. Note that `df` and `X` keep the raw,
# unscaled numeric values; only X_train / X_test are scaled.
scaler = StandardScaler()
X_train[NUMERIC_COLS] = scaler.fit_transform(X_train[NUMERIC_COLS])
X_test[NUMERIC_COLS] = scaler.transform(X_test[NUMERIC_COLS])


In [None]:
# ========================================
# STEP 3: ML MODEL - XGBoost
# ========================================
# Train a default-configured XGBoost regressor on the training split;
# fit() returns the estimator itself, so it can be bound in one step.
xgb = XGBRegressor().fit(X_train, y_train)

# Score on the held-out split.
y_pred_xgb = xgb.predict(X_test)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
xgb_r2 = r2_score(y_test, y_pred_xgb)

print("XGBoost MAE:", xgb_mae)
print("XGBoost R2 Score:", xgb_r2)

In [None]:
# ========================================
# STEP 4: DL MODEL - Neural Network
# ========================================
# Seed TensorFlow so weight initialisation and batch shuffling are repeatable.
tf.random.set_seed(42)

# The one-hot columns produced by pd.get_dummies may be bool (the default in
# recent pandas releases) next to float numeric columns; converting such a
# mixed frame to NumPy yields an object array that Keras cannot turn into a
# tensor. Cast everything to one float dtype up front.
X_train_nn = X_train.astype("float32")
X_test_nn = X_test.astype("float32")

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_nn.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mae')
# NOTE(review): the test split doubles as the validation set here, so the
# reported val_loss is not an independent estimate if it is ever used for
# tuning or early stopping.
model.fit(X_train_nn, y_train, epochs=15, validation_data=(X_test_nn, y_test))

In [None]:
# ========================================
# STEP 5: SHAP EXPLAINABILITY (ML MODEL)
# ========================================
# Build an explainer for the fitted XGBoost model, using the held-out
# features as the background (masker) data.
explainer = shap.Explainer(model=xgb, masker=X_test)

# Explain every held-out row and summarise per-feature impact.
shap_values = explainer(X_test)
shap.plots.beeswarm(shap_values)

In [None]:
# ========================================
# STEP 6: RESUME PARSING (DUMMY)
# ========================================
SPACY_MODEL = "en_core_web_sm"

# Load the pipeline if it is already installed; only hit the network when
# spaCy reports (via OSError) that the model package cannot be found.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

def extract_text_from_resume(file_path):
    """Return the text of every page of the document at ``file_path``.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        str: The per-page text concatenated in page order, with no separator.
    """
    # Context manager guarantees the document handle is closed even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

def parse_resume(text):
    """Placeholder parser: ignores ``text`` and returns a fixed profile.

    Args:
        text: Resume text (currently unused).

    Returns:
        dict: Hard-coded values for Age, Gender, Education, Job Title and
        Years of Experience.
    """
    placeholder_fields = (
        ("Age", 25),
        ("Gender", "Other"),
        ("Education", "Bachelor's"),
        ("Job Title", "Software Engineer"),
        ("Years of Experience", 2),
    )
    return dict(placeholder_fields)


In [None]:
# ========================================
# STEP 7: FAIRNESS CHECK (GENDER BIAS)
# ========================================
def check_gender_fairness(model, X, col_prefix="Gender_"):
    """Report the mean model prediction for each one-hot gender group.

    Args:
        model: Object exposing ``predict(frame)``.
        X: Feature DataFrame containing one-hot columns named with
            ``col_prefix``.
        col_prefix: Prefix that identifies the group indicator columns.

    Returns:
        dict: Maps each indicator column that has at least one member row
        to the mean of the model's predictions for those rows. Columns with
        no member rows are omitted.
    """
    report = {}
    for column in X.columns:
        if not column.startswith(col_prefix):
            continue
        # Rows belonging to this group are the ones flagged with 1/True.
        members = X.loc[X[column] == 1]
        if members.empty:
            continue
        report[column] = np.mean(model.predict(members))
    return report

# Mean XGBoost prediction per gender group, computed on the held-out rows.
fairness_report = check_gender_fairness(model=xgb, X=X_test)
print("Fairness Check (by Gender):", fairness_report)

In [None]:
# ========================================
# STEP 8: SKILL RECOMMENDATION
# ========================================
def recommend_skills(user_skills, top_skills):
    """Return the skills from ``top_skills`` that the user does not have.

    Unlike a plain set difference, the result keeps the order of
    ``top_skills`` (so a ranked list stays ranked) and is identical from
    run to run.

    Args:
        user_skills: Iterable of skills the user already has.
        top_skills: Iterable of target skills, in priority order.

    Returns:
        list: Skills in ``top_skills`` missing from ``user_skills``, each
        listed once, in first-appearance order.
    """
    seen = set(user_skills)
    missing = []
    for skill in top_skills:
        if skill not in seen:
            seen.add(skill)  # also drops repeats within top_skills
            missing.append(skill)
    return missing

# Small worked example: which of the target skills is the sample user missing?
sample_user_skills = ["python", "sql"]
top_skills = ["python", "sql", "ml", "dl", "excel", "aws"]

suggested_skills = recommend_skills(sample_user_skills, top_skills)
print("Suggested skills:", suggested_skills)

In [None]:
# ========================================
# STEP 9: SAVE MODELS
# ========================================
MODEL_DIR = "models"
# Create the output directory on a fresh checkout; open(..., "wb") would
# otherwise fail with FileNotFoundError.
os.makedirs(MODEL_DIR, exist_ok=True)

# Everything the app needs at inference time: the fitted model, the fitted
# scaler, and the exact feature column order used for training.
artifacts = {
    "model.pkl": xgb,
    "scaler.pkl": scaler,
    "encoder_columns.pkl": encoder_columns,
}
for filename, artifact in artifacts.items():
    with open(f"{MODEL_DIR}/{filename}", "wb") as f:
        pickle.dump(artifact, f)

print("✅ Models Saved Successfully")


In [None]:
import streamlit as st
import pandas as pd
import pickle
import spacy
import fitz  # PyMuPDF
import re

# Page setup: browser-tab title and centered layout, followed by the visible
# heading and a one-line description of what the app does.
# NOTE(review): Streamlit's documentation asks for set_page_config to be
# called before other st.* commands — keep it first; confirm for the
# installed Streamlit version.
st.set_page_config(page_title="Salary Predictor", layout="centered")
st.title("💼 Employee Salary Predictor")
st.write("Enter your details or upload a resume to get an estimated salary and skill suggestions.")

# Load ML model and preprocessing tools
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes (or fails).

    SECURITY: pickle can execute arbitrary code while loading. Only point
    this at artifact files produced by this project's own training run.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


model = _load_pickle("models/model.pkl")
scaler = _load_pickle("models/scaler.pkl")
encoder_columns = _load_pickle("models/encoder_columns.pkl")
nlp = spacy.load("en_core_web_sm")

# Skill set for recommendations
recommended_skills = ['python', 'sql', 'ml', 'dl', 'aws', 'excel', 'tableau', 'pandas']

# Dummy skill extraction
def extract_skills(text):
    """Return the known skills mentioned in ``text``.

    The text is lower-cased and tokenised with the module-level ``nlp``
    pipeline; a token counts as a skill when it exactly equals an entry of
    ``recommended_skills``.

    Args:
        text: Free text to scan.

    Returns:
        list: Each matched skill once, in order of first appearance (a
        stable order, unlike ``list(set(...))``).
    """
    vocabulary = set(recommended_skills)
    found = []
    for token in nlp(text.lower()):
        word = token.text
        if word in vocabulary and word not in found:
            found.append(word)
    return found

# PDF text extraction for an uploaded (in-memory) resume
def extract_text_from_resume(file):
    """Return the text of every page of an uploaded PDF.

    Args:
        file: Binary file-like object holding the PDF bytes, e.g. the
            object returned by ``st.file_uploader``.

    Returns:
        str: The per-page text concatenated in page order, with no separator.
    """
    # Prefer getvalue() when the buffer offers it: it returns the complete
    # contents regardless of the current cursor position, so a buffer that
    # has already been read once is not seen as empty. Fall back to read()
    # for plain file objects.
    payload = file.getvalue() if hasattr(file, "getvalue") else file.read()
    # Context manager closes the document even if extraction raises.
    with fitz.open(stream=payload, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

def parse_resume(text):
    """Guess profile fields from resume text using simple keyword rules.

    Starts from fixed defaults and overrides a field only when its rule
    matches:

    * Years of Experience - first "<digits>[+] years|yrs [of] experience".
    * Job Title - first entry of a fixed title list found in the text.
    * Education - "master" wins over "high school".
    * Gender - the words she/her win over he/him.

    Args:
        text: Resume text.

    Returns:
        dict: Age, Gender, Education, Job Title and Years of Experience.
    """
    profile = {
        "Age": 25,
        "Gender": "Other",
        "Education": "Bachelor's",
        "Job Title": "Software Engineer",
        "Years of Experience": 2
    }

    experience = re.search(r"(\d+)\+?\s+(years|yrs)\s+(of\s+)?experience", text, re.I)
    if experience is not None:
        profile["Years of Experience"] = int(experience.group(1))

    # All remaining rules are case-insensitive substring / word checks.
    lowered = text.lower()

    known_titles = ("Data Scientist", "Software Engineer", "UX Designer", "Web Developer", "Analyst")
    matched_title = next((title for title in known_titles if title.lower() in lowered), None)
    if matched_title is not None:
        profile["Job Title"] = matched_title

    # Ordered rules: the first keyword present decides the level.
    for keyword, level in (("master", "Master's"), ("high school", "High School")):
        if keyword in lowered:
            profile["Education"] = level
            break

    # Ordered rules: feminine pronouns are checked before masculine ones.
    for pronoun_pattern, label in ((r'\b(she|her)\b', "Female"), (r'\b(he|him)\b', "Male")):
        if re.search(pronoun_pattern, lowered):
            profile["Gender"] = label
            break

    return profile

# Convert to encoded + scaled DataFrame
def prepare_input(data):
    """Turn one raw profile dict into a model-ready, single-row DataFrame.

    The row is one-hot encoded, aligned to the training feature layout in
    the module-level ``encoder_columns``, and its numeric columns are
    scaled with the module-level ``scaler``.

    Args:
        data: Mapping of raw field name to value for a single person.

    Returns:
        pandas.DataFrame: One row whose columns are exactly
        ``encoder_columns``, in that order.
    """
    encoded = pd.get_dummies(pd.DataFrame([data]))

    # A single reindex adds training columns missing from this row (filled
    # with 0), drops columns the model never saw, and fixes the column
    # order. It replaces inserting columns one at a time, which can trigger
    # pandas' fragmentation PerformanceWarning on wide one-hot layouts.
    # NOTE: a category value unseen in training is silently dropped here,
    # leaving all of that feature's indicator columns at 0.
    encoded = encoded.reindex(columns=encoder_columns, fill_value=0)

    numeric_cols = ["Age", "Years of Experience"]
    encoded[numeric_cols] = scaler.transform(encoded[numeric_cols])
    return encoded

# Prediction + skill recommendation
def predict_salary(data, resume_text=None):
    """Predict a salary and derive skill suggestions for one profile.

    Args:
        data: Raw profile dict, passed to ``prepare_input``.
        resume_text: Optional resume text. When it is empty or ``None`` no
            skills are extracted.

    Returns:
        tuple: ``(salary, user_skills, suggestions)`` where ``salary`` is
        the first element of ``model.predict``'s output, ``user_skills`` is
        the list returned by ``extract_skills`` (empty without resume text),
        and ``suggestions`` are the entries of ``recommended_skills`` the
        user lacks.
    """
    user_input = prepare_input(data)
    salary = model.predict(user_input)[0]

    user_skills = extract_skills(resume_text) if resume_text else []

    # Keep the order of recommended_skills rather than using a set
    # difference, so the suggestions are shown in a stable order.
    owned = set(user_skills)
    suggestions = []
    for skill in recommended_skills:
        if skill not in owned:
            owned.add(skill)  # guards against repeats in recommended_skills
            suggestions.append(skill)

    return salary, user_skills, suggestions

# Upload or Manual Input
# Two side-by-side columns: the left one holds the PDF uploader, the right
# one a checkbox that switches on manual entry.
col1, col2 = st.columns(2)

with col1:
    # Restricted to files with a .pdf extension via `type`.
    uploaded_file = st.file_uploader("📎 Upload Resume (PDF)", type=["pdf"])

with col2:
    # Only consulted further down when no file has been uploaded.
    use_manual = st.checkbox("Or fill manually")

# Stays empty / None until one of the two input paths below supplies data.
resume_text = None
user_data = {}

if uploaded_file:
    # An uploaded resume takes precedence over the manual form.
    resume_text = extract_text_from_resume(uploaded_file)
    user_data = parse_resume(resume_text)
    st.success("✅ Resume parsed successfully.")
elif use_manual:
    # Dict entries are evaluated top to bottom, so the widgets render in
    # this order.
    user_data = {
        "Age": st.slider("Age", 20, 60, 25),
        "Gender": st.selectbox("Gender", ["Male", "Female", "Other"]),
        "Education": st.selectbox("Education Level", ["High School", "Bachelor's", "Master's"]),
        "Job Title": st.selectbox("Job Title", ["Software Engineer", "Data Scientist", "Web Developer", "UX Designer", "Business Analyst"]),
        "Years of Experience": st.slider("Years of Experience", 0, 40, 2),
    }

# Predict Button
# The button is only rendered once some input exists: `and` short-circuits,
# so st.button is not called while user_data is empty.
if user_data and st.button("🚀 Predict Salary"):
    salary, user_skills, suggestions = predict_salary(user_data, resume_text)

    st.subheader(f"💰 Predicted Salary: ₹{salary:,.2f}")

    # Skill feedback is only meaningful when a resume supplied the text.
    if resume_text:
        st.markdown(f"**Skills Found:** {', '.join(user_skills) or 'None'}")
        st.markdown(f"**Recommended Skills to Improve:** {', '.join(suggestions) or 'None'}")

# Optional: Explanation toggle
# Unchecked by default. Ticking it only shows an informational message;
# no SHAP values are computed or plotted in this app.
if st.checkbox("🔍 Show Explanation (SHAP - Optional)", value=False):
    st.info("This feature is better visualized in notebook for now due to SHAP plots.")
