# In [None]:  (Jupyter notebook cell marker left over from the export)
import os
import joblib
import sqlite3
import pandas as pd
import polars as pl
import numpy as np
import optuna
import shap
import mlflow
import altair as alt
import uvicorn
import dvc.api
import streamlit as st
from fastapi import FastAPI
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.calibration import CalibratedClassifierCV
import pandera as pa
from pandera.typing import DataFrame
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from google.colab import drive
from concurrent.futures import ThreadPoolExecutor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
import nest_asyncio
from sdv.tabular import GaussianCopula

# MLflow configuration: point the tracking URI at "mlruns" (a relative path)
# and select the "salary_prediction" experiment. The tracking URI is set first
# so the experiment is resolved against that store.
mlflow.set_tracking_uri("mlruns")
mlflow.set_experiment("salary_prediction")

# Data loading and validation
def load_and_validate_data(path="salaries_clean.csv"):
    """Load the salary dataset and normalise its schema.

    The CSV is read with polars and converted to a pandas DataFrame. The
    human-readable headers are renamed to snake_case, the presence of every
    column used downstream is checked, and the two numeric feature columns are
    cast to ``int64``.

    Args:
        path: Location of the CSV file. Defaults to ``"salaries_clean.csv"``,
            the path that was previously hard-coded.

    Returns:
        A pandas DataFrame containing at least the columns ``age``,
        ``gender``, ``education_level``, ``job_title``, ``years_experience``
        and ``salary``.

    Raises:
        KeyError: If any of the required columns is absent after renaming.
    """
    column_names = {
        "Age": "age",
        "Gender": "gender",
        "Education Level": "education_level",
        "Job Title": "job_title",
        "Years of Experience": "years_experience",
        "Salary": "salary",
    }
    df = pl.read_csv(path).to_pandas().rename(columns=column_names)

    # Fail here with one clear message instead of a bare KeyError on the first
    # missing column somewhere further down the pipeline.
    missing = [name for name in column_names.values() if name not in df.columns]
    if missing:
        raise KeyError(f"{path} is missing required column(s): {missing}")

    # NOTE: the integer cast raises on missing values and drops any fractional
    # part (e.g. 1.5 years of experience becomes 1).
    df["age"] = df["age"].astype("int64")
    df["years_experience"] = df["years_experience"].astype("int64")
    return df

# Synthetic data generation
def generate_synthetic_data(output_path="synthetic_salaries.csv", num_samples=1000):
    """Fit a GaussianCopula on the real dataset and sample synthetic rows.

    Args:
        output_path: CSV file the sampled rows are written to (without the
            index column).
        num_samples: Number of rows requested from the fitted model.

    Returns:
        The sampled data, as returned by ``GaussianCopula.sample``.
    """
    real_salaries = load_and_validate_data()

    copula = GaussianCopula()
    copula.fit(real_salaries)

    sampled_rows = copula.sample(num_samples)
    sampled_rows.to_csv(output_path, index=False)
    return sampled_rows

# Data preprocessing
def preprocess_data(df):
    """Split *df* into train/test sets and encode the feature columns.

    The split is done BEFORE the preprocessor is fitted: the scaler statistics
    and the one-hot categories are learned from the training rows only and
    then applied to the test rows. Fitting on the full dataset first (as was
    done previously) leaks information from the test set into the features.

    Args:
        df: DataFrame with the columns ``age``, ``gender``,
            ``education_level``, ``job_title``, ``years_experience`` and
            ``salary``.

    Returns:
        A list ``[X_train, X_test, y_train, y_test]``. The ``X`` entries are
        the transformed feature arrays (scaled numeric columns followed by the
        one-hot encoded categorical columns); the ``y`` entries are the
        matching ``salary`` values. The split uses ``test_size=0.2`` and
        ``random_state=42``.
    """
    numeric_features = ['age', 'years_experience']
    categorical_features = ['gender', 'education_level', 'job_title']

    X = df[['age', 'gender', 'education_level', 'job_title', 'years_experience']]
    y = df['salary']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    preprocessor = ColumnTransformer([
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])
    X_train_transformed = preprocessor.fit_transform(X_train)
    # Categories that only occur in the test rows are encoded as all-zero
    # columns because of handle_unknown='ignore'.
    X_test_transformed = preprocessor.transform(X_test)
    return [X_train_transformed, X_test_transformed, y_train, y_test]

# Model training with MLflow tracking
_FEATURE_COLUMNS = ['age', 'gender', 'education_level', 'job_title', 'years_experience']


def _build_preprocessor():
    """Return an unfitted transformer: scale numeric columns, one-hot encode categorical ones."""
    return ColumnTransformer([
        ("num", StandardScaler(), ['age', 'years_experience']),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['gender', 'education_level', 'job_title'])
    ])


def _mean_cv_r2(estimator, X, y):
    """Return the mean R2 of *estimator* over a 3-fold cross-validation."""
    return cross_val_score(estimator, X, y, scoring="r2", cv=3).mean()


def train_and_compare_models():
    """Cross-validate the candidate regressors and fit the best one.

    Every candidate is wrapped in a ``Pipeline`` together with its own
    preprocessor, so that

    * scaling and one-hot encoding are re-fitted inside each CV fold (no
      leakage from validation rows), and
    * the returned / persisted model accepts the raw feature columns, which is
      what the prediction endpoint sends.

    The mean R2 of each candidate is logged to MLflow as ``R2_<name>``. The
    best candidate is fitted on the training split, dumped to
    ``salary_model.pkl`` and logged to the same MLflow run.

    Returns:
        A tuple ``(best_model, X_test, y_test)`` where ``best_model`` is the
        fitted pipeline, ``X_test`` is the held-out DataFrame of raw feature
        columns and ``y_test`` the matching salaries, so that
        ``best_model.predict(X_test)`` works directly.

    Raises:
        RuntimeError: If no candidate produced a comparable (non-NaN) score.
    """
    df = load_and_validate_data()
    X = df[_FEATURE_COLUMNS]
    y = df['salary']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    regressors = {
        # random_state added so this candidate is reproducible like the others.
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, n_jobs=-1),
        "LightGBM": LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, n_jobs=-1),
        "Neural Network": MLPRegressor(hidden_layer_sizes=(50, 25), max_iter=300, random_state=42)
    }
    models = {
        name: Pipeline([("preprocessor", _build_preprocessor()), ("regressor", regressor)])
        for name, regressor in regressors.items()
    }

    best_model = None
    best_score = float('-inf')

    with mlflow.start_run():
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(_mean_cv_r2, candidate, X_train, y_train): name
                for name, candidate in models.items()
            }
            for future, name in futures.items():
                score = future.result()
                mlflow.log_metric(f"R2_{name}", score)
                if score > best_score:
                    best_score = score
                    best_model = models[name]

        # A NaN score never compares greater than -inf, so best_model can
        # still be None here if every cross-validation failed.
        if best_model is None:
            raise RuntimeError("No candidate model produced a valid cross-validation score")

        # Fit, persist and log inside the run so the model artifact is stored
        # next to the metrics it was selected by.
        best_model.fit(X_train, y_train)
        joblib.dump(best_model, "salary_model.pkl")
        mlflow.sklearn.log_model(best_model, "best_model")
    return best_model, X_test, y_test


# FastAPI application; every prediction is also stored in SQLite
app = FastAPI()
model, X_test, y_test = train_and_compare_models()


@app.post("/predict/")
def predict_salary(age: int, gender: str, education_level: str, job_title: str, years_experience: int):
    """Predict a salary for one profile and record the request in SQLite.

    Args:
        age: Age in years.
        gender: Gender label, as used in the training data.
        education_level: Education level label, as used in the training data.
        job_title: Job title label, as used in the training data.
        years_experience: Years of professional experience.

    Returns:
        A dict ``{"predicted_salary": <value rounded to 2 decimals>}``.
    """
    # The pipeline selects columns by name, so the input must be a DataFrame
    # with the same column names as the training data.
    X_input = pd.DataFrame(
        [[age, gender, education_level, job_title, years_experience]],
        columns=_FEATURE_COLUMNS,
    )
    # Convert the NumPy scalar (e.g. float32 from XGBoost) to a plain Python
    # float before binding it as an SQL parameter and returning it as JSON.
    predicted_salary = float(model.predict(X_input)[0])

    conn = sqlite3.connect("salaries.db")
    try:
        # "with conn" commits on success and rolls back on error; it does not
        # close the connection, hence the surrounding try/finally.
        with conn:
            conn.execute('''
        CREATE TABLE IF NOT EXISTS salary_predictions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            age INTEGER,
            gender TEXT,
            education_level TEXT,
            job_title TEXT,
            years_experience INTEGER,
            predicted_salary REAL
        )
    ''')
            conn.execute('''
        INSERT INTO salary_predictions (age, gender, education_level, job_title, years_experience, predicted_salary)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', (age, gender, education_level, job_title, years_experience, predicted_salary))
    finally:
        conn.close()

    return {"predicted_salary": round(predicted_salary, 2)}

# Visualisation with Altair
def visualize_data(df):
    """Save an interactive experience-vs-salary scatter plot as HTML.

    Points are coloured by education level and show experience, salary and
    education level in their tooltip. The chart is written to
    ``salary_scatter.html``; nothing is returned.

    Args:
        df: Data with ``years_experience``, ``salary`` and ``education_level``
            columns.
    """
    x_channel = alt.X("years_experience:Q", title="Años de Experiencia")
    y_channel = alt.Y("salary:Q", title="Salario")
    colour_channel = alt.Color("education_level:N", title="Nivel Educativo")
    tooltip_fields = ["years_experience", "salary", "education_level"]

    scatter = alt.Chart(df).mark_circle(size=60)
    scatter = scatter.encode(
        x_channel,
        y_channel,
        colour_channel,
        tooltip=tooltip_fields,
    )
    scatter.interactive().save("salary_scatter.html")

# Start the server
if __name__ == "__main__":
    # nest_asyncio.apply() is called before uvicorn.run(); presumably so the
    # server can be started from an environment that already has a running
    # event loop (such as a notebook) — TODO confirm.
    nest_asyncio.apply()
    # Serve the FastAPI app on localhost:8000 with a single worker.
    uvicorn.run(app, host="127.0.0.1", port=8000, workers=1)
