# In [None]:
#%%
import pandas as pd
import numpy as np
#import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from scikeras.wrappers import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_absolute_error
from typing import Tuple
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler, FunctionTransformer
import warnings
warnings.filterwarnings('ignore')

def bathrooms_featurizer(df: pd.DataFrame) -> pd.Series:
    """Encode the ``bagni`` column as an ordinal bucket.

    A value of 1 becomes 0 and a value of 2 becomes 1. Every entry the
    lookup table does not cover (missing entries included) becomes 2.
    """
    known_codes = {1: 0, 2: 1}
    encoded = df['bagni'].map(known_codes)
    # Anything outside the lookup table comes back as NaN from ``map``.
    return encoded.fillna(2)

def rooms_featurizer(df: pd.DataFrame) -> pd.Series:
    """Encode the ``stanze`` column as an ordinal bucket.

    Values 1, 2 and 3 become 0, 1 and 2 respectively. Every entry the
    lookup table does not cover (missing entries included) becomes 3.
    """
    known_codes = {1: 0, 2: 1, 3: 2}
    encoded = df['stanze'].map(known_codes)
    # Anything outside the lookup table comes back as NaN from ``map``.
    return encoded.fillna(3)

# Output column name -> function that builds that column from a DataFrame.
CUSTOM_FEATURE_FUNCTIONS = dict(
    bathrooms_feature=bathrooms_featurizer,
    rooms_feature=rooms_featurizer,
)

class PandasFeatureEngineer(BaseEstimator, TransformerMixin):
    """Build a DataFrame of engineered features from named functions.

    Parameters
    ----------
    functions : dict[str, callable] or None
        Maps an output column name to a function that takes the input
        DataFrame and returns that column. When ``None``, the class-level
        ``CUSTOM_FEATURE_FUNCTIONS`` mapping is used.
    """

    # Default featurizers, used when no ``functions`` mapping is supplied.
    CUSTOM_FEATURE_FUNCTIONS = {
        "bathrooms_feature": bathrooms_featurizer,
        "rooms_feature": rooms_featurizer,
    }

    def __init__(self, functions=None):
        self.functions = functions

    def fit(self, df, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, df):
        """Apply every configured function to ``df``.

        Returns
        -------
        pd.DataFrame
            One column per configured function, on the same index as ``df``.
            Only the engineered columns are returned, not the input columns.

        Raises
        ------
        TypeError
            If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a DataFrame.")

        # ``functions=None`` is the constructor default; fall back to the
        # class-level mapping instead of failing on ``None.items()``.
        functions = self.functions
        if functions is None:
            functions = self.CUSTOM_FEATURE_FUNCTIONS

        # Start from the input index so the row count is preserved even when
        # the mapping is empty.
        df_transformed = pd.DataFrame(index=df.index)
        for func_name, func in functions.items():
            df_transformed[func_name] = func(df)

        return df_transformed


class PandasColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the configured columns of a DataFrame.

    Parameters
    ----------
    columns : column label(s) used to index the DataFrame in ``transform``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, df, y=None, **fit_params):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` restricted to ``self.columns``."""
        subset = df[self.columns]
        return subset.copy()
    

def build_model(
    *,
    num_features: int,
    num_labels: int,
    hidden_units: Tuple[int, ...],
    dropout_rate: float | None = None,
    **kwargs,
) -> Model:
    """Build and compile a fully connected Keras network.

    Parameters
    ----------
    num_features : width of the input layer.
    num_labels : width of the linear output layer.
    hidden_units : one entry per hidden ReLU ``Dense`` layer, giving its size.
    dropout_rate : when truthy, a ``Dropout`` layer with this rate follows
        every hidden layer; ``None`` or ``0`` disables dropout.
    **kwargs : must contain ``"compile_kwargs"``, a dict that is unpacked
        into ``Model.compile``; other entries are ignored.

    Returns
    -------
    Model
        The compiled model.

    Raises
    ------
    KeyError
        If ``"compile_kwargs"`` is missing from ``kwargs``.
    """
    inputs = Input(shape=(num_features,), name="features")

    # Stack the hidden layers, threading the running tensor through each one.
    tensor = inputs
    for layer_idx, width in enumerate(hidden_units):
        tensor = Dense(width, activation="relu", name=f"dense_{layer_idx}")(tensor)
        if not dropout_rate:
            continue
        tensor = Dropout(dropout_rate, name=f"dropout_{layer_idx}")(tensor)

    outputs = Dense(num_labels, name="labels")(tensor)

    network = Model(inputs, outputs)
    compile_options = kwargs["compile_kwargs"]
    network.compile(**compile_options)
    return network


class KerasRegressorWrapper(KerasRegressor):
    """``KerasRegressor`` subclass with a standardising target encoder and an MAE scorer.

    Parameters
    ----------
    my_param : stored on the instance as ``my_param``; not used by this class.
    **kwargs : forwarded unchanged to ``KerasRegressor.__init__``.
    """

    def __init__(self, my_param=None, **kwargs):
        super().__init__(**kwargs)
        self.my_param = my_param

    @property
    def target_encoder(self) -> BaseEstimator:
        """Return a new, unfitted ``StandardScaler`` on every access."""
        scaler = StandardScaler()
        return scaler

    @staticmethod
    def scorer(y_true, y_pred, **kwargs) -> float:
        """Return the mean absolute error; extra keyword arguments are ignored."""
        mae = mean_absolute_error(y_true, y_pred)
        return mae


# A simple transformer that reshapes a 1D target to 2D (and back)
class TargetReshaper(BaseEstimator, TransformerMixin):
    """Reshape a one-dimensional target into a single-column 2D array.

    The transformer is stateless. ``transform`` and ``inverse_transform``
    accept anything ``numpy.asarray`` can convert (pandas Series, NumPy
    arrays, lists), not only objects exposing ``.values``.
    """

    def fit(self, y=None, _ignored=None, **fit_params):
        """Return ``self`` unchanged; nothing is learned.

        ``_ignored`` and ``fit_params`` exist only so that a call of the
        form ``fit(data, target)`` does not raise ``TypeError``.
        """
        return self

    def transform(self, y):
        """Return ``y`` as an array of shape ``(n, 1)``."""
        # ``np.asarray`` also covers inputs without a ``.values`` attribute.
        return np.asarray(y).reshape(-1, 1)

    def inverse_transform(self, y):
        """Return ``y`` flattened to a one-dimensional array."""
        return np.asarray(y).ravel()



# Load the rents table, drop rows lacking a price or a surface value, then
# keep only rows whose price is below 10000.
df = (
    pd.read_parquet('../dataframes/rents_clean.parquet')
    .dropna(subset=['prezzo', 'superficie'])
    .loc[lambda frame: frame['prezzo'] < 10000]
)

# Column groups fed to the preprocessing pipelines.
numerical_features = ['superficie']
categorical_features = [
    'posti auto',
    'bagni',
    'stanze',
    'ultimo piano',
    'stato',
    'classe energetica',
    'riscaldamento centralizzato',
    'arredato',
    'balcone',
    'esposizione esterna',
    'fibra ottica',
    'cancello elettrico',
    'cantina',
    'giardino comune',
    'giardino privato',
    'piscina',
    'villa',
    'intera proprieta',
    'appartamento',
    'attico',
    'loft',
    'mansarda',
]


all_features = [*numerical_features, *categorical_features]

# Shared transformer instances referenced by the pipelines in this script.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')
hot_encoder = OneHotEncoder(handle_unknown='ignore')
selector = PandasColumnSelector()
featurizer = PandasFeatureEngineer(functions=CUSTOM_FEATURE_FUNCTIONS)


# Numerical pipeline: select the surface column, impute with the mean, standardise.
numerical_pipeline = Pipeline(
    steps=[
        ('selector', PandasColumnSelector(columns=['superficie'])),
        ('imputer', num_imputer),
        ('scaler', StandardScaler()),
    ]
)


# Categorical pipeline. The column transformer emits the raw categorical
# columns plus the engineered bathroom and room buckets.
categorical_featurizer = ColumnTransformer(
    transformers=[
        (
            'selector',
            PandasColumnSelector(columns=categorical_features),
            categorical_features,
        ),
        (
            'bathrooms_featurizer',
            PandasFeatureEngineer(functions={"bathrooms_feature": bathrooms_featurizer}),
            ['bagni'],
        ),
        (
            'rooms_featurizer',
            PandasFeatureEngineer(functions={"rooms_feature": rooms_featurizer}),
            ['stanze'],
        ),
    ],
)

# Featurize, fill gaps with the most frequent value, then one-hot encode.
categorical_pipeline = Pipeline(
    steps=[
        ('categorical_featurizer', categorical_featurizer),
        ('imputer', cat_imputer),
        ('encoder', hot_encoder),
    ]
)


# Final preprocessor: route each column group to its pipeline and drop every
# column that is not listed.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, ['superficie']),
        ('categorical', categorical_pipeline, categorical_features),
    ],
    remainder='drop',
)


# Target pipeline: reshape the target to a single column, then standardise it.
target_pipeline = Pipeline(
    steps=[
        ('reshaper', TargetReshaper()),
        ('scaler', StandardScaler()),
    ]
)

#%% CREATE X AND Y
# Features are every column except the price; the price is the target.
y = df['prezzo']
X = df.drop(columns=['prezzo'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)


#%% FIRST LAYER DEEP LEARNING DEFAULT PARAMS
# Preprocess first so the network's input width can be read from the data
# instead of being hard-coded. It was fixed at 73, which only holds for one
# particular set of one-hot encoded categories.
X_transformed = preprocessor.fit_transform(X_train)
y_transformed = target_pipeline.fit_transform(y_train)

BEST_PARAMS_FIRST_LAYER = {
    # NN architecture
    "model": build_model,
    "num_features": X_transformed.shape[1],
    "num_labels": 1,
    "hidden_units": (128,) * 10,
    "dropout_rate": 0.02,
    # Model compilation
    "optimizer": "adam",
    # Optional override, left disabled: "optimizer__learning_rate": 0.05
    "loss": "log_cosh",
    "metrics": ["mse", "mae"],
}

# Fit the wrapped network directly on the preprocessed arrays.
regressor = KerasRegressorWrapper(**BEST_PARAMS_FIRST_LAYER)
regressor.fit(X_transformed, y_transformed)


#%% MODEL PIPELINE NESTED 1 LEVEL
# Pass the build function as ``model`` (as BEST_PARAMS_FIRST_LAYER does);
# ``build_fn`` is the deprecated spelling of the same argument in scikeras.
regressor = KerasRegressorWrapper(model=build_model)

# ``model__*`` entries are routed to ``build_model``; ``loss`` is set on the
# wrapper itself.
BEST_PARAMS_SECOND_LAYER = {
    "model__num_features": 73,
    "model__num_labels": 1,
    "model__hidden_units": (128,) * 5,
    "loss": "log_cosh",
}

regressor.set_params(**BEST_PARAMS_SECOND_LAYER)

# Preprocessing and the network combined into one estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', regressor)
])

# Fit the target pipeline on the training target, then transform it.
y_transformer = target_pipeline.fit(y_train)
y_train_transformed = target_pipeline.transform(y_train)
pipeline.fit(X_train, y_train_transformed)

#%% INSPECT VALUES
# Dump every parameter currently set on the regressor.
current_params = regressor.get_params()
for key in current_params:
    value = current_params[key]
    print(f"{key}: {value}")


# Compare the regressor's current values with the intended second-layer ones.
for param in BEST_PARAMS_SECOND_LAYER:
    value = BEST_PARAMS_SECOND_LAYER[param]
    current_value = regressor.get_params().get(param)
    print(f"{param}: {current_value} (Expected: {value})")



#%% GRIDSEARCH
# Pass the build function as ``model`` (as BEST_PARAMS_FIRST_LAYER does);
# ``build_fn`` is the deprecated spelling of the same argument in scikeras.
regressor = KerasRegressorWrapper(model=build_model)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('modelL2', regressor)
])

# The pipeline holds a reference to ``regressor``, so parameters set here are
# seen by the pipeline as well.
regressor.set_params(**BEST_PARAMS_SECOND_LAYER)

param_grid = {
    'modelL2__model__hidden_units': [(128, 128, 128)],
    'modelL2__model__dropout_rate': [0.1],
    # ``optimizer`` is a wrapper-level parameter (see BEST_PARAMS_FIRST_LAYER).
    # Under ``model__`` it is only handed to ``build_model``, which compiles
    # from ``compile_kwargs`` and ignores it.
    'modelL2__optimizer': ['adam'],
}


grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           cv=2,
                           verbose=1,
                           n_jobs=-1,
                           scoring='neg_mean_absolute_error')

grid_search.fit(X_train, y_train_transformed)


#%% INSPECT VALUES
# Dump every parameter currently set on the regressor.
current_params = regressor.get_params()
for key in current_params:
    value = current_params[key]
    print(f"{key}: {value}")

# Compare the regressor's current values with the intended second-layer ones.
for param in BEST_PARAMS_SECOND_LAYER:
    value = BEST_PARAMS_SECOND_LAYER[param]
    current_value = regressor.get_params().get(param)
    print(f"{param}: {current_value} (Expected: {value})")

#%% GRIDSEARCH - COMPARE MODELS
# The final step is a placeholder; each grid entry swaps in its own estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model_placeholder', None)
])

param_grid = [
    {
        'model_placeholder': [RandomForestRegressor()],
        'model_placeholder__n_estimators': [50, 100],
    },
    {
        # The wrapper needs a model-building function and the fixed build
        # arguments (input width, number of labels). A bare
        # ``KerasRegressorWrapper()`` has neither and cannot be fitted.
        'model_placeholder': [
            KerasRegressorWrapper(model=build_model, **BEST_PARAMS_SECOND_LAYER)
        ],
        'model_placeholder__model__hidden_units': [(128, 128, 128)],
        'model_placeholder__model__dropout_rate': [0.1],
        # ``optimizer`` is a wrapper-level parameter; under ``model__`` it is
        # only handed to ``build_model``, which ignores it.
        'model_placeholder__optimizer': ['adam'],
    }
]

grid_search = GridSearchCV(estimator=pipeline,
                            param_grid=param_grid,
                            cv=2,
                            verbose=1,
                            n_jobs=-1,
                            scoring='neg_mean_absolute_error')

grid_search.fit(X_train, y_train_transformed)





#%%
#%% MODEL PIPELINE NESTED 2 LEVELS
# Pass the build function as ``model`` (as BEST_PARAMS_FIRST_LAYER does);
# ``build_fn`` is the deprecated spelling of the same argument in scikeras.
regressor = KerasRegressorWrapper(model=build_model)


pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('gridsearch', regressor)
])

BEST_PARAMS_SECOND_LAYER = {
    "model__num_features": 73,
    "model__num_labels": 1,
    "model__hidden_units": (128,) * 5,
    "loss": "log_cosh",
}

regressor.set_params(**BEST_PARAMS_SECOND_LAYER)

# Only parameters of existing steps may appear here. The previous
# ``'model': [...]`` entry named a step this pipeline does not have (its steps
# are 'preprocessor' and 'gridsearch'), which ``set_params`` rejects.
param_grid = [
    {
        "gridsearch__model__dropout_rate": [0.01, 0.02],
        "gridsearch__model__hidden_units": [(128,)*10, (128,)*5],
        # Further candidates to explore later: the wrapper-level optimizer,
        # its learning rate, and the loss.
    }
]

# Score with negated MAE, as the other grid searches in this script do.
# Without ``scoring`` the search falls back to the estimator's own score,
# which for this wrapper is a plain MAE where lower is better.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=2,
    verbose=1,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train_transformed)


#%%
# Re-run the search with the current ``pipeline`` and ``param_grid``.
# Score with negated MAE, as the other grid searches in this script do.
# Without ``scoring`` the search falls back to the estimator's own score,
# which for the Keras wrapper is a plain MAE where lower is better.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=2,
    verbose=1,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train_transformed)



#%%
# Same split as before (same seed), then fit and apply the target pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)
y_transformer = target_pipeline.fit(y_train)
y_train_transformed = target_pipeline.transform(y_train)

model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', regressor),
    ]
)

# Called without arguments, so no parameter is changed here.
regressor.set_params()

model_pipeline.fit(X_train, y_train_transformed)

#%%



# Candidate estimators for the final 'model' step of ``model_pipeline``.
param_grid = [
    {
        'model': [RandomForestRegressor()],
        'model__n_estimators': [50, 100],
        # Further candidates to explore later: max_depth, min_samples_split.
    },
    {
        # The wrapper needs a model-building function and the fixed build
        # arguments (input width, number of labels). A bare
        # ``KerasRegressorWrapper()`` has neither and cannot be fitted.
        'model': [KerasRegressorWrapper(model=build_model, **BEST_PARAMS_SECOND_LAYER)],
        "model__model__dropout_rate": [0.01, 0.02],
        "model__model__hidden_units": [(128,)*10, (128,)*5],
        # Further candidates to explore later: the wrapper-level optimizer,
        # its learning rate, and the loss.
    }
]

# One explicit metric for both candidates. Without ``scoring`` each estimator
# is ranked by its own ``score`` (the Keras wrapper reports a plain MAE where
# lower is better), so the two families would not be comparable.
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=2,
    verbose=1,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
)


# split and prepare data
X = df.drop(columns=['prezzo'])
y = df['prezzo']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
y_transformer = target_pipeline.fit(y_train)
y_transformed = target_pipeline.transform(y_train)

# apply gridsearch
grid_search.fit(X_train, y_transformed)

# Print the best parameters and model
print("Best parameters found: ", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")
print(f"Best estimator:\n{grid_search.best_estimator_}")


#%%
# Winning parameter combination from the most recent grid search.
best_params = grid_search.best_params_


model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', regressor),
    ]
)

# Print every selected parameter together with its value.
for param_name, param_value in best_params.items():
    print(param_name, param_value)


# Fit the best pipeline found by the search on the training data.
grid_search.best_estimator_.fit(X_train, y_transformed)





#%%
# Rebuild features and target, then split with the same seed as before.
y = df['prezzo']
X = df.drop(columns=['prezzo'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

# Fit the target pipeline on the training target, then transform it.
y_transformer = target_pipeline.fit(y_train)
y_transformed = target_pipeline.transform(y_train)

# Fit the pipeline and predict on the data it was trained on.
model_pipeline.fit(X_train, y_transformed)
y_st = model_pipeline.predict(X_train)




# %%
