**This file is for reference only and cannot be executed directly,
as it requires access to a confidential CSV data file that is not included in this repository.**

**The data used for training contains proprietary company information and is restricted from public access.
For demonstration purposes, you can refer to the model file provided in this repository.**

In [None]:
# Importing necessary libraries
import os
import json
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import shap
import matplotlib.pyplot as plt


### Configuration and Data Loading

Loading the listing data from `train.csv`. The features and the target (`Listing_Price_ClosePrice`) are separated in the data-cleaning cell below.


In [None]:
# Read the training data from a local CSV file (the file is not included in this repository)
train = pd.read_csv('train.csv')

In [None]:
# X refers to the same DataFrame object as `train` (no copy is made), so the
# in-place column rename and drop performed in the cleaning cell also modify `train`.
X = train

In [None]:
# Data cleaning: normalise column names, pull out the target, discard unused columns

# Swap every '.' in a column name for '_' (modifies X in place)
X.rename(columns=lambda name: name.replace('.', '_'), inplace=True)

# Regression target: the closing price of the listing
y = X['Listing_Price_ClosePrice']

# Columns removed from the feature matrix (this includes the target itself,
# so it cannot leak into the features)
columns_to_drop = [
    "Characteristics_LotFeatures",
    "Listing_Dates_CloseDate",
    "Listing_ListingId",
    "Listing_Price_ClosePrice",
    "Location_Address_City",
    "Location_Address_CountyOrParish",
    "Location_Address_PostalCode",
    "Location_Address_PostalCodePlus4",
    "Location_Address_StateOrProvince",
    "Location_Address_StreetDirectionPrefix",
    "Location_Address_StreetDirectionSuffix",
    "Location_Address_StreetName",
    "Location_Address_StreetNumber",
    "Location_Address_StreetSuffix",
    "Location_Address_UnitNumber",
    "Location_Address_UnparsedAddress",
    "Location_Area_SubdivisionName",
    "Location_School_HighSchoolDistrict",
    "Property_PropertyType",
    "Structure_Basement",
    "Structure_Cooling",
    "Structure_Heating",
    "Structure_NewConstructionYN",
    "Structure_ParkingFeatures",
    "Structure_Rooms_RoomsTotal",
    "Tax_Zoning",
    "UnitTypes_UnitTypeType",
    "ImageData_features_reso_results",
    "ImageData_room_type_reso_results",
    "ImageData_style_exterior_summary_label",
    "ImageData_style_stories_summary_label",
    "Location_Address_CensusBlock",
    "Location_Address_CensusTract",
]
X.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Hold-out configuration: fraction of rows reserved for testing and the RNG seed
test_size = 0.2
random_state = 42

# Partition features and target into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)


### Data Preprocessing Pipeline

Setting up a preprocessing pipeline with transformations for numerical and categorical features.


In [None]:
# Split the training columns by dtype: numeric columns vs. object (string-like) columns
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

# Numeric columns: standardise, then expand to degree-2 polynomial terms (no bias column)
numeric_transformers = [
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
]

# Categorical columns: one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformers = [('onehot', OneHotEncoder(handle_unknown='ignore'))]

# Route each column group through its own sub-pipeline
numeric_pipeline = Pipeline(steps=numeric_transformers)
categorical_pipeline = Pipeline(steps=categorical_transformers)
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])


### Model Selection

Selecting and configuring the model based on the configuration. Here we choose Random Forest Regressor.


In [None]:
# Model configuration
model_type = 'random_forest'
n_estimators = 100
max_depth = 10
min_samples_split = 2

# Map each supported model name to a zero-argument factory; only the selected
# factory is called, so only one estimator is instantiated.
model_builders = {
    'linear_regression': LinearRegression,
    'random_forest': lambda: RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    ),
}

# Reject unknown model names before trying to build anything
if model_type not in model_builders:
    raise ValueError(f"Model '{model_type}' not supported")

model = model_builders[model_type]()


### Complete Pipeline

Combining the preprocessor and the model into a complete pipeline.


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# NOTE(review): this rebinds `preprocessor`, so the ColumnTransformer built in the
# earlier preprocessing cell is not part of the final pipeline — confirm this is intended.
# Steps: fill NaN values with the column mean, then expand to degree-2 polynomial terms.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('polynomial_features', PolynomialFeatures(degree=2)),
]
preprocessor = Pipeline(steps=preprocessing_steps)

# Full pipeline: preprocessing followed by the selected regressor
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])


### Model Training and Logging in MLflow

Training the model and logging the parameters and the model itself in MLflow.


In [None]:
# Open a new MLflow run; it stays active until mlflow.end_run() is called
mlflow.start_run()

# Record the run configuration, one MLflow parameter per entry
run_params = {
    'model_type': model_type,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'test_size': test_size,
    'min_samples_split': min_samples_split,
}
for param_name, param_value in run_params.items():
    mlflow.log_param(param_name, param_value)

# Fit preprocessing and regressor together on the training split
pipeline.fit(X_train, y_train)

# Store the fitted pipeline as a run artifact under 'model'
mlflow.sklearn.log_model(pipeline, artifact_path='model')


### Model Evaluation

Evaluating the model on the test set, calculating metrics, and logging them in MLflow.


In [None]:
# Run the fitted pipeline on the held-out split
y_pred = pipeline.predict(X_test)

# Regression metrics comparing predictions against the true test targets
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Record each metric on the active MLflow run
for metric_name, metric_value in (('mse', mse), ('mae', mae), ('r2_score', r2)):
    mlflow.log_metric(metric_name, metric_value)

# Echo the metrics in the notebook output
print(f'MSE: {mse}, MAE: {mae}, R2 Score: {r2}')


### Feature Importance Logging

Logging feature importances (for models that support it) in MLflow.


In [None]:
# Feature-importance export: writes the full ranking to CSV and a bar chart of
# the strongest features to PNG, and logs both files as MLflow artifacts.

# Pull the fitted regressor and the names of the features it was trained on
model = pipeline.named_steps['regressor']
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Tree ensembles expose feature_importances_; linear models expose coef_
# (absolute value is used so that sign does not affect the ranking)
if hasattr(model, 'feature_importances_'):
    importances = model.feature_importances_
elif hasattr(model, 'coef_'):
    importances = np.abs(model.coef_)
else:
    importances = None
    print("The model does not have 'feature_importances_' or 'coef_' attributes")

if importances is not None:
    # Full ranking, most important feature first
    feat_imp_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)

    # The CSV artifact keeps every feature
    feat_imp_csv = 'feature_importance.csv'
    feat_imp_df.to_csv(feat_imp_csv, index=False)
    mlflow.log_artifact(feat_imp_csv)

    # The degree-2 polynomial expansion can produce a very large number of
    # features; drawing one bar per feature in a fixed-size figure is
    # unreadable, so the chart is limited to the highest-ranked rows.
    top_n = 20
    plot_df = feat_imp_df.head(top_n)

    plt.figure(figsize=(10, 6))
    plt.barh(plot_df['Feature'], plot_df['Importance'])
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.xlabel('Importance')
    plt.title(f'Top {len(plot_df)} Feature Importances')
    plt.tight_layout()

    # Save the chart, release the figure, and log the image
    feat_imp_png = 'feature_importance.png'
    plt.savefig(feat_imp_png)
    plt.close()
    mlflow.log_artifact(feat_imp_png)


### SHAP Values Logging

Calculating and logging SHAP values to explain the model's predictions.


In [None]:
# SHAP explanation: computes SHAP values on a sample of the test set and logs a
# summary plot and a dependence plot as MLflow artifacts under 'shap_plots'.

# Directory for SHAP images
shap_images_dir = "shap_images"
os.makedirs(shap_images_dir, exist_ok=True)

# Use a sample of the test data to speed up SHAP computation. The sample size
# is clamped to the number of available rows because DataFrame.sample raises
# ValueError when n exceeds the frame length (sampling without replacement).
shap_sample_size = min(100, len(X_test))
X_test_sample = X_test.sample(n=shap_sample_size, random_state=random_state)

# Apply the fitted preprocessing so the explainer sees the same feature space
# as the regressor
X_transformed = pipeline.named_steps['preprocessor'].transform(X_test_sample)
X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

# Select the SHAP explainer based on model type: tree explainer for the random
# forest, linear explainer otherwise
if model_type == 'random_forest':
    explainer = shap.TreeExplainer(model)
else:
    explainer = shap.LinearExplainer(model, X_transformed)

# Calculate SHAP values for the sampled rows
shap_values = explainer.shap_values(X_transformed)

# Plot the SHAP summary and save it (show=False keeps the figure open for savefig)
shap_summary_png = os.path.join(shap_images_dir, 'shap_summary_plot.png')
shap.summary_plot(shap_values, features=X_transformed_df, feature_names=feature_names, show=False)
plt.savefig(shap_summary_png, bbox_inches='tight')
plt.close()
mlflow.log_artifact(shap_summary_png, artifact_path='shap_plots')

# Plot SHAP dependence for the feature with the largest mean absolute SHAP value
top_feature_index = np.argmax(np.abs(shap_values).mean(0))
feature_name = feature_names[top_feature_index]
shap_dependence_png = os.path.join(shap_images_dir, 'shap_dependence_plot.png')
shap.dependence_plot(feature_name, shap_values, X_transformed_df, feature_names=feature_names, show=False)
plt.savefig(shap_dependence_png, bbox_inches='tight')
plt.close()
mlflow.log_artifact(shap_dependence_png, artifact_path='shap_plots')


### End the MLflow Experiment

Ending the MLflow run to finalize the logging.


In [None]:
# Close the active MLflow run that was opened earlier with mlflow.start_run()
mlflow.end_run()


In [None]:
import pickle

# Persist the whole pipeline (preprocessing + regressor) as a single pickle file
model = pipeline
model_path = "model.pkl"

# Serialise to bytes first, then write them out in binary mode
with open(model_path, "wb") as file:
    file.write(pickle.dumps(model))

print(f"Model saved to {model_path}")

Model saved to model.pkl
