In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def plot_model_transferability(standard_df_dict, model_dict, title):
    """
    Plot predicted-vs-actual scatter panels to assess model transferability.

    For every ``(target_var, model)`` pair in ``model_dict`` the matching
    dataframe is looked up in ``standard_df_dict``. The target column (and the
    ``year`` column, when one is present) is removed to form the feature
    matrix, the model predicts on those features, and the predictions are
    plotted against the actual target values. MSE, MAE and R2 are shown in
    each panel title. All rows of each dataframe are used; no filtering by
    year is applied.

    Parameters:
    -----------
    standard_df_dict : dict
        Dictionary containing standardized dataframes for each target variable
        e.g. {'orgc': standard_orgc_df, 'tceq': standard_tceq_df}
        Each dataframe must contain a column named after its key.
    model_dict : dict
        Dictionary containing trained models for each target variable
        e.g. {'orgc': orgc_model, 'tceq': tceq_model}
        Each model must expose ``predict(features)``.
    title : str
        Text appended to 'Model Transferability ' in the figure suptitle.

    Returns:
    --------
    None. Displays plot comparing model performance on historical data.

    Raises:
    -------
    ValueError
        If ``model_dict`` is empty.
    KeyError
        If a target variable has no entry in ``standard_df_dict`` or its
        dataframe lacks the target column.
    """
    if not model_dict:
        raise ValueError('model_dict must contain at least one model')

    # squeeze=False always returns a 2-D array of Axes, so a single-model
    # dict is handled the same way as a multi-model one.
    fig, axes = plt.subplots(1, len(model_dict), figsize=(15, 4), squeeze=False)

    for ax, (target_var, model) in zip(axes[0], model_dict.items()):
        # Get data for this target
        df = standard_df_dict[target_var]

        # Prepare features and target. The target column must exist; 'year'
        # is optional, so its absence is tolerated rather than raising.
        target = df[target_var]
        features = df.drop(columns=[target_var]).drop(columns=['year'], errors='ignore')

        # Make predictions
        predictions = model.predict(features)

        # Calculate metrics
        mse = mean_squared_error(target, predictions)
        mae = mean_absolute_error(target, predictions)
        r2 = r2_score(target, predictions)

        # Scatter of actual vs predicted plus the y = x reference line
        ax.scatter(target, predictions, alpha=0.7, edgecolors='k')
        ax.plot([target.min(), target.max()], [target.min(), target.max()], 'r--')
        ax.set_title(f'{target_var.upper()} Model Performance on Historical Data\n'
                     f'(MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f})')
        ax.set_xlabel(f'Actual {target_var.upper()}')
        ax.set_ylabel(f'Predicted {target_var.upper()}')

    plt.suptitle(f'Model Transferability {title}')
    plt.tight_layout()
    plt.figtext(0.5, 0.01,
                'The scatter plots show the predicted vs actual values for historical data. '
                'The red dashed line represents the perfect prediction.',
                ha='center', fontsize=10)
    plt.show()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardise_data(df):
    """
    Standardise the float64/int64 columns of a dataframe.

    A ``StandardScaler`` is fitted on the numeric columns of ``df`` itself and
    used to transform them; all other columns are carried over unchanged.
    The input dataframe is not modified.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input data. Only columns of dtype float64 or int64 are scaled.

    Returns:
    --------
    pandas.DataFrame
        New dataframe with the same index as ``df``. The non-numeric columns
        come first (in the order produced by ``Index.difference``), followed
        by the scaled numeric columns in their original order.
    """
    # select numeric columns
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    other_cols = df.columns.difference(numeric_cols)

    # standardise data
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[numeric_cols])

    # Reuse the input index: fit_transform returns a bare array, and a fresh
    # RangeIndex would make the concat below misalign rows (and pad with NaN)
    # whenever df has been filtered or otherwise carries a non-default index.
    df_scaled = pd.DataFrame(scaled_values, columns=numeric_cols, index=df.index)

    # add back non-numeric columns
    return pd.concat([df[other_cols], df_scaled], axis=1)

# Load data
master_df = pd.read_csv('../outputs/20241216/Guatemala_wosis/Guatemala_wosis_merged.csv')

# Feature engineering, done as vectorised column arithmetic rather than a
# row-wise apply (same values, no Python-level loop over rows).

# create organic matter
# NOTE(review): 1.724 looks like the conventional organic-carbon to
# organic-matter conversion factor — confirm the source.
master_df['organic_matter'] = 1.724 * master_df['orgc']

# create bulk density
# NOTE(review): the 1.62 / 0.06 coefficients are presumably from a
# pedotransfer function; the reference is not shown here.
master_df['bulk_density'] = 1.62 - 0.06 * master_df['organic_matter']

# create sum of silt plus clay, treating a missing fraction as 0
master_df['silt_plus_clay'] = master_df['silt'].fillna(0) + master_df['clay'].fillna(0)

# drop silt and clay column
master_df = master_df.drop(columns=['silt', 'clay'])

print( 'Length of master_df:', len(master_df))

# split data: keep only rows where the respective target is present
orgc_df = master_df.dropna(subset=['orgc']).copy()
tceq_df = master_df.dropna(subset=['tceq']).copy()

# standardise data (each subset is scaled with statistics fitted on itself)
standard_orgc_df = standardise_data(orgc_df)
standard_tceq_df = standardise_data(tceq_df)

# columns to use
# NOTE(review): organic_matter and bulk_density are linear functions of orgc,
# so once standardised they carry the orgc target itself — confirm this is
# intended for the ORGC model.
# NOTE(review): 'year' is not among these columns, while
# plot_model_transferability drops a 'year' column — confirm the helper copes
# with its absence.
columns_to_use = ['bulk_density', 'lower_depth', 'organic_matter', 'orgc', 'phaq', 'sand',
       'silt_plus_clay', 'tceq', 'upper_depth']

standard_df_dict = {
    'orgc': standard_orgc_df[columns_to_use],
    'tceq': standard_tceq_df[columns_to_use],
}

# joblib.load unpickles arbitrary objects; only load model files from a
# trusted source.
model_dict = {
    'orgc': joblib.load('../models/20241216/Mexico/ORGC_best_model.pkl'),
    'tceq': joblib.load('../models/20241216/Mexico/TCEQ_best_model.pkl')
}

plot_model_transferability(standard_df_dict, model_dict, 'from Mexico to Guatemala')