# Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams.update(plt.rcParamsDefault)

In [None]:
# For plotting maps
import os
os.environ["PROJ_LIB"] = os.path.join(os.environ["CONDA_PREFIX"], "share", "proj")

# !conda install -c conda-forge basemap
from mpl_toolkits.basemap import Basemap

# # !pip install cartopy
# import cartopy.crs as ccrs
# import cartopy.feature as cfeature

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import mpl_toolkits.basemap # -> install 1.3.2
import sys
print(mpl_toolkits.basemap.__version__)

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import layers, models

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer

# 2009 - Jan

## Load the data and labels

In [None]:
# Year and month of the reference period; both are interpolated into the
# path of the cluster file loaded in the next cell.
_year = 2009
_month = 'jan'

In [None]:
# Load the pickled cluster table for the selected year/month and echo it.
hc_df_original = pd.read_pickle(f"output_files/clusters_spatial_regression_{_year}_{_month}.pkl")
hc_df_original

In [None]:
hc_df_original.columns

In [None]:
# Alternative (disabled): prefix the label with the year/month of the regime.
# hc_df_original['label'] = f'regime_{_year}_{_month}_' + hc_df_original['cluster'].astype(int).astype(str)
# Store the cluster id as a string label; the int cast drops any fractional
# part before the value is turned into text.
hc_df_original['label'] = hc_df_original['cluster'].astype(int).astype(str)

In [None]:
hc_df_original['label'].value_counts()

## Prepare for training

In [None]:
# Collapse the table to one row per cell_id by averaging its numeric columns.
# numeric_only=True is needed because hc_df_original also carries the string
# `label` column: pandas >= 2.0 raises TypeError when asked to average it,
# while older versions silently dropped it (same result as here).
# NOTE(review): `cluster` is averaged as well; that is only meaningful if every
# row of a cell_id carries the same cluster id -- confirm.
hc_df = hc_df_original.groupby('cell_id').mean(numeric_only=True)
hc_df

In [None]:
hc_df['cluster'].value_counts()

In [None]:
# Rebuild the string label from the per-cell averaged `cluster` column.
# NOTE(review): astype(int) truncates, so a cell whose rows had mixed cluster
# ids would be assigned the floor of their mean -- confirm this cannot happen.
hc_df['label'] = hc_df['cluster'].astype(int).astype(str)
hc_df['label'].value_counts()

In [None]:
# Class balance: number of cells per label and their share of all cells in %.
total_samples = len(hc_df)
label_counts = hc_df['label'].value_counts()
label_percentages = label_counts / total_samples * 100

# Show counts and percentages side by side, rounded for display only.
label_stats = pd.DataFrame(
    {
        'Count': label_counts,
        'Percentage': label_percentages,
    }
)
label_stats.round(2)

In [None]:
# Display names for the regimes, keyed by an integer id
# (presumably the cluster id -- confirm).
# Note the entries are not listed in key order: key 5 comes last.
regime_names_dict = {
    1: 'ICE I',
    2: 'ICE II',
    3: 'SUBTR I',
    4: 'SUBTR II',
    6: 'SUBP + UP I',
    7: 'SUBP + UP II',
    5: 'SUBP + UP III',
}

In [None]:
hc_df

In [None]:
# Number of distinct labels present in the per-cell table.
num_classes = len(hc_df['label'].unique())
num_classes

In [None]:
# Define features and target: the three slope columns are the model inputs,
# the string cluster label is the class to predict.
X = hc_df[['slope_sst', 'slope_dicp', 'slope_alk']]
y = hc_df['label']

In [None]:
y

In [None]:
# Sorted array of the distinct labels (np.unique returns its result sorted).
class_names = np.unique(y)
class_names

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Two-stage stratified split with identical settings at each stage:
#   1) hold out 20% of all samples as the test set,
#   2) hold out 20% of the remainder as the validation set.
_split_options = dict(test_size=0.2, random_state=42, shuffle=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **_split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **_split_options
)

In [None]:
X_train.shape

In [None]:
y_train.shape

# Model parameter tuning

In [None]:
def create_model(optimizer='adam', learning_rate=0.001, num_classes=7):
    """Build and compile the dense softmax classifier used for tuning.

    The network is five Dense(relu, L2=0.01) + Dropout(0.5) stages with
    64-128-256-128-64 units, followed by a softmax output layer. The input
    width is read from the module-level ``X_train``.

    Args:
        optimizer: ``'adam'`` or ``'sgd'``.
        learning_rate: Learning rate passed to the chosen optimizer.
        num_classes: Number of softmax output units (defaults to the 7
            clusters this notebook works with).

    Returns:
        The compiled Keras model (categorical cross-entropy loss; accuracy,
        precision and recall metrics).

    Raises:
        ValueError: If ``optimizer`` is not one of the supported names.
    """
    optimizer_classes = {'adam': Adam, 'sgd': SGD}
    if optimizer not in optimizer_classes:
        # Previously an unknown name fell through both branches and crashed
        # later with an unbound `opt`; fail early with a clear message.
        raise ValueError(
            f"Unsupported optimizer {optimizer!r}; expected one of {sorted(optimizer_classes)}"
        )
    opt = optimizer_classes[optimizer](learning_rate=learning_rate)

    model = models.Sequential([
        layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(X_train.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),  # one unit per cluster
    ])

    # Explicit metric names keep the history keys stable ('precision',
    # 'recall') no matter how many models were built earlier in the session.
    model.compile(
        optimizer=opt,
        loss='categorical_crossentropy',
        metrics=['accuracy', Precision(name='precision'), Recall(name='recall')],
    )

    return model

In [None]:
# Hyperparameter tuning grid: every combination of the four lists below is
# trained once per cross-validation fold (3 * 2 * 2 * 4 = 48 runs per fold).
batch_sizes = [16, 32, 64]
optimizers = ['adam', 'sgd']
learning_rates = [0.01, 0.001]
epochs = [50, 100, 150, 200]
n_splits = 5  # Number of folds for cross-validation

In [None]:
# Sanity check: walk the stratified folds once and print each fold's number.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
count = 0
for count, (train_index, val_index) in enumerate(skf.split(X_train, y_train), start=1):
    print(count)

In [None]:
%%time

# Grid search over batch size / optimizer / learning rate / epochs, repeated
# for every stratified fold of the training set. One result row is appended
# per (fold, combination).
results = []

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Fit the label binarizer once, on the training labels, and reuse that fit
# for the test labels so both encodings share the same column order. The
# encoding does not depend on the fold, so it is computed outside the loop.
label_binarizer = LabelBinarizer()
one_hot_encoded_y = label_binarizer.fit_transform(y_train)
one_hot_encoded_y_test = label_binarizer.transform(y_test)

# NOTE(review): every configuration is scored on the held-out test set, so
# picking hyperparameters from these numbers leaks test information; the
# validation fold would be the usual choice -- confirm intent.
# NOTE(review): features are fed unscaled here, while the final training cell
# standardizes them -- confirm intent.
for _CV, (train_index, val_index) in enumerate(skf.split(X_train, y_train), start=1):
    print(f"_CV: {_CV}")

    # skf.split yields positional indices. X_train is a DataFrame, so plain
    # [] would look the integers up as column labels (KeyError); use .iloc.
    X_train_new, X_val_new = X_train.iloc[train_index], X_train.iloc[val_index]
    one_hot_encoded_y_train, one_hot_encoded_y_val = one_hot_encoded_y[train_index], one_hot_encoded_y[val_index]

    for batch_size in batch_sizes:
        for optimizer in optimizers:
            for learning_rate in learning_rates:
                for epoch in epochs:
                    print(f'Batch size: {batch_size}, Optimizer: {optimizer},Learning rate: {learning_rate}, Epochs: {epoch}')
                    print()
                    model = create_model(optimizer=optimizer, learning_rate=learning_rate)
                    history = model.fit(X_train_new, one_hot_encoded_y_train,
                                        epochs=epoch, batch_size=batch_size,
                                        validation_data=(X_val_new, one_hot_encoded_y_val), verbose=0)

                    test_loss, test_accuracy, test_precision, test_recall = model.evaluate(X_test,
                                                                                           one_hot_encoded_y_test,
                                                                                           verbose=0)
                    results.append({'Cross Validation Iteration #': _CV,
                                    'Batch Size': batch_size, 'Optimizer': optimizer,
                                    'Learning Rate': learning_rate, 'Epochs': epoch,
                                    'Test Loss': test_loss, 'Test Accuracy': test_accuracy,
                                    'Test Precision': test_precision, 'Test Recall': test_recall})

In [None]:
# One row per (fold, hyperparameter combination) collected by the search loop.
results_df = pd.DataFrame(results)
results_df

In [None]:
# Persist the tuning results; the file name follows the configured
# `_year` / `_month` instead of a hard-coded "2009_jan".
results_df.to_pickle(f"output_files/parameter_tuning_{_year}_{_month}.pkl")

In [None]:
# Export the tuning results as CSV and Excel; the file names follow the
# configured `_year` / `_month` instead of a hard-coded "2009_jan".
results_df.to_csv(f"output_files/parameter_tuning_{_year}_{_month}.csv", decimal='.')
results_df.to_excel(f"output_files/parameter_tuning_{_year}_{_month}.xlsx", float_format='%.3f')

# Train the model

In [None]:
# Standardize features. The scaler is fitted on the training set only; the
# validation and test sets are transformed with those same statistics.
# (Calling fit_transform on X_val would refit the scaler, so both X_val and
# X_test would be scaled with validation-set statistics instead.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [None]:
# One-hot encode the labels. The binarizer is fitted on the training labels
# only and then reused, so every split gets the same class-to-column mapping
# even if a class happens to be missing from the validation or test set.
label_binarizer = LabelBinarizer()
one_hot_encoded_y_train = label_binarizer.fit_transform(y_train)
one_hot_encoded_y_val = label_binarizer.transform(y_val)
one_hot_encoded_y_test = label_binarizer.transform(y_test)

In [None]:
%%time

# Final classifier: five Dense(relu, L2=0.01) + Dropout(0.5) stages followed
# by a softmax layer. The output width is taken from the one-hot label matrix
# so it always matches the targets (7 columns for the 7 clusters).
n_outputs = one_hot_encoded_y_train.shape[1]

model = models.Sequential([
    layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(n_outputs, activation='softmax'),
])

learning_rate = 0.001
opt = Adam(learning_rate=learning_rate)

# Name the metrics explicitly: auto-generated names can receive a numeric
# suffix once other Precision/Recall objects exist in the session, which
# would break the history['precision'] / history['recall'] lookups below.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy', Precision(name='precision'), Recall(name='recall')],
)

# Stop once val_loss has not improved for 5 epochs and restore the best weights.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
]

# Train with early stopping (class weighting is currently disabled).
history = model.fit(
    X_train_scaled, one_hot_encoded_y_train,
    epochs=50, batch_size=32,
    validation_data=(X_val_scaled, one_hot_encoded_y_val),
    # class_weight=class_weight_dict,
    callbacks=callbacks,
)

In [None]:
history.history

In [None]:
mpl.rcParams['xtick.labelsize'] = 15
mpl.rcParams['ytick.labelsize'] = 15


def _history_key(history_dict, base_name):
    """Return the key under which the metric *base_name* was logged.

    Falls back to a single ``<base_name>_<suffix>`` key (e.g. ``precision_3``)
    when the plain name is absent; raises KeyError if the match is missing or
    ambiguous.
    """
    if base_name in history_dict:
        return base_name
    prefix = f'{base_name}_'
    matches = [key for key in history_dict if key.startswith(prefix)]
    if len(matches) != 1:
        raise KeyError(f"Cannot find a unique history entry for {base_name!r}: {matches}")
    return matches[0]


def _plot_training_curve(history_dict, base_name, label, title):
    """Plot one training metric and its validation counterpart per epoch."""
    key = _history_key(history_dict, base_name)
    training_values = history_dict[key]
    validation_values = history_dict[f'val_{key}']
    # Local name on purpose: rebinding the global `epochs` here would
    # overwrite the hyperparameter list used by the tuning loop.
    epoch_numbers = range(1, len(training_values) + 1)

    plt.figure(figsize=(8, 6), dpi=200)
    plt.plot(epoch_numbers, training_values, 'b', label=f'Training {label}')
    plt.plot(epoch_numbers, validation_values, 'r', label=f'Validation {label}')
    plt.title(title, fontsize=20)
    plt.xlabel('Epochs', fontsize=20)
    plt.ylabel(label, fontsize=20)
    plt.legend()
    plt.show()


# One figure per tracked quantity: (history key, axis/legend label, title).
for _base_name, _label, _title in (
    ('loss', 'Loss', 'Training and Validation Loss - Categorical crossentropy'),
    ('accuracy', 'Accuracy', 'Training and Validation Accuracy'),
    ('precision', 'Precision', 'Training and Validation Precision'),
    ('recall', 'Recall', 'Training and Validation Recall'),
):
    _plot_training_curve(history.history, _base_name, _label, _title)

## Model Evaluation

In [None]:
# Evaluate the model on the test data
model.evaluate(X_test_scaled, one_hot_encoded_y_test)

In [None]:
# Model outputs for the scaled test features (the final layer is a softmax,
# so each row holds one score per class); echo the array shape.
y_predict = model.predict(X_test_scaled)
y_predict.shape

In [None]:
y_predict

In [None]:
np.argmax(y_predict, axis=1)

In [None]:
# # Create a DataFrame from y_test with the original index
# y_test_df = pd.DataFrame(y_test, columns=label_binarizer.classes_, index=hc_df.index[X_test.index])

# # Merge the y_test_df with the hc_df dataframe using the index
# hc_df_with_predictions = hc_df.merge(y_test_df, left_index=True, right_index=True)

# # hc_df_with_predictions now contains the original features and the one-hot encoded labels
# hc_df_with_predictions.head()

In [None]:
# Reduce the one-hot ground truth and the predicted probabilities to the
# index of their largest entry, i.e. one class index per test sample.
y_test_indices, y_pred_indices = (
    np.argmax(encoded, axis=1) for encoded in (one_hot_encoded_y_test, y_predict)
)

# Overall accuracy: share of test samples whose predicted index is correct.
is_correct = y_test_indices == y_pred_indices
accuracy = np.mean(is_correct)
accuracy_percentage = accuracy * 100

print(f"Accuracy: {accuracy_percentage:.2f}%")

In [None]:
y_pred_indices

In [None]:
# Confusion matrix of true vs. predicted class indices for the test set,
# printed under a heading.
confusion = confusion_matrix(y_test_indices, y_pred_indices)

print("Confusion Matrix:", confusion, sep="\n")

In [None]:
def plot_confusion_matrix(confusion_matrix, class_names, vmin, vmax):
    """Draw an annotated heatmap of a confusion matrix.

    Args:
        confusion_matrix: Square matrix of integer counts (cells are
            annotated with the ``"d"`` format).
        class_names: Tick labels used on both axes.
        vmin: Lower bound of the colour scale.
        vmax: Upper bound of the colour scale.
    """
    # The module-level seaborn import is commented out, so `sns` would be
    # undefined here; import it locally to keep the function self-contained.
    import seaborn as sns

    plt.figure(figsize=(8, 6), dpi=200)
    sns.set(font_scale=1.2)
    sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names,
                yticklabels=class_names, vmin=vmin, vmax=vmax)
    plt.xlabel('Predicted Carbon Provinces', fontsize=20)
    plt.ylabel('Original Carbon Provinces', fontsize=20)
    plt.title('Confusion Matrix for Model Evaluation', fontsize=20, pad=10)
    plt.show()

In [None]:
# Plot the test-set confusion matrix.
# NOTE(review): the colour scale is capped at 50; any cell with a larger
# count is drawn in the darkest colour -- confirm this is intended.
plot_confusion_matrix(confusion_matrix=confusion, class_names=class_names,
                      vmin=0, vmax=50)

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Confusion matrix for 7 biomes. calculate_metrics below treats rows as the
# true class and columns as the predicted class.
# NOTE(review): these counts are hard-coded rather than taken from the
# `confusion` array computed earlier -- presumably pasted from a previous
# run; confirm they are still current.
conf_matrix = np.array([
    [  5,  15,   0,   0,   2,   0,   0],
    [  0,  91,   0,   2,   1,   0,   0],
    [  0,   4, 757,  36,   0,   0,   5],
    [  0,   0,   2, 504,   0,   0,   0],
    [  0,   1,   0,   0, 299,   0,   2],
    [  0,   0,   0,   1,   6,   0,   5],
    [  0,   0,  18,   0,  11,   0, 378]
])

# Calculate metrics
def calculate_metrics(conf_matrix):
    """Compute one-vs-rest accuracy, precision and recall for every class.

    Args:
        conf_matrix: Square 2-D NumPy array of counts; row ``i`` holds the
            samples whose true class is ``i``, column ``j`` those predicted
            as class ``j``.

    Returns:
        Three lists ``(accuracy, precision, recall)`` with one entry per
        class. A ratio whose denominator is zero is reported as 0.
    """
    num_classes = conf_matrix.shape[0]
    total = conf_matrix.sum()
    accuracy = []
    precision = []
    recall = []

    for i in range(num_classes):
        tp = conf_matrix[i, i]
        fn = conf_matrix[i, :].sum() - tp
        fp = conf_matrix[:, i].sum() - tp
        tn = total - (tp + fn + fp)

        # Per-class accuracy counts true negatives as correct decisions too.
        # The earlier tp / row-sum formula was just recall under another name.
        accuracy.append((tp + tn) / total if total != 0 else 0)
        precision.append(tp / (tp + fp) if tp + fp != 0 else 0)
        recall.append(tp / (tp + fn) if tp + fn != 0 else 0)

    return accuracy, precision, recall

# Per-class scores from the hard-coded confusion matrix.
accuracy, precision, recall = calculate_metrics(conf_matrix)

# Tick labels, one per row/column of the confusion matrix.
labels = ['ICE I', 'ICE II','SUBTR I','SUBTR II','SUBP + UP III','SUBP + UP I','SUBP + UP II']
x = np.arange(len(labels))  # one bar group per label
width = 0.2  # width of a single bar

fig, ax = plt.subplots(figsize=(15, 8), dpi=200)

# Three bars per label, placed one bar width left of, on, and right of the tick.
rects1, rects2, rects3 = (
    ax.bar(x + shift * width, scores, width, label=series_name)
    for shift, scores, series_name in (
        (-1, accuracy, 'Accuracy'),
        (0, precision, 'Precision'),
        (1, recall, 'Recall'),
    )
)

# Axis label, title, tick labels and legend.
ax.set_ylabel('Scores', fontsize=20)
ax.set_title('Model evaluation: Test Accuracy, Precision, and Recall per biome', fontsize=20, pad=15)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Add value labels
def add_labels(rects):
    """Write each bar's height (two decimals) next to its top on the module-level ``ax``."""
    for bar in rects:
        bar_height = bar.get_height()
        bar_centre_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height:.2f}',
            xy=(bar_centre_x, bar_height),
            xytext=(8, 0),  # shift the text 8 points to the right of the anchor
            textcoords="offset points",
            ha='center',
            va='bottom',
            fontsize=11,
            rotation=35,
        )

# Annotate every bar of the three series with its value.
for bar_group in (rects1, rects2, rects3):
    add_labels(bar_group)

# Bold legend placed outside the axes, to the right.
legend_properties = dict(weight='bold', size=15)
plt.legend(bbox_to_anchor=(1.18, 1.0), prop=legend_properties)

# Slanted x tick labels; same tick label size on both axes.
plt.xticks(rotation=45)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=15)

fig.tight_layout()

plt.show()

In [None]:
# labels_predicted = np.argmax(y_predict, axis=1)
# labels_original = y_test
# labels = range(0,8)
# labels_predicted = [labels[i] for i in predicted_labels_argmax]
# labels_predicted

In [None]:
# np.unique(y_test.values)
# np.unique(labels_predicted)

In [None]:
# Sorted distinct labels of the per-cell table (rebinds `class_names`).
class_names = np.sort(hc_df['label'].unique())

In [None]:


# Assuming y_predict is your array of predicted probabilities with shape (number_of_samples, number_of_classes)
# Find the class with the highest probability for each sample
# predicted_labels = np.argmax(y_predict, axis=1)

# Get the selected class names based on the class indices
# selected_class_names = [class_names[label] for label in predicted_labels]

# for label in predicted_labels:
#     print(f"{label} --> {class_names[label]}")

In [None]:
np.argmax(y_predict, axis=1)

## Save the model

In [None]:
# Save the entire model as a `.keras` zip archive; this is the file that the
# prediction loop loads again later.
model.save('jan_2009_model_v1.keras')

In [None]:
# Save the model a second time under a path without a file extension.
# NOTE(review): how an extension-less path is handled depends on the installed
# Keras/TensorFlow version (it may write a SavedModel directory or be
# rejected) -- confirm this copy is still needed.
model.save('tracking_model_v1')  # Save the entire model

# Predict clusters 

## Looping over multiple files

In [None]:
# predict_year = 2009
# predict_month = 'jan'
# Month suffixes used to build the per-month input/output file names.
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
# NOTE(review): this rebinds `scaler` to a fresh, unfitted StandardScaler, so
# the statistics learned from the training features are not available to the
# prediction loop -- confirm that per-file re-standardization is intended.
scaler = StandardScaler()
# Class label for each softmax output index (index 0 -> '1', ..., 6 -> '7').
class_names = ['1', '2', '3', '4', '5', '6', '7']

In [None]:
# Load the saved model
loaded_model = tf.keras.models.load_model('jan_2009_model_v1.keras')

# Predict a cluster for every cell of every month from 1958 through 2018.
for yr in range(1958, 2019):

    for month in months:

        # January 2009 is skipped (the saved model is named after that month).
        if yr == 2009 and month == 'jan':
            continue

        print(f"Predicting:{yr} {month}")
        data_new = pd.read_pickle(f"output_files/spatial_regression_{yr}_{month}.pkl")
        # One row per cell_id. numeric_only=True keeps this working on
        # pandas >= 2.0 if the table carries non-numeric columns.
        hc_df_new = data_new.groupby('cell_id').mean(numeric_only=True)
        X_new = hc_df_new[['slope_sst', 'slope_dicp', 'slope_alk']].dropna()

        # Standardize features.
        # NOTE(review): the scaler is refitted on each file, so every month is
        # standardized with its own mean/std rather than with the statistics
        # of the training data -- confirm this is intended.
        X_new_scaled = scaler.fit_transform(X_new.values)
        tracked_labels_prob = loaded_model.predict(X_new_scaled)
        # Find the class with the highest probability for each sample
        tracked_labels_indices = np.argmax(tracked_labels_prob, axis=1)
        # Get the selected class names based on the class indices
        selected_regimes = [class_names[label] for label in tracked_labels_indices]

        # reset_index() turns the groupby key back into a `cell_id` column.
        X_new = X_new.reset_index()
        X_new['cluster'] = selected_regimes

        # Attach the predicted cluster to the original rows. X_new has no
        # `grid_id` column (its only key is `cell_id`), so merging on
        # `grid_id` raised a KeyError.
        merged_df = pd.merge(X_new[['cluster', 'cell_id']], data_new, on='cell_id', how='left')
        merged_df.to_pickle(f"output_reg_1958_2018/adaptive_hc_clusters_{yr}_{month}.pkl")
        # print(merged_df)
        print()