In [1]:
import xarray as xr
import xesmf as xe
import numpy as np
import warnings
import geopandas as gpd
import pandas as pd
import regionmask
from cmip_preprocessing_funcs import *
import os
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import requests
import seaborn as sns
from joblib import dump

# Ignore RuntimeWarnings in numpy's nanfunctions
warnings.filterwarnings('ignore', category=RuntimeWarning, module='numpy.lib.nanfunctions')

print("DONE")



DONE


In [2]:
import numpy as np
import xarray as xr
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

def apply_gaussian_mixture(ds_final, predictors, n_clust, random_state, *,
                           max_iter=1000, reg_covar=1e-4, n_init=50):
    """Cluster grid cells with a Gaussian mixture fitted to temporal variability.

    For every predictor the standard deviation over the ``time`` dimension is
    taken, giving one feature per predictor for each (lat, lon) cell. Cells
    with a NaN in any feature are left out of the fit; the remaining cells are
    standardized with ``StandardScaler`` and clustered with ``GaussianMixture``.

    Parameters
    ----------
    ds_final : xarray.Dataset
        Dataset holding the variables named in ``predictors``. After the
        reduction over ``time`` the selected variables must have exactly the
        dimensions ``lat`` and ``lon``. **Modified in place**: the variables
        ``cluster`` and ``c0`` ... ``c{n_clust-1}`` are added to it.
    predictors : list of str
        Names of the variables in ``ds_final`` used as clustering features.
    n_clust : int
        Number of mixture components.
    random_state : int or None
        Seed forwarded to ``GaussianMixture``.
    max_iter : int, optional
        Forwarded to ``GaussianMixture``. Default 1000.
    reg_covar : float, optional
        Forwarded to ``GaussianMixture``. Default 1e-4.
    n_init : int, optional
        Forwarded to ``GaussianMixture``. Default 50.

    Returns
    -------
    xarray.Dataset
        The same ``ds_final`` object, now carrying:

        * ``cluster`` (lat, lon): predicted component label stored as float,
          NaN where the cell was excluded from the fit.
        * ``c{i}`` (lat, lon): predicted probability of component ``i``,
          NaN where the cell was excluded from the fit.

    Raises
    ------
    ValueError
        If fewer cells with complete features exist than ``n_clust``.
    """
    # One feature per predictor: standard deviation across time.
    ds_gauss = ds_final[predictors].std(dim='time')

    # Stack the predictors on a trailing axis, then flatten the grid so each
    # row is one (lat, lon) cell and each column is one predictor.
    data_array = ds_gauss.to_array().transpose('lat', 'lon', 'variable')
    n_lat, n_lon, n_features = data_array.shape
    n_samples = n_lat * n_lon
    features_flat = data_array.values.reshape(n_samples, n_features)

    # Rows with a value for every feature; computed once and reused both for
    # filtering and for scattering the results back onto the grid.
    valid_mask = ~np.isnan(features_flat).any(axis=1)
    n_valid = int(valid_mask.sum())
    if n_valid < n_clust:
        raise ValueError(
            f"Cannot fit {n_clust} mixture components: only {n_valid} of "
            f"{n_samples} grid cells have non-NaN values for all predictors "
            f"{list(predictors)}."
        )

    # Standardize so that each predictor contributes on a comparable scale.
    features_norm = StandardScaler().fit_transform(features_flat[valid_mask])

    # Gaussian Mixture Model
    gmm = GaussianMixture(
        n_components=n_clust,
        max_iter=max_iter,
        random_state=random_state,
        reg_covar=reg_covar,
        n_init=n_init,
    )
    gmm.fit(features_norm)
    labels = gmm.predict(features_norm)
    probs = gmm.predict_proba(features_norm)

    # NaN-filled full-grid buffers; only the valid rows receive results.
    # The label buffer is float so that NaN can mark excluded cells.
    labels_full = np.full((n_samples,), np.nan)
    probs_full = np.full((n_samples, n_clust), np.nan)
    labels_full[valid_mask] = labels
    probs_full[valid_mask] = probs

    # Back to the (lat, lon) grid layout.
    labels_reshaped = labels_full.reshape(n_lat, n_lon)
    probs_reshaped = probs_full.reshape(n_lat, n_lon, n_clust)

    # Attach the results to ds_final (in-place mutation kept for callers that
    # rely on it).
    grid_coords = {'lat': ds_gauss['lat'], 'lon': ds_gauss['lon']}
    ds_final['cluster'] = xr.DataArray(labels_reshaped, dims=('lat', 'lon'), coords=grid_coords)
    for i in range(n_clust):
        ds_final[f'c{i}'] = xr.DataArray(probs_reshaped[:, :, i], dims=('lat', 'lon'), coords=grid_coords)

    return ds_final

In [3]:
# Region boundaries; only the first 11 region names are clustered.
shp = gpd.read_file('../Data/New EIWG LMEs/eiwg_boundaries_20230512_ZachS_ChangeLongto0_360.shp')
lmes = list(shp['RegionName'])[:11]

# Plain string: the path has no placeholders, so no f-string is needed.
preds_path = '../Data/Fall 2023/Predictors_and_CMEMS.nc'
# NOTE(review): the dataset is left open on purpose, since later cells may
# still read from ds_preds — confirm, and close it once it is no longer needed.
ds_preds = xr.open_dataset(preds_path)

# Number of mixture components per region, in the same order as `lmes`.
n_clusts = [2, 5, 3, 5, 2, 2, 3, 4, 2, 4, 5]

# zip() stops at the shorter input, so a mismatch would silently drop regions
# or pair them with the wrong component count. Fail loudly instead.
if len(lmes) != len(n_clusts):
    raise ValueError(
        f"Expected {len(n_clusts)} region names from the shapefile but got "
        f"{len(lmes)}; lmes and n_clusts must have the same length."
    )

# Region name -> number of clusters
lmes_n_clusts_dict = dict(zip(lmes, n_clusts))

# Initialize empty dictionary to collect error stats dataframes
error_stats_dict = {}

# Regions that use 'WindSpeed' in place of 'CHL' as the third predictor.
WIND_SPEED_LMES = ('Chukchi and North Bering Seas', 'Beaufort Sea')
# Seed passed to the Gaussian mixture so that reruns give the same clusters.
GMM_RANDOM_STATE = 69420

for lme, n_clust in lmes_n_clusts_dict.items():
    print(lme)
    print("STARTING")
    if lme == 'Pacific Islands':
        # This region is taken as a fixed lat/lon bounding box rather than
        # from the 'lme' mask variable.
        ds_final = ds_preds.sel(lat=slice(-20, 40), lon=slice(140, 220))
    else:
        # presumably the 'lme' variable stores each cell's region as its
        # position in the shapefile's RegionName list — verify against the
        # code that produced Predictors_and_CMEMS.nc
        ds_final = ds_preds.where(ds_preds['lme'] == lmes.index(lme), drop=True)
    print("SUBSETTED")

    if lme in WIND_SPEED_LMES:
        predictors = ['mslp', 'SST', 'WindSpeed']
    else:
        predictors = ['mslp', 'SST', 'CHL']

    ds_final_with_clusters = apply_gaussian_mixture(ds_final, predictors, n_clust, random_state=GMM_RANDOM_STATE)
    ds_final_with_clusters.to_netcdf(f"../Data/Fall 2023/{lme}_Clusters.nc")
    print('--------------------------')





U.S. Caribbean
STARTING
SUBSETTED
--------------------------
Pacific Islands
STARTING
SUBSETTED
--------------------------
Gulf of Mexico
STARTING
SUBSETTED
--------------------------
California Current
STARTING
SUBSETTED
--------------------------
Chukchi and North Bering Seas
STARTING
SUBSETTED
--------------------------
Northeast U.S.
STARTING
SUBSETTED
--------------------------
Gulf of Alaska
STARTING
SUBSETTED
--------------------------
Aleutian Islands
STARTING
SUBSETTED
--------------------------
Beaufort Sea
STARTING
SUBSETTED
--------------------------
East Bering Sea
STARTING
SUBSETTED
--------------------------
Southeast U.S.
STARTING
SUBSETTED
--------------------------
