In [1]:
import pandas as pd
import glob
import numpy as np
import sompy
from sklearn.preprocessing import MinMaxScaler

CACHEDIR=/albedo/home/yvjennig/.cache/matplotlib
Using fontManager instance from /albedo/home/yvjennig/.cache/matplotlib/fontlist-v330.json


In [2]:
# Load the time-gridded data: one CSV per parameter, keyed by parameter name.
TIME_GRIDDED_SUFFIX = "_time_gridded.csv"


def param_name_from_path(filepath):
    """Return the parameter name encoded in a time-gridded CSV path.

    The directory part and the exact ``_time_gridded.csv`` suffix are removed,
    e.g. ``"data/P_OXYGEN_time_gridded.csv"`` -> ``"P_OXYGEN"``.

    ``str.lstrip``/``str.rstrip`` are deliberately not used: they strip *sets
    of characters*, not a prefix/suffix, and would mangle any parameter name
    that ends in one of the suffix's characters (``P_nitrite`` -> ``P_n``).
    """
    # Accept both "/" and "\\" separators so the result does not depend on the OS.
    filename = filepath.replace("\\", "/").rsplit("/", 1)[-1]
    if filename.endswith(TIME_GRIDDED_SUFFIX):
        filename = filename[:-len(TIME_GRIDDED_SUFFIX)]
    return filename


tables = {}
for filepath in glob.glob("data/P_*" + TIME_GRIDDED_SUFFIX):
    param = param_name_from_path(filepath)
    print(param)
    tables[param] = pd.read_csv(filepath)

P_SILICATE
P_OXYGEN
P_PHOSPHATE
P_NITRATE
P_SALINITY
P_TEMPERATURE


In [3]:
# Build one wide table: start from the first parameter table and outer-join
# every remaining table on the shared coordinate columns.
merge_keys = ["LATITUDE", "LONGITUDE", "LEV_M", "DATEANDTIME"]
param_names = list(tables)

df_wide = tables[param_names[0]]
for name in param_names[1:]:
    df_wide = df_wide.merge(tables[name], on=merge_keys, how="outer")

# Parse the timestamp column into real datetimes.
df_wide["DATEANDTIME"] = pd.to_datetime(df_wide["DATEANDTIME"])

In [4]:
# Inspect the column dtypes of the merged wide table.
df_wide.dtypes

LATITUDE                float64
LONGITUDE               float64
LEV_M                   float64
DATEANDTIME      datetime64[ns]
P_SILICATE              float64
P_OXYGEN                float64
P_PHOSPHATE             float64
P_NITRATE               float64
P_SALINITY              float64
P_TEMPERATURE           float64
dtype: object

In [5]:
# Number of rows in the merged wide table.
len(df_wide)

131496

In [6]:
# Derive a numeric year feature from the timestamp.
# The timestamp is shifted forward by one day before the year is read
# (presumably so end-of-period stamps fall into the intended year -- TODO confirm).
shifted_time = df_wide["DATEANDTIME"] + pd.Timedelta(days=1)
df_wide["year"] = shifted_time.dt.year.astype(int)

# The cyclic month encoding is currently disabled, so no "month", "month0"
# or "month1" columns are created:
# df_wide["month"] = (df_wide["DATEANDTIME"] + pd.Timedelta(days=1)).dt.month
# df_wide["month0"] = np.sin(2*np.pi*df_wide["month"]/12)
# df_wide["month1"] = np.cos(2*np.pi*df_wide["month"]/12)

# The raw datetime column is no longer needed once the year is extracted.
df_wide = df_wide.drop(columns=["DATEANDTIME"])

In [7]:
# List the distinct longitude values present in the table.
df_wide["LONGITUDE"].unique()

array([-50.5, -49.5, -48.5, -47.5, -46.5, -45.5, -44.5, -43.5, -42.5,
       -41.5, -40.5, -39.5, -38.5, -37.5, -36.5, -35.5, -34.5, -33.5,
       -32.5, -31.5, -30.5, -29.5, -28.5, -27.5, -26.5, -25.5, -24.5,
       -23.5, -22.5, -21.5, -20.5, -19.5, -18.5, -17.5, -16.5, -15.5,
       -14.5, -13.5, -12.5, -11.5, -10.5,  -9.5,  -8.5,  -7.5,  -6.5,
        -5.5,  -4.5,  -3.5,  -2.5,  -1.5,  -0.5,   0.5,   1.5,   2.5,
         3.5,   4.5,   5.5,   6.5,   7.5,   8.5,   9.5, -51.5, -52.5,
       -57.5, -56.5, -55.5, -54.5, -53.5, -58.5, -59.5, -76.5, -71.5,
       -60.5, -65.5, -64.5, -62.5, -61.5, -75.5, -74.5, -73.5, -70.5,
       -68.5, -67.5, -66.5, -63.5, -72.5, -69.5,  27.5,  19.5,  16.5,
        17.5,  18.5,  26.5,  28.5,  29.5,  15.5,  20.5,  23.5,  24.5,
        25.5,  11.5,  12.5,  13.5,  14.5,  21.5,  22.5,  10.5])

In [None]:
# Encode longitude as a point on the unit circle (sin/cos pair).
# The +180 offset shifts the values before they are converted to radians.
lon_angle = 2*np.pi*(180+df_wide["LONGITUDE"])/360
df_wide["LONGITUDE0"] = np.sin(lon_angle)
df_wide["LONGITUDE1"] = np.cos(lon_angle)

# The raw longitude is replaced by its two cyclic components.
df_wide = df_wide.drop(columns="LONGITUDE")

In [None]:
# Scale every column with a min-max scaler; the fitted scaler is kept so the
# scaling can be undone later.
# Alternatives considered: StandardScaler, RobustScaler.
scaler = MinMaxScaler().fit(df_wide)
scaled_values = scaler.transform(df_wide)
df_scaled = pd.DataFrame(scaled_values, columns=df_wide.columns)

In [None]:
# UMAP embedding of the scaled data.
# The fit_transform call below is left disabled because, per the original note,
# embedding is not possible on NaN values. As a result no `embedding` variable
# is created by this cell.
import umap
# embedding = umap.UMAP(min_dist=0.0, n_components=3, n_neighbors=20).fit_transform(df_scaled)

In [None]:
# 3-D scatter plot of the first three embedding components.
# pyplot is not imported in any of the visible cells, so bring it into scope here.
import matplotlib.pyplot as plt

# `embedding` only exists when the UMAP step has been run; that call is
# currently commented out, so skip the plot instead of raising NameError.
if "embedding" in globals():
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    # Tiny, almost transparent markers keep a dense point cloud readable.
    ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], alpha=0.1, zorder=4, s=0.1)
    plt.show(block=True)
else:
    print("No `embedding` available (UMAP step not run); skipping the 3-D scatter plot.")

In [None]:
# Total number of rows in the scaled table.
len(df_scaled)

In [None]:
# Number of rows that have no missing value in any column.
len(df_scaled.dropna())

In [None]:
# SOMPY imputation
# Train a 20x20 self-organising map on the scaled table, then look up the
# best-matching unit (BMU) information for every row.
mapsize = [20, 20]

som_options = dict(
    mapsize=mapsize,
    mask=None,
    mapshape='planar',
    lattice='hexa',
    normalization=None,  # the input has already been min-max scaled
    initialization='pca',
    neighborhood='gaussian',
    training='batch',
    name='sompy',
)
som = sompy.SOMFactory.build(df_scaled, **som_options)  # embedding df_scaled X

# Two training phases: a rough phase with a large neighbourhood radius,
# followed by a fine-tuning phase with a small one.
training_schedule = dict(
    train_rough_len=60,
    train_rough_radiusin=3,
    train_rough_radiusfin=1,
    train_finetune_len=150,
    train_finetune_radiusin=1.3,
    train_finetune_radiusfin=0.1,
)
# verbose='debug' will print more, and verbose=None wont print anything
som.train(n_job=1, verbose='info', **training_schedule)

# prediction
# Other arguments tried here: njb=1, nth=1, metric='euclidean'.
bmus = som.find_bmu(df_scaled)
# NOTE(review): row 0 is presumably the BMU index per sample, not imputed
# values -- confirm against the sompy documentation.
x_pred_som = bmus[0, :]

In [None]:
# Mean imputation: replace every NaN with the mean of its column.
from sklearn.impute import SimpleImputer

# Fit the per-column means on the scaled table ...
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df_scaled)

# ... and fill the gaps of that same table.
x_pred_mean = mean_imputer.transform(df_scaled)

In [None]:
# KNN imputation: fill missing values with a k-nearest-neighbours imputer.
from sklearn.impute import KNNImputer

# Default settings are used; alternative tried: n_neighbors=20, weights="distance"
knn_imputer = KNNImputer().fit(df_scaled)

# Impute the same table the model was fitted on and restore the column labels.
knn_values = knn_imputer.transform(df_scaled)
x_pred_knn = pd.DataFrame(knn_values, columns=df_scaled.columns)

In [None]:
# Timing note (presumably for the KNN imputation above): ran 11:45 - 12:05, i.e. about 20 minutes.

In [None]:
# TODO: implicit NN imputation (not implemented yet)

In [None]:
# Map the KNN-imputed values back to the original (unscaled) units using the
# scaler that was fitted before imputation.
unscaled_values = scaler.inverse_transform(x_pred_knn)
df_out = pd.DataFrame(unscaled_values, columns=df_scaled.columns)

In [None]:
# Export the KNN-imputed table to CSV without the month helper columns.
# "month", "month0" and "month1" only exist when the cyclic month encoding is
# enabled (it is currently commented out), so missing columns are ignored
# instead of raising KeyError.
a = df_out.drop(columns=["month", "month0", "month1"], errors="ignore")
a.to_csv("df_knn_3x20.csv", index=False)