In [None]:
# determining regions from lat/lon using DBSCAN clustering
from sklearn.cluster import DBSCAN

# Wrap longitudes into the [-180, 180) range. The expression is idempotent,
# so re-running this cell leaves already-wrapped values unchanged.
df['lon'] = (df['lon'] + 180) % 360 - 180

# Density-based clustering of the (lat, lon) points. The haversine metric
# works on angles in radians, hence the conversion below; eps is 50/6371
# radians (presumably a 50 km radius over a 6371 km Earth radius -- confirm).
dbscan = DBSCAN(
    eps=50/6371.,
    min_samples=20,
    algorithm='ball_tree',
    metric='haversine',
)
dbscan.fit(np.radians(df[['lat', 'lon']]))

# One cluster label per row becomes the region id (DBSCAN uses -1 for noise).
df['region'] = dbscan.labels_

# Show each label together with the number of rows assigned to it.
print(np.unique(dbscan.labels_, return_counts=True))

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def preprocess(data, train, standardize=True):
    """Turn raw rows into a model-ready feature matrix (and target when training).

    Steps, in order:

    1. Drop the ``PSL`` column.
    2. When ``train`` is true, drop duplicate rows.
    3. Derive calendar fields from the index.
    4. Add pairwise interaction features.
    5. Scale every remaining feature column separately within each
       ``region`` group.
    6. Encode month and day-of-month as sine/cosine pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Its index must expose ``.year``, ``.month`` and ``.day`` (e.g. a
        ``DatetimeIndex``). Required columns: ``PSL``, ``U850``, ``V850``,
        ``TMQ``, ``T200``, ``T500``, ``TS``, ``TREFHT``, ``lat``, ``lon``,
        ``region``, plus ``Label`` when ``train`` is true. The caller's frame
        is not modified.
    train : bool
        If true, duplicates are removed and ``Label`` is split off as the
        target.
    standardize : bool, default True
        Use ``StandardScaler`` when true, ``MinMaxScaler`` otherwise.

    Returns
    -------
    X : pandas.DataFrame
        The scaled feature columns followed by ``sin_month``, ``cos_month``,
        ``sin_day`` and ``cos_day``, on a fresh ``RangeIndex``. ``lat``,
        ``lon``, ``region`` and the raw calendar fields are not part of the
        output.
    y : pandas.Series
        Only returned when ``train`` is true. It keeps the index of the
        (de-duplicated) input, so it matches ``X`` by position, not by label.

    Notes
    -----
    The scaler is fit on whatever frame is passed in, including when
    ``train`` is false.
    """
    # `drop` returns a new frame, so all later assignments stay local.
    # PSL is removed because its values are almost the same as PS.
    data = data.drop('PSL', axis=1)
    if train:
        # drop_duplicates compares column values only; the index is ignored.
        data.drop_duplicates(inplace=True)

    # Calendar fields taken from the index (after de-duplication, so they stay
    # aligned with the remaining rows).
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['day'] = data.index.day

    # Interaction features: wind components, then TMQ against each temperature.
    data['U850_V850_interaction'] = data['U850'] * data['V850']
    for other in ('T200', 'T500', 'TS', 'TREFHT'):
        data[f'TMQ_{other}_interaction'] = data['TMQ'] * data[other]

    if train:
        # Split the target off before any scaling happens.
        y = data['Label']
        X = data.drop(columns=['Label'])
    else:
        y = None
        X = data

    # Month/day are kept unscaled for the cyclical encoding below.
    calendar = X[['month', 'day']].reset_index(drop=True)

    # Position and raw calendar columns are excluded from scaling; `region`
    # stays because it is the grouping key (transform leaves it out of the result).
    X = X.drop(columns=['lat', 'lon', 'year', 'month', 'day'])

    scaler = StandardScaler() if standardize else MinMaxScaler()

    def _scale_column(x):
        # Refit on one column of one region: the scalers need a 2-D input,
        # and the result is flattened back to 1-D for pandas.
        return scaler.fit_transform(x.values[:, np.newaxis]).ravel()

    # Positional index so the calendar columns line up row-for-row.
    X = X.groupby('region').transform(_scale_column).reset_index(drop=True)

    # Cyclical encoding. Day-of-month uses period 31: with a period of 30,
    # day 31 would receive exactly the same sin/cos pair as day 1.
    days_per_cycle = 31
    X['sin_month'] = np.sin(2 * np.pi * calendar['month'] / 12)
    X['cos_month'] = np.cos(2 * np.pi * calendar['month'] / 12)
    X['sin_day'] = np.sin(2 * np.pi * calendar['day'] / days_per_cycle)
    X['cos_day'] = np.cos(2 * np.pi * calendar['day'] / days_per_cycle)

    if train:
        return X, y
    return X


In [None]:
# Figure defaults for the decomposition plot.
plt.rc('figure', figsize=(16, 12))
plt.rc('font', size=15)

# Rows of cluster 0 for 1996-1997. The clustering cell stores its labels in the
# lowercase 'region' column (also what preprocess() reads), so that name is
# used here; 'Region' would raise a KeyError on a fresh kernel.
data = df[(df['region'] == 0) & (df.index.year.isin([1996, 1997]))]

# Daily mean and daily standard deviation of every column.
mean_resampled_data = data.resample('D').mean()
std_resampled_data = data.resample('D').std()

# Ratio of daily mean to daily standard deviation (a scale-normalised signal,
# not a z-score).
standardized_data = mean_resampled_data / std_resampled_data

# Handle any missing data
standardized_data = standardized_data.interpolate(method='spline', order=3)

# Additive trend/seasonal/residual split of TMQ. No period is passed, so
# statsmodels has to infer it from the daily index frequency.
result = seasonal_decompose(standardized_data['TMQ'], model='additive')
fig = result.plot()

