In [1]:
import xarray as xr
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

from tqdm.notebook import tqdm

sns.set_theme('notebook')

In [6]:
data = pd.read_csv("data/csv_w_labels.csv")
data

Unnamed: 0,lon,lat,month,avg_evap,latent_heatflux_up,longwave_radiation_up,humidity,heatflux_down,sensible_heatflux_up,shortwave_radiation_down,air_temp,surface_temp,windspeed,climate_zone,hemisphere,mhw_label
0,0.5,-70.5,9,,,,,,,,,,,3,1,0.0
1,0.5,-70.5,10,,,,,,,,,,,3,1,1.0
2,0.5,-70.5,11,,,,,,,,,,,3,1,0.0
3,0.5,-70.5,12,23.200000,18.9,,3.05,,0.5,,-1.31,-1.33,9.24,3,1,0.0
4,0.5,-70.5,1,34.600002,28.2,,2.88,,-0.5,,-1.09,-1.14,10.37,3,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17464675,359.5,89.5,8,,,,,,,,,,,3,-1,0.0
17464676,359.5,89.5,9,,,,,,,,,,,3,-1,0.0
17464677,359.5,89.5,10,,,,,,,,,,,3,-1,0.0
17464678,359.5,89.5,11,,,,,,,,,,,3,-1,0.0


In [7]:
labels = data.pop("mhw_label")
data = data.fillna(-1)

In [8]:
labels

0           0.0
1           1.0
2           0.0
3           0.0
4           0.0
           ... 
17464675    0.0
17464676    0.0
17464677    0.0
17464678    0.0
17464679    0.0
Name: mhw_label, Length: 17464680, dtype: float64

In [10]:
X_train, X_test, y_train, y_test = \
    train_test_split(data, labels, test_size=0.3, shuffle=True, stratify=labels, random_state=1)

In [11]:
scoring = ['accuracy', 'precision', 'recall']
pipe = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', verbose=10))
scores = cross_validate(pipe, X_train, y_train, scoring=scoring, verbose=10, n_jobs=6)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)

print(scores)
print(report)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   5 | elapsed:   35.5s remaining:   53.3s
[Parallel(n_jobs=6)]: Done   3 out of   5 | elapsed:   35.6s remaining:   23.7s
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:   35.8s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:   35.8s finished


-- Epoch 1
Norm: 0.07, NNZs: 15, Bias: -1.010180, T: 12225276, Avg. loss: 0.178834
Total training time: 2.31 seconds.
-- Epoch 2
Norm: 0.04, NNZs: 15, Bias: -1.005622, T: 24450552, Avg. loss: 0.169059
Total training time: 4.65 seconds.
-- Epoch 3
Norm: 0.03, NNZs: 15, Bias: -1.004252, T: 36675828, Avg. loss: 0.168727
Total training time: 6.91 seconds.
-- Epoch 4
Norm: 0.02, NNZs: 15, Bias: -1.002871, T: 48901104, Avg. loss: 0.168590
Total training time: 9.36 seconds.
-- Epoch 5
Norm: 0.02, NNZs: 15, Bias: -1.001916, T: 61126380, Avg. loss: 0.168516
Total training time: 11.66 seconds.
-- Epoch 6
Norm: 0.01, NNZs: 15, Bias: -1.001622, T: 73351656, Avg. loss: 0.168468
Total training time: 13.82 seconds.
-- Epoch 7
Norm: 0.01, NNZs: 15, Bias: -1.001373, T: 85576932, Avg. loss: 0.168436
Total training time: 16.07 seconds.
Convergence after 7 epochs took 16.07 seconds


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'fit_time': array([28.0287807 , 28.25298381, 28.04872823, 28.23702693, 27.8318162 ]), 'score_time': array([3.82196021, 3.75885463, 3.80699921, 3.80572891, 3.71725917]), 'test_accuracy': array([0.91587105, 0.91587142, 0.91587142, 0.91587142, 0.91587101]), 'test_precision': array([0., 0., 0., 0., 0.]), 'test_recall': array([0., 0., 0., 0., 0.])}
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96   4798619
         1.0       0.00      0.00      0.00    440785

    accuracy                           0.92   5239404
   macro avg       0.46      0.50      0.48   5239404
weighted avg       0.84      0.92      0.88   5239404



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
pipe = make_pipeline(StandardScaler(), SGDClassifier(loss='log', verbose=10))
scores = cross_validate(pipe, X_train, y_train, scoring=scoring, verbose=10, n_jobs=6)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)

print(scores)
print(report)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   5 | elapsed:   36.9s remaining:   55.5s
[Parallel(n_jobs=6)]: Done   3 out of   5 | elapsed:   37.0s remaining:   24.6s
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:   37.2s remaining:    0.0s
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:   37.2s finished


-- Epoch 1
Norm: 2.17, NNZs: 15, Bias: -2.479033, T: 12225276, Avg. loss: 0.286669
Total training time: 2.94 seconds.
-- Epoch 2
Norm: 2.19, NNZs: 15, Bias: -2.444251, T: 24450552, Avg. loss: 0.282290
Total training time: 5.92 seconds.
-- Epoch 3
Norm: 2.17, NNZs: 15, Bias: -2.454253, T: 36675828, Avg. loss: 0.282215
Total training time: 8.86 seconds.
-- Epoch 4
Norm: 2.18, NNZs: 15, Bias: -2.434576, T: 48901104, Avg. loss: 0.282186
Total training time: 11.73 seconds.
-- Epoch 5
Norm: 2.17, NNZs: 15, Bias: -2.459658, T: 61126380, Avg. loss: 0.282168
Total training time: 14.79 seconds.
-- Epoch 6
Norm: 2.17, NNZs: 15, Bias: -2.460195, T: 73351656, Avg. loss: 0.282160
Total training time: 17.89 seconds.
-- Epoch 7
Norm: 2.17, NNZs: 15, Bias: -2.452158, T: 85576932, Avg. loss: 0.282150
Total training time: 20.87 seconds.
Convergence after 7 epochs took 20.87 seconds
{'fit_time': array([30.10480142, 29.87740898, 30.14259171, 30.10648012, 30.36809707]), 'score_time': array([3.7063787 , 3.78