In [71]:
import numpy as np
import pandas as pd

In [147]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import train_test_split

In [148]:
random_state = 100

In [171]:
data = pd.read_csv('Bias_correction_ucl.csv')
data['Date'] = pd.to_datetime(data['Date'])

data = data[data['Next_Tmax'].notnull() & data['Next_Tmin'].notnull()]
data = data[data['Date'].notnull() & data['station'].notnull()]
data = data[data['LDAPS_Tmax_lapse'].notnull() & data['LDAPS_Tmin_lapse'].notnull()]

y_Tmax = data['Next_Tmax']
y_Tmin = data['Next_Tmin']
X = data.drop(['Next_Tmax', 'Next_Tmin'], axis='columns')

In [172]:
X_train, X_test, y_train, y_test = train_test_split(data, y_Tmax, test_size=0.2, shuffle=True, random_state=random_state)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=100)

In [173]:
class MissingDataRowsDropper(BaseEstimator, TransformerMixin):
    
    def __init__(self, cols):
        if cols is None:
            cols = []
        self.cols = cols
        self.predicate = None
    
    def fit(self, X, y=None):
        if any([v not in X.columns for v in self.cols]):
            raise Exception("Wrong column name provided")
        self.predicate = X[self.cols].notnull().all(axis='columns')
        return self
    
    def transform(self, X, y=None):
        if self.predicate is None:
            raise Exception("Have not been fed before transformation")
        if y:
            return X[self.predicate], y[self.predicate]
        return X[self.predicate]
        

In [174]:
preprocessing = make_pipeline(
#     MissingDataRowsDropper(cols=['station', 'Date']), 
    make_column_transformer(
         (SimpleImputer(strategy='median'), ['Present_Tmax', 'Present_Tmin', 'LDAPS_RHmin',
       'LDAPS_RHmax', 'LDAPS_Tmax_lapse', 'LDAPS_Tmin_lapse', 'LDAPS_WS',
       'LDAPS_LH', 'LDAPS_CC1', 'LDAPS_CC2', 'LDAPS_CC3', 'LDAPS_CC4',
       'LDAPS_PPT1', 'LDAPS_PPT2', 'LDAPS_PPT3', 'LDAPS_PPT4', 'Next_Tmax',
       'Next_Tmin'])
    ),
    StandardScaler()
)

In [175]:
class BaseModel(BaseEstimator, ClassifierMixin):
    
    def __init__(self, to_predict):
        if to_predict not in ['Next_Tmax', 'Next_Tmin']:
            raise Exception('Unknown prediction type')
        self.to_predict = to_predict
    
    def fit(self, X, y):
        return self
    
    def predict(self, X):
        if self.to_predict == 'Next_Tmax':
            return X['LDAPS_Tmax_lapse']
        else:
            return X['LDAPS_Tmin_lapse']


In [176]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import Lasso, Ridge, ElasticNet

In [177]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=random_state)
models = {}

In [178]:
model = Pipeline([
    ('preprocessing', preprocessing), 
    ('regressor', Lasso())
])
params = {
    'regressor__alpha': [1e-5, 1e-4, 1e-3]
}

grid = GridSearchCV(
    estimator=model, 
    param_grid=params,
    n_jobs=-1,
    verbose=3,
    cv=k_fold
)
grid.fit(X_train, y_train)
print(grid.best_params_)
models['LR'] = grid.best_estimator_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'regressor__alpha': 0.0001}


# Models evaluation

In [179]:
from sklearn.metrics import mean_squared_error

In [180]:
models['base_model'] = BaseModel('Next_Tmax')

In [183]:
for name, model in models.items():
    print(name)
    print(f'mean_squared_error: {mean_squared_error(model.predict(X_test), y_test)}')

LR
mean_squared_error: 2.0281103717201048e-07
base_model
mean_squared_error: 3.6321580125711477
