In [1]:
import numpy as np
import pandas as pd

In [126]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [127]:
# Read the raw CSV and convert the 'Date' column to pandas datetimes in one
# chained expression, so `data` is only ever bound to the converted frame.
data = pd.read_csv('Bias_correction_ucl.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
# Preview the first five rows (the default for head()).
data.head()

Unnamed: 0,station,Date,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,...,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation,Next_Tmax,Next_Tmin
0,1.0,2013-06-30,28.7,21.4,58.255688,91.116364,28.074101,23.006936,6.818887,69.451805,...,0.0,0.0,0.0,37.6046,126.991,212.335,2.785,5992.895996,29.1,21.2
1,2.0,2013-06-30,31.9,21.6,52.263397,90.604721,29.850689,24.035009,5.69189,51.937448,...,0.0,0.0,0.0,37.6046,127.032,44.7624,0.5141,5869.3125,30.5,22.5
2,3.0,2013-06-30,31.6,23.3,48.690479,83.973587,30.091292,24.565633,6.138224,20.57305,...,0.0,0.0,0.0,37.5776,127.058,33.3068,0.2661,5863.555664,31.1,23.9
3,4.0,2013-06-30,32.0,23.4,58.239788,96.483688,29.704629,23.326177,5.65005,65.727144,...,0.0,0.0,0.0,37.645,127.022,45.716,2.5348,5856.964844,31.7,24.3
4,5.0,2013-06-30,31.4,21.9,56.174095,90.155128,29.113934,23.48648,5.735004,107.965535,...,0.0,0.0,0.0,37.5507,127.135,35.038,0.5055,5859.552246,31.2,22.5


In [128]:
class MissingDataRowsDropper(BaseEstimator, TransformerMixin):
    """Transformer that drops rows having a missing value in selected columns.

    Parameters
    ----------
    cols : list of str or None
        Names of the columns that must all be non-null for a row to be kept.
        ``None`` is treated as an empty list.

    Attributes
    ----------
    predicate : pandas.Series of bool or None
        Keep-mask (True = row is kept) computed on the frame passed to
        ``fit``. ``None`` until ``fit`` has been called.
    """

    def __init__(self, cols):
        if cols is None:
            cols = []
        self.cols = cols
        self.predicate = None

    def _keep_mask(self, X):
        """Return a boolean Series that is True for the rows of X to keep."""
        return X[list(self.cols)].notnull().all(axis='columns')

    def fit(self, X, y=None):
        """Validate the configured column names against X and record the mask.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns must include every name in ``cols``.
        y : ignored
            Accepted only for pipeline compatibility.

        Returns
        -------
        self

        Raises
        ------
        Exception
            If any configured column name is not a column of X.
        """
        if any(v not in X.columns for v in self.cols):
            raise Exception("Wrong column name provided")
        # Use the instance's own column list; the previous code read a bare
        # `cols`, which only resolved if a global of that name happened to
        # exist in the kernel and raised NameError otherwise.
        self.predicate = self._keep_mask(X)
        return self

    def transform(self, X, y=None):
        """Return X without the rows that have a null in any configured column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to filter; it must contain the configured columns.
        y : ignored
            Accepted only for pipeline compatibility.

        Returns
        -------
        pandas.DataFrame
            The rows of X whose configured columns are all non-null.

        Raises
        ------
        Exception
            If ``fit`` has not been called yet.
        """
        if self.predicate is None:
            raise Exception("Have not been fed before transformation")
        # Build the mask from the frame being transformed rather than reusing
        # the one stored by fit(): the stored mask describes the rows of the
        # fitted frame and does not apply to a different frame. For
        # fit_transform on a single frame the result is unchanged.
        return X[self._keep_mask(X)]


In [129]:
# Boolean mask over the columns: True where the column holds at least one
# missing value (isna is the alias of isnull; any() reduces over rows by default).
has_missing = data.isna().any()
data.columns[has_missing]

Index(['station', 'Date', 'Present_Tmax', 'Present_Tmin', 'LDAPS_RHmin',
       'LDAPS_RHmax', 'LDAPS_Tmax_lapse', 'LDAPS_Tmin_lapse', 'LDAPS_WS',
       'LDAPS_LH', 'LDAPS_CC1', 'LDAPS_CC2', 'LDAPS_CC3', 'LDAPS_CC4',
       'LDAPS_PPT1', 'LDAPS_PPT2', 'LDAPS_PPT3', 'LDAPS_PPT4', 'Next_Tmax',
       'Next_Tmin'],
      dtype='object')

In [130]:
# Columns whose missing values are filled with the column median.
median_imputed_cols = [
    'Present_Tmax', 'Present_Tmin',
    'LDAPS_RHmin', 'LDAPS_RHmax',
    'LDAPS_Tmax_lapse', 'LDAPS_Tmin_lapse',
    'LDAPS_WS', 'LDAPS_LH',
    'LDAPS_CC1', 'LDAPS_CC2', 'LDAPS_CC3', 'LDAPS_CC4',
    'LDAPS_PPT1', 'LDAPS_PPT2', 'LDAPS_PPT3', 'LDAPS_PPT4',
    'Next_Tmax', 'Next_Tmin',
]

# Preprocessing steps, applied in order:
#   1. drop rows lacking a 'station' or 'Date' value,
#   2. median-impute the columns listed above,
#   3. standardize the resulting columns.
# NOTE(review): make_column_transformer is called without `remainder`, so with
# scikit-learn's documented default only the listed columns should reach the
# scaler -- confirm that leaving out lat/lon/DEM/Slope/Solar radiation is intended.
row_dropper = MissingDataRowsDropper(cols=['station', 'Date'])
median_imputer = make_column_transformer(
    (SimpleImputer(strategy='median'), median_imputed_cols)
)
preprocessing = make_pipeline(row_dropper, median_imputer, StandardScaler())

In [131]:
# Fit the preprocessing pipeline on the loaded frame and transform that same
# frame; the bare name on the last line displays the resulting array.
preprocessed = preprocessing.fit_transform(data)
preprocessed

array([[-0.36218226, -0.76194411,  0.10377578, ..., -0.22316574,
        -0.37725684, -0.69932289],
       [ 0.72133242, -0.67854922, -0.30707346, ..., -0.22316574,
         0.07177469, -0.17485675],
       [ 0.61975292,  0.03030732, -0.55204333, ..., -0.22316574,
         0.26421677,  0.38995294],
       ...,
       [-2.19061328, -2.55493417, -2.05694359, ..., -0.22316574,
        -0.63384629, -1.94997292],
       [-2.19061328, -2.30474951, -2.24210996, ..., -0.22316574,
        -0.53762524, -1.66756807],
       [-2.22447311, -2.42984184, -2.31805169, ..., -0.22316574,
        -0.79421469, -2.23237776]])