In [85]:
import pandas as pd
from pandas.core.frame import DataFrame

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00514/Bias_correction_ucl.csv')

In [10]:
df['Date'] = pd.to_datetime(df['Date'])

### Pre-processing

In [82]:
df.isna().sum()

station              2
Date                 2
Present_Tmax        70
Present_Tmin        70
LDAPS_RHmin         75
LDAPS_RHmax         75
LDAPS_Tmax_lapse    75
LDAPS_Tmin_lapse    75
LDAPS_WS            75
LDAPS_LH            75
LDAPS_CC1           75
LDAPS_CC2           75
LDAPS_CC3           75
LDAPS_CC4           75
LDAPS_PPT1          75
LDAPS_PPT2          75
LDAPS_PPT3          75
LDAPS_PPT4          75
lat                  0
lon                  0
DEM                  0
Slope                0
Solar radiation      0
Next_Tmax           27
Next_Tmin           27
dtype: int64

In [96]:
def handle_missing_data(df: DataFrame) -> DataFrame:
    df_processed = df.copy()
    df_processed = df_processed.dropna(subset=['Next_Tmax', 'Date'])
    df_processed.reset_index(drop=True, inplace=True)
    # missing_cols = df.columns[df.isna().any()].tolist()

    df_date = df_processed['Date']
    df_processed = df_processed.drop(['Date'], axis=1)
    # df_missing = df[['enrollee_id'] + missing_cols]
    # df_non_missing = df.drop(missing_cols, axis=1)

    knn_imputer = KNNImputer(n_neighbors=1)

    X = np.round(knn_imputer.fit_transform(df_processed))

    df_processed = pd.DataFrame(X, columns = df_processed.columns)

    df_processed['Date'] = df_date

    return df_processed

In [97]:
df_processed = handle_missing_data(df)

In [98]:
X = df_processed.drop(['Date', 'Next_Tmin', 'Next_Tmax'], axis = 1)
Y = df_processed['Next_Tmax']

In [99]:
numeric_data = list(X.columns)
numeric_data.remove('station')
categoric_data = ['station']

In [100]:
transformers=[('cat_scale', OneHotEncoder(), categoric_data),
             ('num_scale', MinMaxScaler(), numeric_data)]

preprocessor = ColumnTransformer(transformers=transformers)

lgb = LGBMRegressor()

steps = [('preprocessor', preprocessor),
        ('lgb', lgb)]

lgb_model = Pipeline(steps=steps, verbose=1)
lgb_model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat_scale', OneHotEncoder(),
                                                  ['station']),
                                                 ('num_scale', MinMaxScaler(),
                                                  ['Present_Tmax',
                                                   'Present_Tmin',
                                                   'LDAPS_RHmin', 'LDAPS_RHmax',
                                                   'LDAPS_Tmax_lapse',
                                                   'LDAPS_Tmin_lapse',
                                                   'LDAPS_WS', 'LDAPS_LH',
                                                   'LDAPS_CC1', 'LDAPS_CC2',
                                                   'LDAPS_CC3', 'LDAPS_CC4',
                                                   'LDAPS_PPT1', 'LDAPS_PPT2',
                                                   'LDAPS_PPT3', 'LDAPS_PPT4',


In [101]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [102]:
lgb_model.fit(X_train, Y_train)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing lgb, total=   0.1s


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat_scale', OneHotEncoder(),
                                                  ['station']),
                                                 ('num_scale', MinMaxScaler(),
                                                  ['Present_Tmax',
                                                   'Present_Tmin',
                                                   'LDAPS_RHmin', 'LDAPS_RHmax',
                                                   'LDAPS_Tmax_lapse',
                                                   'LDAPS_Tmin_lapse',
                                                   'LDAPS_WS', 'LDAPS_LH',
                                                   'LDAPS_CC1', 'LDAPS_CC2',
                                                   'LDAPS_CC3', 'LDAPS_CC4',
                                                   'LDAPS_PPT1', 'LDAPS_PPT2',
                                                   'LDAPS_PPT3', 'LDAPS_PPT4',


In [103]:
Y_pred = lgb_model.predict(X_test)

In [104]:
r2_score(Y_test, Y_pred)

0.8778592987148331