In [1]:
import os

import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import sgpp, sgml, sgutil
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Polars expressions keyed by the name of the column they produce. The dict is
# handed to sgpp.ExprProcessor further down, and its keys are also reused as
# the list of generated columns to scale.
lag_sources = ['sunshine', 'cloud', 'dewpoint', 'chp', 'cos_wd', 'sin_wd']
ma_sources = ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd', 'sin_wd', 'chp']

expr_dic = {}
# Lag features: '<col>_<k>' is <col> shifted by k rows for k = 1..6; the nulls
# the shift leaves behind are back-filled.
expr_dic.update({
    f'{src}_{lag}': pl.col(src).shift(lag).fill_null(strategy = 'backward')
    for src in lag_sources
    for lag in range(1, 7)
})
# Rolling means: '<col>_ma24' is the mean of <col> over a '24i' window indexed
# by 'id' with closed='left'; remaining nulls are back-filled.
expr_dic.update({
    f'{src}_ma24': pl.mean(src).rolling(index_column = 'id', period = '24i', closed = 'left').fill_null(strategy = 'backward')
    for src in ma_sources
})

# Stage 1: read the CSVs with polars, forward-fill the wind columns and derive
# the per-row features that need no neighbouring rows.
wind_rad = pl.col('winddirection') / 180 * np.pi   # degrees -> radians
day_idx = pl.col('id') % 365                        # id folded into a 365-row cycle

forward_fill_wind = sgpp.ExprProcessor({
    name: pl.col(name).fill_null(strategy = 'forward')
    for name in ('winddirection', 'windspeed')
})
derive_row_features = sgpp.ExprProcessor({
    # Wind direction projected on two axes, scaled by wind speed.
    'cos_wd': wind_rad.cos() * pl.col('windspeed'),
    'sin_wd': wind_rad.sin() * pl.col('windspeed'),
    'chp': pl.col('cloud') * pl.col('humidity') / pl.col('pressure'),
    'expected_day': day_idx + 1,
    # Half a sine period per 365-row cycle.
    'sin_ed': (day_idx / 365 * np.pi).sin(),
    'year': pl.col('id') // 365,
})

p1 = make_pipeline(
    sgpp.PolarsProcessor(predefined_types = {'id': pl.Int64}),
    forward_fill_wind,
    derive_row_features,
)
df_train = p1.fit_transform(['data/train.csv'])
df_test = p1.transform(['data/test.csv'])
# Stage 2: add the lag / rolling-mean columns from expr_dic, then convert to a
# pandas frame indexed by 'id'. Train and test are concatenated first, so the
# shifted and rolling expressions are evaluated over the combined frame.
neighbour_features = sgpp.ExprProcessor(expr_dic)
to_pandas = sgpp.PandasCoverter(index_col = 'id')
p2 = make_pipeline(neighbour_features, to_pandas)

train_and_test = pl.concat([df_train, df_test], how = 'align')
df_all = p2.fit_transform(train_and_test)

# Stage 3: scaling. The scalers are fitted on the combined train+test frame.
raw_cols = ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed']
derived_cols = ['cos_wd', 'sin_wd', 'chp']
standardized_cols = raw_cols + derived_cols + list(expr_dic.keys())
minmax_cols = ['year', 'expected_day', 'winddirection']

standardize = sgpp.ApplyWrapper(
    StandardScaler().set_output(transform='pandas'),
    standardized_cols,
)
rescale = sgpp.ApplyWrapper(
    MinMaxScaler().set_output(transform='pandas'),
    minmax_cols,
)
p3 = make_pipeline(standardize, rescale)
df_all = p3.fit_transform(df_all)

# Rows with a 'rainfall' value form the training set; rows without one form the
# test set, which drops the (all-missing) label column.
has_label = df_all['rainfall'].notna()
df_train = df_all.loc[has_label]
df_test = df_all.loc[~has_label].drop(columns = ['rainfall'])

# Build df_org from data/Rainfall.csv: same derived columns as the train/test
# frames, then scaled with the already-fitted p3 (transform only, no refit).

# Step 1: read the CSV and strip surrounding whitespace from the column names.
df_org = make_pipeline(
    sgpp.PolarsProcessor(predefined_types = {'id': pl.Int64, 'day': pl.Int16, 'rainfall': pl.String}),
).fit_transform(['data/Rainfall.csv']).rename(
    lambda x: x.strip()
)
# Step 2: feature engineering, mirroring p1 + p2.
df_org = make_pipeline(
    sgpp.ExprProcessor({
        'winddirection': pl.col('winddirection').fill_null(strategy = 'forward'),
        'windspeed': pl.col('windspeed').fill_null(strategy = 'forward')
    }),
    sgpp.ExprProcessor({
        # Row counter 1..n; the rolling expressions in expr_dic use 'id' as
        # their index column.
        'id': pl.arange(1, pl.col('day').len() + 1),
        # NOTE(review): p1 computes sin_ed from (id % 365), i.e. 0-based and
        # wrapped at 365, whereas this uses 1..n with no modulo. Confirm the
        # one-row offset / missing wrap is intended.
        'sin_ed': (pl.arange(1, pl.col('day').len() + 1) / 365 * np.pi).sin(),
        'cos_wd': (pl.col('winddirection') / 180 * np.pi).cos() * pl.col('windspeed'),
        'sin_wd': (pl.col('winddirection') / 180 * np.pi).sin() * pl.col('windspeed'),
        'chp': pl.col('cloud') * pl.col('humidity') / pl.col('pressure'),
        # Label arrives as text: 'yes' -> 1, 'no' -> 0, then cast to Int8.
        'rainfall': pl.col('rainfall').replace({'yes': 1, 'no': 0}).cast(pl.Int8),
    }),
    sgpp.ExprProcessor(expr_dic),
    # No index_col here, unlike p2. NOTE(review): the steps below presumably
    # rely on the resulting pandas index being the default 0..n-1 - confirm
    # against sgpp.PandasCoverter.
    sgpp.PandasCoverter(), 
).fit_transform(df_org).assign(
    # Derived from the pandas index (index + 1) rather than from 'day' or 'id'.
    expected_day = lambda x: x.index + 1,
    # Constant year marker for these rows.
    year = -1
).pipe(
    # Re-index as -(n - index); with a 0-based index that is -n..-1.
    # Presumably this keeps these rows from colliding with train/test ids - confirm.
    lambda x: x.set_index(-(len(x) - x.index))
)
# Apply the scalers that were fitted on df_all above.
df_org = p3.transform(df_org)

# Name of the label column used by the scoring / training configuration.
target = "rainfall"
# Project helper that drives the cross-validation runs below; it is constructed
# from the three names 'img', 'result' and 'model'.
sc = sgutil.SGCache(
    "img",
    "result",
    "model",
)

In [3]:
# Chain the three fitted preprocessing stages into one pipeline and persist it
# so the same transformation can be reloaded elsewhere.
data_processor = make_pipeline(
    p1, p2, p3
)
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs('model', exist_ok = True)
joblib.dump(data_processor, 'model/data_processor.joblib')

['model/data_processor.joblib']

In [4]:
# Self-contained preprocessing pipeline for data/Rainfall.csv, saved together
# with the fitted scaling stage p3.
org_row_count = pl.col('day').len()
org_day_seq = pl.arange(1, org_row_count + 1)            # 1..n
org_wind_rad = pl.col('winddirection') / 180 * np.pi     # degrees -> radians

data_processor_org = make_pipeline(
    sgpp.PolarsProcessor(predefined_types = {'id': pl.Int64, 'day': pl.Int16, 'rainfall': pl.String}),
    sgpp.ColumnNameCleaner(),
    sgpp.ExprProcessor({
        col: pl.col(col).fill_null(strategy = 'forward')
        for col in ('winddirection', 'windspeed')
    }),
    sgpp.ExprProcessor({
        # Negative ids counting up from -n to -1.
        'id': -pl.arange(org_row_count, 0, -1),
        'expected_day': org_day_seq,
        'sin_ed': (org_day_seq / 365 * np.pi).sin(),
        'cos_wd': org_wind_rad.cos() * pl.col('windspeed'),
        'sin_wd': org_wind_rad.sin() * pl.col('windspeed'),
        'chp': pl.col('cloud') * pl.col('humidity') / pl.col('pressure'),
        # Text label 'yes' / 'no' mapped to 1 / 0 and cast to Int8.
        'rainfall': pl.col('rainfall').replace({'yes': 1, 'no': 0}).cast(pl.Int8),
        'year': -1
    }),
    sgpp.ExprProcessor(expr_dic),
    sgpp.PandasCoverter(index_col = 'id'),
).fit(['data/Rainfall.csv'])

org_with_scaling = make_pipeline(data_processor_org, p3)
joblib.dump(org_with_scaling, 'model/data_processor_org.joblib')

['model/data_processor_org.joblib']

In [5]:
from sklearn.model_selection import train_test_split

def get_validation_splitter(validation_fraction, random_state = None):
    """Build a function that splits a dataset into train and validation parts.

    Parameters
    ----------
    validation_fraction :
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : optional
        Forwarded to ``train_test_split`` as ``random_state``. The default
        ``None`` keeps the previous unseeded behaviour; pass an int (for
        example through ``functools.partial``) to make the split reproducible.

    Returns
    -------
    callable
        ``split(x)`` returning whatever ``train_test_split`` returns for ``x``.
    """
    def split(x):
        return train_test_split(
            x, test_size = validation_fraction, random_state = random_state
        )
    return split

def include_org(df, include_org = False):
    """Optionally append the module-level ``df_org`` frame to ``df``.

    Returns ``df`` itself when ``include_org`` is falsy; otherwise returns
    ``pd.concat([df, df_org])``, i.e. the ``df`` rows followed by the
    ``df_org`` rows.
    """
    if not include_org:
        return df
    return pd.concat([df, df_org])

# score_func below calls roc_auc_score, which the notebook's import cell does
# not bring into scope; without this import the first scoring call raises
# NameError.
from sklearn.metrics import roc_auc_score

# Settings passed to sc.train_cv for every cross-validation run. How each key
# is consumed is defined by the project's sgml / sgutil modules.
config = {
    # Positive-class probability (column 1 of predict_proba) as a Series that
    # keeps the index of the input frame.
    'predict_func': lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index = df.index),
    # ROC AUC of the predictions against the label column.
    'score_func': lambda df, prds: roc_auc_score(df[target], prds),
    # Factory: called with a validation fraction, returns the splitter.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    # Hook that can append df_org to the training data.
    'train_data_proc': include_org,
    'y': target,
}

In [6]:
# Run training for every entry that sc.get_cv_list() reports, using the
# training frame and the shared config.
for cv_entry in sc.get_cv_list():
    sc.train_cv(cv_entry, df_train, config)