In [320]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays.
set_config(transform_output="pandas")

# Load the pre-split train/test CSVs for the "DVORANA TABOR" station.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/train_test/DVORANA TABOR/train.csv',
        '../data/processed/train_test/DVORANA TABOR/test.csv',
    )
]

In [321]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives lag/rolling features from bike-stand data.

    The input frame must contain the columns ``date``, ``temperature``,
    ``dew_point``, ``apparent_temperature``, ``surface_pressure`` and
    ``available_bike_stands``. Rows are ordered chronologically before the
    sequential features are computed, and ``date`` is dropped from the result.

    Parameters
    ----------
    window_size : int, default=7
        Number of consecutive rows passed to ``Series.rolling`` for the
        rolling mean and rolling standard deviation of
        ``available_bike_stands``.
    """

    # Input columns copied unchanged into the output, in output order.
    _PASSTHROUGH_COLUMNS = (
        'temperature',
        'dew_point',
        'apparent_temperature',
        'surface_pressure',
        'available_bike_stands',
    )

    def __init__(self, window_size=7):
        # scikit-learn's get_params/clone expect each constructor argument to
        # be stored unmodified on an attribute of the same name.
        self.window_size = window_size

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, X):
        """Return the engineered feature frame for ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the columns listed in the class docstring. It is
            not modified.

        Returns
        -------
        pandas.DataFrame
            Rows sorted by ``date`` (original index labels are kept) with the
            columns: the five pass-through columns followed by
            ``lagged_available_bike_stands``, ``rolling_mean_bike_stands``,
            ``rolling_std_bike_stands``, ``diff_available_bike_stands`` and
            ``temperature_diff``. The leading rows of the lag, rolling and
            diff columns are NaN.
        """
        # ``assign`` works on a copy, so the caller's frame is left untouched.
        # A stable sort keeps rows that share a timestamp in their input
        # order, so the lag/rolling/diff features do not depend on an
        # arbitrary tie-break.
        ordered = (
            X.assign(date=pd.to_datetime(X['date']))
             .sort_values(by='date', kind='stable')
        )

        stands = ordered['available_bike_stands']
        rolling = stands.rolling(window=self.window_size)

        # NOTE(review): the rolling statistics and the diff include the
        # current row's ``available_bike_stands``. If that same row's value is
        # the prediction target, these features leak it -- confirm how the
        # downstream windowing consumes them.
        return ordered[list(self._PASSTHROUGH_COLUMNS)].assign(
            lagged_available_bike_stands=stands.shift(1),
            rolling_mean_bike_stands=rolling.mean(),
            rolling_std_bike_stands=rolling.std(),
            diff_available_bike_stands=stands.diff(),
            temperature_diff=ordered['temperature'] - ordered['apparent_temperature'],
        )

In [322]:
from sklearn.impute import SimpleImputer

# Column groups, keyed by the scaler applied to each group.
standard_features = [
    'temperature',
    'dew_point',
    'apparent_temperature',
    'surface_pressure',
]
minmax_features = [
    'lagged_available_bike_stands',
    'rolling_mean_bike_stands',
    'rolling_std_bike_stands',
    'diff_available_bike_stands',
    'temperature_diff',
]
target_feature = ['available_bike_stands']

standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Only the two listed groups are scaled; target_feature is in neither group.
column_transformer = ColumnTransformer(transformers=[
    ('standard_scaler', standard_scaler, standard_features),
    ('minmax_scaler', minmax_scaler, minmax_features),
])

# Order matters: the engineered lag/rolling/diff columns start with NaN rows,
# which are mean-imputed before the scalers see them.
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineer()),
    ('imputer', SimpleImputer(strategy='mean')),
    ('column_transformer', column_transformer),
])

In [327]:
# Fit the pipeline on the training split, then re-attach the unscaled target
# column(s); pandas aligns them to the transformed rows by index label.
df = pipeline.fit_transform(train).assign(
    **{name: train[name] for name in target_feature}
)
df.head()

Unnamed: 0,standard_scaler__temperature,standard_scaler__dew_point,standard_scaler__apparent_temperature,standard_scaler__surface_pressure,minmax_scaler__lagged_available_bike_stands,minmax_scaler__rolling_mean_bike_stands,minmax_scaler__rolling_std_bike_stands,minmax_scaler__diff_available_bike_stands,minmax_scaler__temperature_diff,available_bike_stands
0,-1.118476,-1.115176,-1.108914,-1.085295,0.4,0.3745,0.247898,0.421053,0.888889,11
1,-1.118476,-1.115176,-1.108914,-1.085295,0.526316,0.3745,0.247898,0.368421,0.888889,10
2,-1.118476,-1.115176,-1.108914,-1.085295,0.473684,0.3745,0.247898,0.473684,0.888889,11
3,-1.118476,-1.115176,-1.108914,-1.085295,0.526316,0.3745,0.247898,0.473684,0.888889,12
4,-1.118476,-1.115176,-1.108914,-1.085295,0.578947,0.3745,0.247898,0.526316,0.888889,14


In [324]:
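# Commented-out function form of FeatureEngineer.transform (not executed).
# Unlike the class, it mutates its input frame and keeps the 'date' column.
# def create_features(df):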
#     df['date'] = pd.to_datetime(df['date'])
#     df.sort_values(by='date', inplace=True)
#     df['time_of_day'] = df['date'].dt.hour // 6
# 
#     df['day_of_week'] = df['date'].dt.dayofweek
#     df = pd.get_dummies(df, columns=['time_of_day', 'day_of_week'], drop_first=True)
# 
#     df['lagged_available_bike_stands'] = df['available_bike_stands'].shift(1)
# 
#     window_size = 7
#     df['rolling_mean_bike_stands'] = df['available_bike_stands'].rolling(window=window_size).mean()
# 
#     df['rolling_std_bike_stands'] = df['available_bike_stands'].rolling(window=window_size).std()
# 
#     df['diff_available_bike_stands'] = df['available_bike_stands'].diff()
# 
#     df['temperature_diff'] = df['temperature'] - df['apparent_temperature']
# 
#     df = df[['date', 'temperature', 'dew_point', 'apparent_temperature',
#              'surface_pressure', 'available_bike_stands',
#              'lagged_available_bike_stands', 'rolling_mean_bike_stands',
#              'rolling_std_bike_stands', 'diff_available_bike_stands',
#              'temperature_diff']]
# 
#     return df
