Column transformations
---

In [None]:
import pandas as pd

# Load the bike-sharing data from a CSV file located next to the notebook
# and preview its first rows.
data_df = pd.read_csv('bike-sharing.csv')
data_df.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create an encoder with default settings and one-hot encode *every* column
# of the frame, numeric ones included.
# NOTE(review): with the defaults the displayed result is presumably a SciPy
# sparse matrix, which is why later cells pass sparse=False - confirm.
encoder = OneHotEncoder()
encoder.fit_transform(data_df)

In [None]:
# Create the encoder again, this time with sparse=False so the encoded values
# come back as a regular array, and keep the result in `encoded` for inspection.
encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(data_df)
encoded

In [None]:
# Shape of the array obtained by one-hot encoding every column of the frame
encoded.shape

In [None]:
from sklearn.compose import ColumnTransformer

# Columns treated as categorical, and the one-hot encoder applied to them
cat_columns = ['yr', 'workingday', 'holiday', 'weekday', 'season', 'weathersit']
cat_transformer = OneHotEncoder(sparse=False)

# Column transformer: encode the categorical columns and pass every other
# column through unchanged
preprocessor = ColumnTransformer(
    transformers=[('categorical', cat_transformer, cat_columns)],
    remainder='passthrough',
)

In [None]:
# Fit the column transformer on the full frame and keep the transformed values
encoded = preprocessor.fit_transform(data_df)
encoded

In [None]:
# Summarise the transformed output: dimensions, container type and element dtype
for label, value in (
    ('Shape:', encoded.shape),
    ('Type:', type(encoded)),
    ('Data type:', encoded.dtype),
):
    print(label, value)

In [None]:
# Ask the stand-alone `cat_transformer` object for its feature names. `fit` was
# never called on it directly in this notebook, so the call is expected to fail;
# the error is caught and its message displayed as the demonstration.
# NOTE(review): presumably ColumnTransformer fits its own copy of the encoder - confirm.
try:
    cat_transformer.get_feature_names()
except Exception as err:
    print(err)

In [None]:
# Inspect the transformers held by the fitted column transformer; they are
# looked up by the names given at construction (e.g. 'categorical').
preprocessor.named_transformers_

In [None]:
# Feature names reported by the encoder stored under 'categorical' inside the
# fitted column transformer.
preprocessor.named_transformers_['categorical'].get_feature_names()

In [None]:
# Build a one-row sample from the first record, with its weather category
# replaced by 'storm' (presumably a value absent from the data - verify).
new_data = data_df.head(1).assign(weathersit='storm')
new_data

In [None]:
# Transform the modified sample with the already-fitted preprocessor; any
# error raised is caught and its message printed instead of a traceback.
try:
    preprocessor.transform(new_data)
except Exception as err:
    print(err)

In [None]:
# One-hot encoder configured with handle_unknown='ignore', so the sample with
# the modified weather category can be transformed
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Rebuild the column transformer around the new encoder, refit it on the full
# frame, then transform the modified sample
preprocessor = ColumnTransformer(
    transformers=[('categorical', cat_transformer, cat_columns)],
    remainder='passthrough',
)
preprocessor.fit_transform(data_df)
preprocessor.transform(new_data)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Handle ordinal variables: the weather categories are listed explicitly so the
# encoder uses this order rather than one inferred from the data
weathersit_order = ['clear', 'cloudy', 'rainy']
ord_columns = ['weathersit']
ord_transformer = OrdinalEncoder(categories=[weathersit_order])

In [None]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
import numpy as np

def add_powers(X):
    """Stack the input with its element-wise square and cube.

    Defined as a named function rather than a lambda so that the
    FunctionTransformer wrapping it (and any pipeline containing it) can be
    pickled, which is not possible with a lambda.

    Parameters
    ----------
    X : numpy.ndarray
        Array supporting element-wise ``**``; in the pipeline below this is
        the output of the scaler step.

    Returns
    -------
    numpy.ndarray
        ``X``, ``X**2`` and ``X**3`` concatenated column-wise.
    """
    return np.c_[X, X**2, X**3]


# Add polynomial features: standardise the columns first, then append their
# squares and cubes
poly_columns = ['temp', 'hum', 'windspeed']
poly_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', FunctionTransformer(add_powers))
])

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Fit a degree-3 polynomial expansion (without the bias column) on the same
# columns used by `poly_transformer`, then list the features it generates
polyfeat = PolynomialFeatures(degree=3, include_bias=False).fit(data_df[poly_columns])
polyfeat.get_feature_names()

In [None]:
# Create the column transformer combining the three transformers; columns not
# listed in any of them are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', cat_transformer, cat_columns),
        ('ordinal', ord_transformer, ord_columns),
        ('poly', poly_transformer, poly_columns),
    ],
    remainder='drop',
)

# Fit on the full frame and check the size of the resulting feature matrix
encoded = preprocessor.fit_transform(data_df)
encoded.shape

In [None]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Create Pipeline: the column preprocessing followed by a linear regression
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE

# Split into train/test sets: 'casual' is the target, every other column is a
# candidate feature; 30% of the rows are held out with a fixed seed
y = data_df['casual']
X = data_df.drop(columns='casual')
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit the pipeline on the training rows, then report the mean absolute error
# of its predictions on the held-out rows
pipe.fit(X_tr, y_tr)
y_pred = pipe.predict(X_te)
print('MAE: {:.0f}'.format(MAE(y_te, y_pred)))