Column transformations
---

In [1]:
import pandas as pd

# Read the bike-sharing CSV (path is relative to the working directory) and
# preview its first five rows.
data_df = pd.read_csv(filepath_or_buffer='bike-sharing.csv')
data_df.head(n=5)

Unnamed: 0,temp,hum,windspeed,yr,workingday,holiday,weekday,season,weathersit,casual
0,0.344,0.806,0.16,2011,no,no,6,spring,cloudy,331
1,0.363,0.696,0.249,2011,no,no,0,spring,cloudy,131
2,0.196,0.437,0.248,2011,yes,no,1,spring,clear,120
3,0.2,0.59,0.16,2011,yes,no,2,spring,clear,108
4,0.227,0.437,0.187,2011,yes,no,3,spring,clear,82


In [2]:
from sklearn.preprocessing import OneHotEncoder

# Naive baseline: one-hot encode every column of the frame, numeric columns
# included. The default output is a SciPy sparse matrix.
encoder = OneHotEncoder()
encoder.fit(data_df).transform(data_df)

<731x1714 sparse matrix of type '<class 'numpy.float64'>'
	with 7310 stored elements in Compressed Sparse Row format>

In [3]:
# Same naive encoding of the whole frame, but request a dense NumPy array
# instead of a sparse matrix so the result can be inspected directly.
encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit(data_df).transform(data_df)
encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [4]:
# (rows, one-hot columns) obtained by encoding every column of data_df
encoded.shape

(731, 1714)

In [5]:
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode, and the encoder that will handle them
# (dense output so the result is a plain NumPy array).
cat_columns = ['yr', 'workingday', 'holiday', 'weekday', 'season', 'weathersit']
cat_transformer = OneHotEncoder(sparse=False)

# Route only the categorical columns through the encoder; every other column
# is copied to the output unchanged.
preprocessor = ColumnTransformer(
    transformers=[('categorical', cat_transformer, cat_columns)],
    remainder='passthrough',
)

In [6]:
# Fit the column transformer on the full frame and encode it. The one-hot
# columns come first; the passed-through columns (which here still include
# the `casual` target) follow at the end of each row.
encoded = preprocessor.fit_transform(data_df)
encoded

array([[1.00e+00, 0.00e+00, 1.00e+00, ..., 8.06e-01, 1.60e-01, 3.31e+02],
       [1.00e+00, 0.00e+00, 1.00e+00, ..., 6.96e-01, 2.49e-01, 1.31e+02],
       [1.00e+00, 0.00e+00, 0.00e+00, ..., 4.37e-01, 2.48e-01, 1.20e+02],
       ...,
       [0.00e+00, 1.00e+00, 1.00e+00, ..., 7.53e-01, 1.24e-01, 1.59e+02],
       [0.00e+00, 1.00e+00, 1.00e+00, ..., 4.83e-01, 3.51e-01, 3.64e+02],
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 5.78e-01, 1.55e-01, 4.39e+02]])

In [7]:
# Summarise the encoded output: dimensions, container type and element dtype
for _label, _value in (
    ('Shape:', encoded.shape),
    ('Type:', type(encoded)),
    ('Data type:', encoded.dtype),
):
    print(_label, _value)

Shape: (731, 24)
Type: <class 'numpy.ndarray'>
Data type: float64


In [8]:
# The encoder object that was handed to the ColumnTransformer has not been
# fitted itself (the fitted one is reachable through
# preprocessor.named_transformers_), so asking it for feature names is expected
# to fail. Catch only that specific failure: any unrelated error should surface
# as a traceback rather than be printed as if it were the demonstration.
from sklearn.exceptions import NotFittedError

try:
    cat_transformer.get_feature_names()
except NotFittedError as e:
    print(e)

This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.


In [9]:
# Fitted transformers held by the ColumnTransformer, keyed by the names given
# when it was constructed (plus an entry for the remainder).
preprocessor.named_transformers_

{'categorical': OneHotEncoder(categorical_features=None, categories=None,
        dtype=<class 'numpy.float64'>, handle_unknown='error',
        n_values=None, sparse=False),
 'remainder': 'passthrough'}

In [10]:
# Ask the fitted encoder stored inside the ColumnTransformer for its output
# column names; the x0..x5 prefixes follow the order of cat_columns.
preprocessor.named_transformers_['categorical'].get_feature_names()

array(['x0_2011', 'x0_2012', 'x1_no', 'x1_yes', 'x2_no', 'x2_yes', 'x3_0',
       'x3_1', 'x3_2', 'x3_3', 'x3_4', 'x3_5', 'x3_6', 'x4_fall',
       'x4_spring', 'x4_summer', 'x4_winter', 'x5_clear', 'x5_cloudy',
       'x5_rainy'], dtype=object)

In [11]:
# Build a one-row sample whose weather category ('storm') never occurs in the
# data the encoder was fitted on; data_df itself is left untouched.
new_data = data_df.head(1).assign(weathersit='storm')
new_data

Unnamed: 0,temp,hum,windspeed,yr,workingday,holiday,weekday,season,weathersit,casual
0,0.344,0.806,0.16,2011,no,no,6,spring,storm,331


In [12]:
# The encoder inside `preprocessor` uses handle_unknown='error', so a category
# that was absent during fit makes transform raise ValueError. Catch exactly
# that so the cell shows the expected message while unrelated failures
# (e.g. a missing variable after an out-of-order run) still propagate.
try:
    preprocessor.transform(new_data)
except ValueError as e:
    print(e)

Found unknown categories ['storm'] in column 5 during transform


In [13]:
# Rebuild the one-hot step so that categories unseen during fit are tolerated
# instead of raising.
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Same column transformer as before, now wrapping the tolerant encoder.
preprocessor = ColumnTransformer(
    transformers=[('categorical', cat_transformer, cat_columns)],
    remainder='passthrough',
)
# Only the fitted state is needed here, so fit without keeping the output.
preprocessor.fit(data_df)

# 'storm' was not seen during fit: all weathersit indicator columns of this
# row come out as 0.
preprocessor.transform(new_data)

array([[1.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        1.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 3.44e-01, 8.06e-01, 1.60e-01, 3.31e+02]])

In [14]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal encoding for weathersit: the explicit category list fixes the integer
# codes (clear -> 0, cloudy -> 1, rainy -> 2) instead of relying on sort order.
ord_columns = ['weathersit']
ord_transformer = OrdinalEncoder(
    categories=[
        ['clear', 'cloudy', 'rainy'],
    ]
)

In [15]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Add polynomial features
poly_columns = ['temp', 'hum', 'windspeed']


def _add_powers(X):
    """Append the element-wise square and cube of X to X, column-wise.

    Only pure powers are generated (no cross terms between columns), so an
    input with k columns yields 3 * k output columns.
    """
    return np.c_[X, X**2, X**3]


# Standardise first, then expand each column into [x, x^2, x^3].
# A named function is used instead of a lambda so the pipeline can be pickled
# and shows a meaningful name in its repr. validate=False is passed explicitly
# so the behaviour does not depend on the installed scikit-learn's default
# (which changed between releases); the input already comes from StandardScaler.
poly_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', FunctionTransformer(_add_powers, validate=False))
])

In [16]:
from sklearn.preprocessing import PolynomialFeatures

# scikit-learn's built-in degree-3 expansion of the same three columns; unlike
# plain powers it also generates the cross terms, as the listed names show.
polyfeat = PolynomialFeatures(include_bias=False, degree=3).fit(data_df[poly_columns])
polyfeat.get_feature_names()

['x0',
 'x1',
 'x2',
 'x0^2',
 'x0 x1',
 'x0 x2',
 'x1^2',
 'x1 x2',
 'x2^2',
 'x0^3',
 'x0^2 x1',
 'x0^2 x2',
 'x0 x1^2',
 'x0 x1 x2',
 'x0 x2^2',
 'x1^3',
 'x1^2 x2',
 'x1 x2^2',
 'x2^3']

In [17]:
# Assemble the full preprocessing step from the three transformers defined in
# earlier cells; columns not claimed by any of them are dropped.
# NOTE(review): 'weathersit' is listed in both cat_columns and ord_columns, so
# it is encoded twice (one-hot and ordinal). The ordinal encoder is also not
# configured to tolerate unseen categories, unlike the one-hot encoder.
# Confirm whether the double encoding is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', cat_transformer, cat_columns),
        ('ordinal', ord_transformer, ord_columns),
        ('poly', poly_transformer, poly_columns),
    ],
    remainder='drop',
)

encoded = preprocessor.fit_transform(data_df)
encoded.shape



(731, 30)

In [18]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Chain preprocessing and the linear model so both are fitted together and the
# same transformations are applied automatically at prediction time.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE

# `casual` is the regression target; everything else is a candidate feature.
# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
y = data_df['casual']
X = data_df.drop('casual', axis='columns')
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# Fit preprocessing + regression on the training split only, then report the
# mean absolute error on the held-out split.
pipe.fit(X_tr, y_tr)
print('MAE: {:.0f}'.format(MAE(y_te, pipe.predict(X_te))))

MAE: 253
