# Pandas Interoperability

<a href="https://colab.research.google.com/github/thomasjpfan/ml-workshop-intermediate-1-of-2/blob/master/notebooks/04-pandas-interoperability.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open and Execute in Google Colaboratory"></a>

In [None]:
# Install dependencies for Google Colab.
# The check looks for an already-imported `google.colab` module, which is
# taken as the signal that the notebook is running on Colab; elsewhere the
# install is skipped.
import sys
if 'google.colab' in sys.modules:
    # `%pip` is IPython magic syntax, so this cell only runs inside a notebook/IPython session.
    %pip install -r https://raw.githubusercontent.com/thomasjpfan/ml-workshop-intermediate-1-of-2/master/requirements.txt

In [None]:
# Fail fast when the installed scikit-learn is not a 1.0.x release, which is
# the release series this notebook pins (e.g. it uses `OneHotEncoder(sparse=...)`).
import sklearn
assert sklearn.__version__.startswith("1.0"), "Please install scikit-learn 1.0"

In [None]:
import numpy as np
# Show estimators as HTML diagrams instead of plain-text reprs when they are
# displayed in the notebook.
sklearn.set_config(display='diagram')

## Categorical Data

In [None]:
import pandas as pd

# Toy training frame: a single "pet" column holding four distinct values.
df_train = pd.DataFrame(data=dict(pet=["snake", "dog", "cat", "cow"]))

### OrdinalEncoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Learn the categories from the training frame and encode it in one step.
ord_encoder = OrdinalEncoder()
ord_encoder.fit_transform(df_train)

In [None]:
# Categories learned during fit, one array per input column.
ord_encoder.categories_

In [None]:
# Test frame containing only pets that also appear in `df_train`.
df_test = pd.DataFrame({
    "pet": ["cow", "cat"]
})
df_test

In [None]:
# Encode with the already-fitted encoder; `transform` does not refit.
ord_encoder.transform(df_test)

### Categories that are unknown during `fit`

In [None]:
# A single-row frame whose pet ("bear") never appears in the training frame.
df_test_unknown = pd.DataFrame(data=dict(pet=["bear"]))

In [None]:
# The encoder was fitted on data without "bear"; print any ValueError raised
# by `transform` instead of letting it stop the notebook.
try:
    ord_encoder.transform(df_test_unknown)
except ValueError as err:
    print(err)

### How to handle unknown categories in OrdinalEncoder?

### Provide all the categories in the constructor

In [None]:
# Show the training data again for reference.
df_train

In [None]:
# List every expected category up front, including "bear", which is not
# present in `df_train`.
ord_encoder = OrdinalEncoder(
    categories=[['snake', 'dog', 'cat', 'cow', 'bear']])
ord_encoder.fit_transform(df_train)

In [None]:
df_test_unknown

In [None]:
# "bear" is one of the declared categories, so it can now be encoded.
ord_encoder.transform(df_test_unknown)

### Setting a value for unknown values directly

In [None]:
# Encode categories that were not seen during fit as the sentinel -1
# instead of raising.
ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [None]:
ord_encoder.fit_transform(df_train)

In [None]:
df_test_unknown

In [None]:
# "bear" was not in `df_train`, so it is encoded as `unknown_value`.
ord_encoder.transform(df_test_unknown)

## OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Encoder with default settings; the encoded result is kept in `X_trans`
# and displayed as the cell output.
ohe = OneHotEncoder()
X_trans = ohe.fit_transform(df_train)
X_trans

By default it is sparse!

In [None]:
# Convert the sparse result to a dense array so the individual values are visible.
X_trans.toarray()

### Switch to dense

In [None]:
# `sparse=False` makes the encoder return a dense array directly.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# the spelling here matches the 1.0 release this notebook asserts on.
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(df_train)

### Unknown categories during transform?

In [None]:
df_test_unknown

In [None]:
# this will fail: `ohe` was created without handle_unknown='ignore', so print
# the ValueError instead of letting it abort the notebook
try:
    ohe.transform(df_test_unknown)
except ValueError as exc:
    print(exc)

### OHE can handle unknowns

In [None]:
# With handle_unknown='ignore', categories unseen during fit no longer raise.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(df_train)

In [None]:
# "bear" is not among the fitted categories; per the scikit-learn docs an
# ignored unknown category is encoded as an all-zero row.
ohe.transform(df_test_unknown)

In [None]:
ohe.categories_

## Two categorical features

In [None]:
# Rebind `df_train` to a three-row frame with two categorical columns.
df_train = pd.DataFrame(
    data=dict(
        pet=["cat", "dog", "snake"],
        city=["New York", "London", "London"],
    )
)

In [None]:
# Refit the existing `ohe` on the two-column training frame.
ohe.fit(df_train)

In [None]:
# One array of categories per input column.
ohe.categories_

In [None]:
ohe.transform(df_train)

# Column Transformer!

In [None]:
import pandas as pd

In [None]:
# Small mixed-type frame: two numeric columns and one categorical column.
X_df = pd.DataFrame(
    data=dict(
        age=[10, 20, 15, 5, 20, 14],
        height=[5, 7, 6.5, 4.1, 5.4, 5.4],
        pet=['dog', 'snake', 'cat', 'dog', 'cat', 'cat'],
    )
)
X_df

## With OrdinalEncoder

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Each entry is (name, transformer, columns): standardize the two numeric
# columns and ordinal-encode 'pet'.
ct = ColumnTransformer([
    ('numerical', StandardScaler(), ['age', 'height']),
    ('categorical', OrdinalEncoder(), ['pet'])
])

# Output columns follow the order of the transformers list: the scaled
# numeric columns first, then the encoded 'pet' column.
ct.fit_transform(X_df)

## With OneHotEncoder

In [None]:
# Standardize the two numeric columns and one-hot encode 'pet'.
ct = ColumnTransformer([
    ('numerical', StandardScaler(), ['age', 'height']),
    ('categorical', OneHotEncoder(), ['pet'])
])

In [None]:
ct.fit_transform(X_df)

## Titanic dataset

In [None]:
from sklearn.datasets import fetch_openml
# Fetch OpenML dataset 40945 (the Titanic data, per the section heading);
# `as_frame=True` returns pandas objects for the features and the target.
titanic = fetch_openml(data_id=40945, as_frame=True)
X, y = titanic.data, titanic.target

In [None]:
y

In [None]:
X.head()

### Are there categories already encoded in the dataset?

In [None]:
# List the dtype of every feature column.
X.dtypes

### Are there missing values in the dataset?

In [None]:
# Side-by-side summary: per-column missing-value count ("na_cnt") and the
# column's dtype ("dtypes"); the dict keys become the column labels.
missing_values = pd.concat({"na_cnt": X.isna().sum(), "dtypes": X.dtypes}, axis='columns')
missing_values

### Split data into training and test set

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on `y` so both splits keep the class proportions; the fixed
# `random_state` makes the split reproducible. No test size is given, so
# scikit-learn's default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

## ColumnTransformer

In [None]:
# Show the missing-value summary again.
missing_values

### Numerical preprocessing

In [None]:
# Columns routed through the numerical preprocessing pipeline.
# NOTE(review): 'body' is presumably a body identification number that is only
# recorded for passengers who did not survive, which would leak the target —
# confirm whether it should be used as a feature.
numerical_features = ['age', 'sibsp', 'parch', 'fare', 'body']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Fill missing values first (SimpleImputer defaults to the column mean),
# then standardize the imputed columns.
num_prep = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

In [None]:
num_prep

#### Running only on the numerical features

In [None]:
# Fit the numerical pipeline on just the numerical columns of the training split.
num_trans = num_prep.fit_transform(X_train[numerical_features])
num_trans

In [None]:
num_trans.shape

### Categorical preprocessing

In [None]:
# Columns routed through the categorical preprocessing step.
categorical_features = ['sex', 'embarked']

In [None]:
# Dense one-hot output; categories unseen during fit are ignored instead of raising.
cat_prep = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
cat_prep

#### Running only on the categorical features

In [None]:
# Fit the one-hot encoder on just the categorical columns of the training split.
cat_trans = cat_prep.fit_transform(X_train[categorical_features])
cat_trans

In [None]:
cat_trans.shape

## ColumnTransformer!

In [None]:
# Route each feature group to its own preprocessor. Columns of `X_train`
# that are in neither list are dropped by ColumnTransformer's default
# `remainder` setting.
ct = ColumnTransformer([
   ('numerical', num_prep, numerical_features),
   ('categorical', cat_prep, categorical_features)
])

In [None]:
ct

In [None]:
X_trans = ct.fit_transform(X_train)

In [None]:
# First five output columns: the five numerical features, which are listed first.
X_trans[:, :5]

In [None]:
# Remaining columns: the one-hot encoded categorical features.
X_trans[:, 5:]

In [None]:
X_trans.shape

### Linear model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Chain the `ct` preprocessor and the classifier, so fitting the pipeline
# also fits the preprocessing on the same data.
log_reg = Pipeline([
    ('preprocess', ct),
    ('log_reg', LogisticRegression(random_state=42))
])
log_reg

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
# NOTE: this is the score on the training split, not on X_test / y_test.
log_reg.score(X_train, y_train)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Reuse the `ct` preprocessor and feed its output to a random forest.
# The classifier step is named after the estimator it holds, so the pipeline
# diagram and `named_steps` are not mislabelled as logistic regression.
rf = Pipeline([
    ('preprocess', ct),
    ('random_forest', RandomForestClassifier(random_state=42))
])
rf

In [None]:
rf.fit(X_train, y_train)

In [None]:
# NOTE: this is the score on the training split, not on X_test / y_test.
rf.score(X_train, y_train)

## Exercise 1

1. Load the Ames housing dataset using `sklearn.datasets.fetch_openml` with `data_id=41211` and `as_frame=True`.
    - **Hint**: You may ignore the version warning
1. How many samples and features are there?
1. Find and save the categorical and numerical feature names.
    - **Hint**: You can use `X.select_dtypes(include='category').columns` and `X.select_dtypes(include='number').columns`
1. What are the categorical feature names? What are the numerical feature names?
1. Split the data into training and test dataset.
1. Build a pipeline using a `ColumnTransformer`, `OrdinalEncoder`, and `sklearn.ensemble.HistGradientBoostingRegressor` and fit it on the training dataset.
    - **Hint**: Use `'passthrough'` option for numerical columns.
    - **Hint**: Use `OrdinalEncoder` with `handle_unknown='use_encoded_value'` and `unknown_value=-1`.
1. Evaluate the model on the test set.
1. **Extra**: Use `sklearn.compose.make_column_selector` instead of passing the feature names directly.

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import make_column_selector

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-intermediate-1-of-2/blob/master/notebooks/solutions/04-ex01-solutions.py).  

In [None]:
# %load solutions/04-ex01-solutions.py