# ColumnTransformer

Follow _Introduction to Machine Learning_  [Chapter 4](https://github.com/amueller/introduction_to_ml_with_python/blob/master/04-representing-data-feature-engineering.ipynb)
- Section 4.3 Convenient ColumnTransformer (p.224)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
import mglearn

In [8]:
import os
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
adult_path = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
data = pd.read_csv(adult_path, header=None, index_col=False,
    skipinitialspace=True, #remove space after comma
    names=['age', 'workclass', 'fnlwgt', 'education',  'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'gender',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
           'income'])
# For illustration purposes, we only select some of the columns
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week',
             'occupation', 'income']]
# IPython.display allows nice output formatting within the Jupyter notebook
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


### Build the ColumnTransformer

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct = ColumnTransformer(
    [("scaling", StandardScaler(), ['age', 'hours-per-week']),
     ("onehot", OneHotEncoder(sparse_output=False), ['workclass', 'education', 'gender', 'occupation'])])

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# get all columns apart from income for the features
data_features = data.drop("income", axis=1)
# split dataframe and income
X_train, X_val, y_train, y_val = train_test_split(
    data_features, data.income, random_state=0)

ct.fit(X_train)
X_train_trans = ct.transform(X_train)
print(X_train_trans.shape)

(24420, 44)


### Train the model using transformed data

Note that validation data `X_val` needs to be transformed with the learned transformer too.

In [11]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_trans, y_train)

X_val_trans = ct.transform(X_val)
print("Validation score: {:.2f}".format(logreg.score(X_val_trans, y_val)))

Validation score: 0.81


### Access ColumnTransformer components 

In [12]:
ct.named_transformers_.onehot

In [13]:
ct.named_transformers_.onehot.get_feature_names_out()

array(['workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Never-worked', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay', 'education_10th',
       'education_11th', 'education_12th', 'education_1st-4th',
       'education_5th-6th', 'education_7th-8th', 'education_9th',
       'education_Assoc-acdm', 'education_Assoc-voc',
       'education_Bachelors', 'education_Doctorate', 'education_HS-grad',
       'education_Masters', 'education_Preschool',
       'education_Prof-school', 'education_Some-college', 'gender_Female',
       'gender_Male', 'occupation_?', 'occupation_Adm-clerical',
       'occupation_Armed-Forces', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Priv-house-serv',
       'occupation_Prof-spe

### Convenience function: `make_column_transformer()` 

In [14]:
from sklearn.compose import make_column_transformer
ct = make_column_transformer(
    (StandardScaler(), ['age', 'hours-per-week']),
    (OneHotEncoder(sparse_output=False), ['workclass', 'education', 'gender', 'occupation']))

In [15]:
ct.fit(X_train)

In [16]:
ct.named_transformers_

{'standardscaler': StandardScaler(),
 'onehotencoder': OneHotEncoder(sparse_output=False)}

## Excercise: Apply ColumnTransformer to heart disease data

In [17]:
def load_heart_disease():
    '''Load and pre-process heart disease data
    
    if processed.hungarian.data file is not present.
    
    it will be downloaded from
    https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data
    
    return: data(DataFrame)
    
    '''
    
    import os
    import requests
    
    
    file_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data'
    file_name = file_url.split('/')[-1]
    
    if not os.path.isfile(file_name):
        print('Downloading from {}'.format(file_url))
        r = requests.get(file_url)
        with open(file_name,'wb') as output_file:
            output_file.write(r.content)
        
    data = pd.read_csv(file_name, 
                   na_values='?', 
                   names=[ 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                            'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                            'ca', 'thal', 'num'])
    
    # drop columns with many missing data
    data = data.drop(columns=['slope', 'ca', 'thal'])
    
    # fill in remaining missing data with mean() per column
    data = data.fillna(data.mean())
    
    return data

In [18]:
data = load_heart_disease()

Downloading from https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data


In [19]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130.0,132.0,0.0,2.0,185.0,0.0,0.0,0
1,29,1,2,120.0,243.0,0.0,0.0,160.0,0.0,0.0,0
2,29,1,2,140.0,250.848708,0.0,0.0,170.0,0.0,0.0,0
3,30,0,1,170.0,237.0,0.0,1.0,170.0,0.0,0.0,0
4,31,0,2,100.0,219.0,0.0,1.0,150.0,0.0,0.0,0


In [20]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
count,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0,294.0
mean,47.826531,0.72449,2.982993,132.583618,250.848708,0.06993,0.21843,139.129693,0.303754,0.586054,0.360544
std,7.811812,0.447533,0.965117,17.596463,64.947939,0.251964,0.46008,23.549459,0.459878,0.908648,0.480977
min,28.0,0.0,1.0,92.0,85.0,0.0,0.0,82.0,0.0,0.0,0.0
25%,42.0,0.0,2.0,120.0,211.25,0.0,0.0,122.0,0.0,0.0,0.0
50%,49.0,1.0,3.0,130.0,248.5,0.0,0.0,140.0,0.0,0.0,0.0
75%,54.0,1.0,4.0,140.0,277.0,0.0,0.0,155.0,1.0,1.0,1.0
max,66.0,1.0,4.0,200.0,603.0,1.0,2.0,190.0,1.0,5.0,1.0


### Which columns are numerical (quantitative), which are categorical (qualitative)?
Consult the data description, or use `value_counts()` to guess.

In [21]:
data.cp.value_counts()

cp
4    123
2    106
3     54
1     11
Name: count, dtype: int64

By using the mean to fill in NaN, we made a mistake for the `restecg` column:

In [22]:
data.restecg.value_counts()

restecg
0.00000    235
1.00000     52
2.00000      6
0.21843      1
Name: count, dtype: int64

Let's fix this:

In [23]:
# Pandas where function replaces every value that does not satisfy the condition with the inputted value (default is NaN)
data.restecg = data.restecg.where(data.restecg >= 1, 0)

In [24]:
data.restecg.value_counts()

restecg
0.0    236
1.0     52
2.0      6
Name: count, dtype: int64

In [25]:
# TODO: which columns to scale, onehot or do nothing?

In [26]:
# get all columns apart from income for the features
X = data.drop(columns='num')
y = data['num']
print(X.shape)
print(y.shape)

# split dataframe and income
X_train, X_val, y_train, y_val = train_test_split(X, y,
                            test_size=0.1, stratify=y,random_state=31)

ct.fit(X_train)
X_train_trans = ct.transform(X_train)
print(X_train_trans.shape)

(294, 10)
(294,)


ValueError: A given column is not a column of the dataframe

In [None]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_trans, y_train)

X_val_trans = ct.transform(X_val)
print("Train score: {:.2f}".format(logreg.score(X_train_trans, y_train)))
print("Validation score: {:.2f}".format(logreg.score(X_val_trans, y_val)))

Are we overfitting? Let's try and reduce complexity by increasing regularization - reduce C:

In [None]:
logreg = LogisticRegression(C=0.01,max_iter=1000)
logreg.fit(X_train_trans, y_train)

X_val_trans = ct.transform(X_val)
print("Train score: {:.2f}".format(logreg.score(X_train_trans, y_train)))
print("Validation score: {:.2f}".format(logreg.score(X_val_trans, y_val)))

Compare that to the unscaled dataset:

In [None]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

print("Train score: {:.2f}".format(logreg.score(X_train, y_train)))
print("Validation score: {:.2f}".format(logreg.score(X_val, y_val)))