In [2]:
import pandas as pd
import numpy as np
import os

In [4]:
# Show which data files are available. os.listdir() returns entries in
# arbitrary order, so sort them to keep the listing reproducible across runs.
print(sorted(os.listdir('input/')))

['test.csv', 'train.csv']


In [8]:
# Read the training split into a DataFrame and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='input/train.csv')
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# Inspect the dtype pandas inferred for each column; the numeric/object split
# seen here is what the column lists in the following cells are built from.
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [25]:
# Names of every numeric (int/float) column, as a plain Python list.
numerical = list(train.select_dtypes(include=np.number).columns)

In [13]:
# Names of every object-dtype (string) column, as a plain Python list.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24, so
# the dtype is named with the string 'object', which works on every version.
categories = train.select_dtypes(include='object').columns.tolist()

In [15]:
# Convert each string column to the pandas `category` dtype so that it can be
# picked up later by dtype-based selection.
train = train.astype({column: 'category' for column in categories})

In [16]:
# Summarise the frame (non-null counts, dtypes, memory usage) to confirm that
# the string columns now have the `category` dtype.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null category
Sex            891 non-null category
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null category
Fare           891 non-null float64
Cabin          204 non-null category
Embarked       889 non-null category
dtypes: category(5), float64(2), int64(5)
memory usage: 134.4 KB


In [22]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the named columns of a DataFrame.

    Parameters
    ----------
    feature_names : str or sequence of column labels
        Column label(s) to select from the input frame.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X[self.feature_names]``.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If one or more requested columns are absent from ``X``; the
            message lists the missing labels.
        """
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.feature_names]

        except KeyError as err:
            # A bare string is one label, not an iterable of characters.
            if isinstance(self.feature_names, str):
                requested = [self.feature_names]
            else:
                requested = list(self.feature_names)
            # Compare against X.columns (a DataFrame has no `feature_names`
            # attribute) and keep the requested order for a stable message.
            missing = [name for name in requested if name not in X.columns]
            raise KeyError(
                'The DataFrame does not include the columns: %s' % (missing,)
            ) from err

In [23]:
# Sanity check: selecting the categorical columns by name returns just those columns.
cs = ColumnSelector(categories)
cs.fit_transform(train).head(n=5)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [26]:
# Same check for the numeric columns.
cs = ColumnSelector(numerical)
cs.fit_transform(train).head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.925
3,4,1,1,35.0,1,0,53.1
4,5,0,3,35.0,0,0,8.05


In [27]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps the DataFrame columns of one dtype.

    Parameters
    ----------
    dtype : dtype or str
        Passed to ``DataFrame.select_dtypes(include=[dtype])``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        Raises ``AssertionError`` if ``X`` is not a pandas DataFrame.
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)

In [28]:
# Sanity check: selecting by the 'number' dtype yields the numeric columns.
ts = TypeSelector(dtype='number')
ts.fit_transform(train).head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.925
3,4,1,1,35.0,1,0,53.1
4,5,0,3,35.0,0,0,8.05


In [35]:
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Preprocessing pipeline: select columns, then process numeric and categorical
# columns in two parallel branches whose outputs are concatenated column-wise.
pipeline = make_pipeline(
    # NOTE(review): train.columns still contains the `Survived` target and the
    # `PassengerId` row id, so both end up in the numeric branch — confirm
    # this is intended before feeding the result to a model.
    ColumnSelector(feature_names=train.columns),
    FeatureUnion(transformer_list=[
        # Numeric branch: fill gaps with the column median, then standardise.
        ("numeric", make_pipeline(
            TypeSelector(np.number),
            SimpleImputer(strategy="median"),
            StandardScaler())),
        # Categorical branch: fill gaps with the most frequent value, then one-hot encode.
        ("category", make_pipeline(
            TypeSelector("category"),
            SimpleImputer(strategy="most_frequent"),
            # The default handle_unknown='error' makes transform() raise on any
            # category value not seen during fit; "ignore" encodes such values
            # as all zeros instead. Output on the fitted data is unchanged.
            OneHotEncoder(handle_unknown="ignore")))
    ])
)

In [37]:
# Fit every pipeline step on `train` and return the transformed feature matrix.
pipeline.fit_transform(train)

<891x1731 sparse matrix of type '<class 'numpy.float64'>'
	with 10691 stored elements in Compressed Sparse Row format>