## Fill Missing Values with Imputation

Statistical imputation transform for the horse colic dataset

In [3]:
# import necessary libraries
from numpy import isnan 
from pandas import read_csv 
from sklearn.impute import SimpleImputer 

# load dataset
# na_values='?' makes pandas parse every '?' cell as NaN.
# The file is fetched over HTTPS rather than plain HTTP.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = read_csv(url, header=None, na_values='?')

# split into input and output
# column 23 is used as the output; every other column is an input
data = dataframe.values
target_col = 23
ix = [i for i in range(data.shape[1]) if i != target_col]
X, y = data[:, ix], data[:, target_col]

# print total missing
# ndarray.sum() counts the True entries in one vectorized pass; the builtin
# sum() over a flattened array would loop element by element in Python.
print("Missing: %d" % isnan(X).sum())

# define imputer
# strategy='mean' replaces each NaN with the mean of its column
imputer = SimpleImputer(strategy='mean')

# fit on the dataset
imputer.fit(X)

# transform the dataset
Xtrans = imputer.transform(X)

# print total missing
print("Missing: %d" % isnan(Xtrans).sum())


Missing: 1605
Missing: 0


## Select Features with RFE 

report which features were selected by RFE 

In [8]:
from sklearn.datasets import make_classification 
from sklearn.feature_selection import RFE 
from sklearn.tree import DecisionTreeClassifier

# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)

# define RFE
# The decision tree is seeded: an unseeded tree may pick different splits when
# several are equally good, so the selected columns could change between runs.
# NOTE: output saved from an earlier unseeded run may not match a fresh run.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=5)

# fit RFE
rfe.fit(X, y)

# summarize all features
# support_ flags the selected columns; ranking_ assigns rank 1 to selected ones
for i, (selected, rank) in enumerate(zip(rfe.support_, rfe.ranking_)):
    print("Column: %d, Selected=%s, Rank:%d" % (i, selected, rank))


Column: 0, Selected=False, Rank:4
Column: 1, Selected=False, Rank:5
Column: 2, Selected=True, Rank:1
Column: 3, Selected=True, Rank:1
Column: 4, Selected=True, Rank:1
Column: 5, Selected=False, Rank:6
Column: 6, Selected=True, Rank:1
Column: 7, Selected=False, Rank:3
Column: 8, Selected=True, Rank:1
Column: 9, Selected=False, Rank:2


## Scale Data with Normalization

example of normalizing input data 

In [10]:
from sklearn.datasets import make_classification 
from sklearn.preprocessing import MinMaxScaler 

# build a synthetic classification dataset (seeded for repeatability)
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=5,
    n_redundant=0,
    random_state=1,
)

# show the first three rows in their original scale
print(X[:3])

# min-max scaler: learn each column's range, then rescale the data with it
trans = MinMaxScaler()
trans.fit(X)
X_norm = trans.transform(X)

# show the same three rows after rescaling
print("After Transformation")
print(X_norm[:3])

[[ 1.16445804  2.20489985  0.98705394 -0.24892557  0.02730749 -0.54607659
  -1.13378169  2.92506351  1.26673358  1.46401078]
 [ 1.07746722 -4.1026129  -0.28684337  2.26184715  0.24576105 -0.00716217
  -1.10238571 -0.34331894  0.64221845 -1.22448632]
 [ 1.03496179 -1.90438963  0.51909698 -1.79314569 -2.59816732 -1.17865026
   1.2355152   2.93608911 -0.73405942  1.24542021]]
After Transformation
[[0.47346034 0.68996947 0.53789567 0.50720087 0.56775645 0.41669053
  0.31761837 0.7371739  0.62726634 0.71577552]
 [0.46605616 0.1968639  0.4356307  0.79618474 0.58941628 0.48397861
  0.32153252 0.46146966 0.54828922 0.34243806]
 [0.46243834 0.3687155  0.50032937 0.32946487 0.30743877 0.33770828
  0.61299983 0.73810396 0.37424304 0.68542099]]


## Transform Categories with One Hot Encoding

one hot encode the breast cancer dataset


In [13]:
from pandas import read_csv
from sklearn.preprocessing import OneHotEncoder

# define the location of dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"

# load the dataset
dataset = read_csv(url, header=None)

# retrieve the array of data
data = dataset.values

# separate into input and output columns
# the last column is the output; everything is treated as string categories
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

# summarize the raw data
print(X[:3, :])

# define the one hot encoding transform
# The `sparse` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `sparse_output`), so OneHotEncoder(sparse=False) fails on
# current releases. Keeping the default sparse result and densifying it
# below works on both old and new versions.
encoder = OneHotEncoder()

# fit and apply the transform to the input data
# .toarray() converts the sparse matrix into a dense NumPy array
X_oe = encoder.fit_transform(X).toarray()

# summarize the transformed data
print(X_oe[:3, :])


[["'40-49'" "'premeno'" "'15-19'" "'0-2'" "'yes'" "'3'" "'right'"
  "'left_up'" "'no'"]
 ["'50-59'" "'ge40'" "'15-19'" "'0-2'" "'no'" "'1'" "'right'" "'central'"
  "'no'"]
 ["'50-59'" "'ge40'" "'35-39'" "'0-2'" "'no'" "'2'" "'left'" "'left_low'"
  "'no'"]]
[[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]]


## Transform Numbers to Categories with kBins

discretize numeric input variables

In [20]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer

# build a synthetic dataset of five informative numeric features
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    random_state=1,
)

# preview the raw values
print(X[:3])

# discretizer: 10 uniform-width bins per column, encoded as ordinal bin indices
trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')

# learn the bin edges, then map every value to its bin
trans.fit(X)
X_discrete = trans.transform(X)

# preview the binned values
print("After transform")
print(X_discrete[:3])


[[ 2.39324489 -5.77732048 -0.59062319 -2.08095322  1.04707034]
 [-0.45820294  1.94683482 -2.46471441  2.36590955 -0.73666725]
 [ 2.35162422 -1.00061698 -0.5946091   1.12531096 -0.65267587]]
After transform
[[7. 0. 4. 1. 5.]
 [4. 7. 2. 6. 4.]
 [7. 5. 4. 5. 4.]]


## Dimensionality Reduction with PCA

In [24]:
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

# build a synthetic dataset: 10 features, 3 informative and 7 redundant
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=3,
    n_redundant=7,
    random_state=1,
)

# preview the raw rows
print(X[:3])

# PCA projection down to three components
trans = PCA(n_components=3)

# fit the projection and apply it to the same data in one step
X_dims = trans.fit_transform(X)

# preview the projected rows
print("After the transform")
print(X_dims[:3])

[[-0.53448246  0.93837451  0.38969914  0.0926655   1.70876508  1.14351305
  -1.47034214  0.11857673 -2.72241741  0.2953565 ]
 [-2.42280473 -1.02658758 -2.34792156 -0.82422408  0.59933419 -2.44832253
   0.39750207  2.0265065   1.83374105  0.72430365]
 [-1.83391794 -1.1946668  -0.73806871  1.50947233  1.78047734  0.58779205
  -2.78506977 -0.04163788 -1.25227833  0.99373587]]
After the transform
[[-1.64710578 -2.11683302  1.98256096]
 [ 0.92840209  4.8294997   0.22727043]
 [-3.83677757  0.32300714  0.11512801]]
