Data preparation techniques, Statistical Imputation transform for the horse colic dataset

In [1]:
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer

In [2]:
#loading the csv file
dataframe = read_csv("G:\\data\\datasets\\horse.csv", header=None, na_values='?')

In [3]:
#split into input and output elements
data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
#print the total missing values
print("Missing: %d" % sum(isnan(X).flatten()))

Missing: 1605


In [4]:
#define the imputer
imputer = SimpleImputer(strategy='mean')

In [5]:
#fit the imputer on the dataset
imputer.fit(X)

SimpleImputer()

In [6]:
#transform the dataset
Xtrans = imputer.transform(X)

In [7]:
#print the total missing values
print("Missing: %d" % sum(isnan(Xtrans).flatten()))

Missing: 0


In [8]:
dataframe.isnull().sum()

0       1
1       0
2       0
3      60
4      24
5      58
6      56
7      69
8      47
9      32
10     55
11     44
12     56
13    104
14    106
15    247
16    102
17    118
18     29
19     33
20    165
21    198
22      1
23      0
24      0
25      0
26      0
27      0
dtype: int64

In [9]:
dataframe.to_csv("G:\\data\\datasets\\horse_new.csv", index=False)

Feature selection Using the Recursive Feature Elimination Technique

In [10]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [11]:
#define a dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)

In [12]:
#define RFE
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)

In [13]:
#fit RFE
rfe.fit(X,y)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)

In [14]:
#summarize all the features
for i in range(X.shape[1]):
    print("Column: %d, Selected= %s, Rank: %d" % (i, rfe.support_[i], rfe.ranking_[i]))

Column: 0, Selected= False, Rank: 4
Column: 1, Selected= False, Rank: 5
Column: 2, Selected= True, Rank: 1
Column: 3, Selected= True, Rank: 1
Column: 4, Selected= True, Rank: 1
Column: 5, Selected= False, Rank: 6
Column: 6, Selected= True, Rank: 1
Column: 7, Selected= False, Rank: 2
Column: 8, Selected= True, Rank: 1
Column: 9, Selected= False, Rank: 3


In [15]:
import pandas as pd 
from pandas import read_csv
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("G:\\data\\datasets\\breast_cancer.csv")

In [16]:
dataset = data.values

In [17]:
X = dataset[:, :-1].astype(str)
Y = dataset[:, -1].astype(str)

In [18]:
print(X[:3, :])

[['50-59' 'ge40' '15-19' '0-2' 'no' '1' 'right' 'central' 'no']
 ['50-59' 'ge40' '35-39' '0-2' 'no' '2' 'left' 'left_low' 'no']
 ['40-49' 'premeno' '35-39' '0-2' 'yes' '3' 'right' 'left_low' 'yes']]


In [19]:
encoder = OneHotEncoder(sparse=False)

In [20]:
X_oe = encoder.fit_transform(X)

In [21]:
print(X_oe[:3, :])

[[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1.]]


Scaling data with normalization

In [22]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler


In [27]:
#define the dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=5, n_redundant=0, random_state=1)
#summarize the data before transformation
print(X[:3, :])

[[ 2.39324489 -5.77732048 -0.59062319 -2.08095322  1.04707034]
 [-0.45820294  1.94683482 -2.46471441  2.36590955 -0.73666725]
 [ 2.35162422 -1.00061698 -0.5946091   1.12531096 -0.65267587]]


In [28]:
#define the scaler
trans = MinMaxScaler()

In [29]:
X_norm = trans.fit_transform(X)
print(X_norm[:3, :])

[[0.77608466 0.0239289  0.48251588 0.18352101 0.59830036]
 [0.40400165 0.79590304 0.27369632 0.6331332  0.42104156]
 [0.77065362 0.50132629 0.48207176 0.5076991  0.4293882 ]]


Transforming numbers to categories using kBins

In [31]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer
#define a dataset
X, Y = make_classification(n_samples=1000, n_features=5, n_informative=5, n_redundant=0, random_state=1)


In [32]:
#summmarize the data before transformation
print(X[:3, :])

[[ 2.39324489 -5.77732048 -0.59062319 -2.08095322  1.04707034]
 [-0.45820294  1.94683482 -2.46471441  2.36590955 -0.73666725]
 [ 2.35162422 -1.00061698 -0.5946091   1.12531096 -0.65267587]]


In [33]:
#define the transform
trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
#transform the data
X_discrete = trans.fit_transform(X)
#summarize after data transform
print(X_discrete[:3, :])

[[7. 0. 4. 1. 5.]
 [4. 7. 2. 6. 4.]
 [7. 5. 4. 5. 4.]]


Dimensionality Reduction with Principal Component Analysis


In [34]:
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
#define the dataset
X, Y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=7, random_state=1)
#summarize the data beofore transform
print(X[:3, :])

[[-0.53448246  0.93837451  0.38969914  0.0926655   1.70876508  1.14351305
  -1.47034214  0.11857673 -2.72241741  0.2953565 ]
 [-2.42280473 -1.02658758 -2.34792156 -0.82422408  0.59933419 -2.44832253
   0.39750207  2.0265065   1.83374105  0.72430365]
 [-1.83391794 -1.1946668  -0.73806871  1.50947233  1.78047734  0.58779205
  -2.78506977 -0.04163788 -1.25227833  0.99373587]]


In [38]:
#define the transform
trans = PCA(n_components=9)

In [39]:
#transform the data
X_dim = trans.fit_transform(X)
print(X[:3, :])

[[-0.53448246  0.93837451  0.38969914  0.0926655   1.70876508  1.14351305
  -1.47034214  0.11857673 -2.72241741  0.2953565 ]
 [-2.42280473 -1.02658758 -2.34792156 -0.82422408  0.59933419 -2.44832253
   0.39750207  2.0265065   1.83374105  0.72430365]
 [-1.83391794 -1.1946668  -0.73806871  1.50947233  1.78047734  0.58779205
  -2.78506977 -0.04163788 -1.25227833  0.99373587]]
