# ch3
Normalization

In [3]:
from pandas import Series
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Contrived, evenly spaced observations (10 through 100) used to demonstrate
# min-max normalization.
data = [10.0, 20.0, 30.0, 40, 50, 60, 70, 80, 90, 100]
series = Series(data=data)
print(series)

0     10.0
1     20.0
2     30.0
3     40.0
4     50.0
5     60.0
6     70.0
7     80.0
8     90.0
9    100.0
dtype: float64


In [6]:
# The scaler is fitted on a 2-D array, so turn the flat vector of observations
# into a single column and show the shape before and after.
values = series.values
print(values.shape)
values = values.reshape(-1, 1)
print(values.shape)

(10,)
(10, 1)


In [7]:
# train the normalization
# fit() records the minimum and maximum of the data, which transform() later
# uses to rescale values into feature_range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(values)
# data_min_ / data_max_ are arrays with one entry per feature. `values` has a
# single column, so pick element 0 explicitly rather than relying on the
# implicit array-to-scalar conversion that NumPy 1.25+ deprecates.
print('Min: %f, Max: %f' % (scaler.data_min_[0], scaler.data_max_[0]))

Min: 10.000000, Max: 100.000000


In [9]:
# Apply the fitted scaler: every observation is rescaled into the [0, 1] range.
normalized = scaler.transform(X=values)
print(normalized)

[[0.        ]
 [0.11111111]
 [0.22222222]
 [0.33333333]
 [0.44444444]
 [0.55555556]
 [0.66666667]
 [0.77777778]
 [0.88888889]
 [1.        ]]


In [10]:
# Undo the scaling to confirm the round trip recovers the original values.
inversed = scaler.inverse_transform(X=normalized)
print(inversed)

[[ 10.]
 [ 20.]
 [ 30.]
 [ 40.]
 [ 50.]
 [ 60.]
 [ 70.]
 [ 80.]
 [ 90.]
 [100.]]


# StandardScaler

In [13]:
from sklearn.preprocessing import StandardScaler
from math import sqrt

In [14]:
# Contrived observations used to demonstrate standardization.
data = [1.0, 5.5, 9.0, 2.6, 8.8, 3.0, 4.1, 7.9, 6.3]
series = Series(data=data)
print(series)

0    1.0
1    5.5
2    9.0
3    2.6
4    8.8
5    3.0
6    4.1
7    7.9
8    6.3
dtype: float64


In [15]:
# prepare data for standardization
# The scaler is fitted on a 2-D array, so reshape the flat vector into one column.
values = series.values
values = values.reshape(-1, 1)
print(values)

[[1. ]
 [5.5]
 [9. ]
 [2.6]
 [8.8]
 [3. ]
 [4.1]
 [7.9]
 [6.3]]


In [19]:
# train the standardization
# fit() estimates the mean and variance that transform() later uses to
# standardize the data.
scaler = StandardScaler()
scaler = scaler.fit(values)
# mean_ and var_ are arrays with one entry per feature. `values` has a single
# column, so index element 0 explicitly: passing a 1-element array to '%f' or
# math.sqrt relies on an implicit array-to-scalar conversion that NumPy 1.25+
# deprecates.
print('Mean: %f, StandardDeviation: %f ' % (scaler.mean_[0], sqrt(scaler.var_[0])))

Mean: 5.355556, StandardDeviation: 2.712568 


In [20]:
# Standardize the dataset with the fitted scaler and print the result.
standardized = scaler.transform(X=values)
print(standardized)

[[-1.60569456]
 [ 0.05325007]
 [ 1.34354035]
 [-1.01584758]
 [ 1.26980948]
 [-0.86838584]
 [-0.46286604]
 [ 0.93802055]
 [ 0.34817357]]


In [22]:
# Undo the standardization to confirm the round trip recovers the original values.
inversed = scaler.inverse_transform(X=standardized)
print(inversed)

[[1. ]
 [5.5]
 [9. ]
 [2.6]
 [8.8]
 [3. ]
 [4.1]
 [7.9]
 [6.3]]


# prepare categorical data

In [23]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [25]:
# Example categorical observations drawn from three labels: cold, warm, hot.
data = [
    'cold', 'cold', 'warm', 'cold', 'hot',
    'hot', 'warm', 'cold', 'warm', 'hot',
]
values = array(data)
print(values)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']


In [26]:
# Integer encode: learn the set of labels first, then map each label to its code.
label_encoder = LabelEncoder()
label_encoder.fit(values)
integer_encoded = label_encoder.transform(values)
print(integer_encoded)

[0 0 2 0 1 1 2 0 2 1]


In [28]:
# binary encode
# The encoder is given a 2-D array with one column, so turn the vector of
# integer codes into a single column first.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old name was later removed, so neither spelling works on every release.
# Keep the encoder's default (sparse) output and densify it explicitly, which
# yields the same dense array on old and new versions alike.
one_hot_encoder = OneHotEncoder()
onehot_encoded = one_hot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)

[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [29]:
# Invert the first example: the position of the 1 in its one-hot row is the
# integer code, which the label encoder maps back to the original label.
first_label_index = argmax(onehot_encoded[0, :])
inverted = label_encoder.inverse_transform([first_label_index])
print(inverted)

['cold']


  if diff:


In [30]:
# Inspect the one-hot row of the first example.
onehot_encoded[0]

array([1., 0., 0.])

In [32]:
# Position of the largest entry in the first one-hot row, i.e. its integer code.
argmax(onehot_encoded[0])

0

In [33]:
# Labels held by the fitted LabelEncoder. In the output below, each label's
# position matches the integer code printed earlier
# (0 -> 'cold', 1 -> 'hot', 2 -> 'warm').
label_encoder.classes_

array(['cold', 'hot', 'warm'], dtype='<U4')

# Sequence padding
Pre-sequence padding

In [34]:
from keras.preprocessing.sequence import pad_sequences

In [36]:
# Three example sequences of different lengths (4, 3 and 1 items).
sequences = [[1, 2, 3, 4], [1, 2, 3], [1]]
print(sequences)

[[1, 2, 3, 4], [1, 2, 3], [1]]


In [37]:
# Pad to the length of the longest sequence using the default 'pre' mode:
# shorter sequences receive leading zeros.
padded = pad_sequences(sequences, padding='pre')
print(padded)

[[1 2 3 4]
 [0 1 2 3]
 [0 0 0 1]]


In [39]:
# Post-sequence padding: zeros are appended after the existing values instead.
padded = pad_sequences(sequences=sequences, padding='post')
print(padded)

[[1 2 3 4]
 [1 2 3 0]
 [1 0 0 0]]


# Sequence Truncation

In [41]:
# Pre-sequence truncation (the default): sequences longer than maxlen lose
# values from the front, keeping the last two items.
truncated = pad_sequences(sequences, maxlen=2, truncating='pre')
print(truncated)

[[3 4]
 [2 3]
 [0 1]]


In [43]:
# Post-sequence truncation: sequences longer than maxlen lose values from the
# end, keeping the first two items. Short sequences are still padded in front.
truncated = pad_sequences(sequences, maxlen=2, padding='pre', truncating='post')
print(truncated)

[[1 2]
 [1 2]
 [0 1]]


In [44]:
# Sequence prediction as Supervised learning

In [45]:
from pandas import DataFrame

In [46]:
# Build the example sequence 0..9 as a single-column frame named 't'.
df = DataFrame({'t': list(range(10))})
print(df)

   t
0  0
1  1
2  2
3  3
4  4
5  5
6  6
7  7
8  8
9  9


In [47]:
# Shift forward by one step: each row gains the previous observation as 't-1'.
# The first row has no predecessor and therefore holds NaN.
df['t-1'] = df['t'].shift(periods=1)
print(df)

   t  t-1
0  0  NaN
1  1  0.0
2  2  1.0
3  3  2.0
4  4  3.0
5  5  4.0
6  6  5.0
7  7  6.0
8  8  7.0
9  9  8.0


In [48]:
# Rebuild the example sequence 0..9 from scratch.
df = DataFrame({'t': list(range(10))})
print(df)
# Shift backward by one step: each row gains the next observation as 't+1'.
# The last row has no successor and therefore holds NaN.
df['t+1'] = df['t'].shift(periods=-1)
print(df)

   t
0  0
1  1
2  2
3  3
4  4
5  5
6  6
7  7
8  8
9  9
   t  t+1
0  0  1.0
1  1  2.0
2  2  3.0
3  3  4.0
4  4  5.0
5  5  6.0
6  6  7.0
7  7  8.0
8  8  9.0
9  9  NaN


In [None]:
#ch4