# This is just some playing around with scikit

In [1]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Toy categorical feature: ten temperature readings drawn from three classes.
data = [
    'cold', 'cold', 'warm', 'cold', 'hot',
    'hot', 'warm', 'cold', 'warm', 'hot',
]
# The encoders below operate on a NumPy array rather than a plain list.
values = array(data)
print(values)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']


In [3]:
# Map each distinct string label to an integer code. fit() learns the label
# vocabulary and returns the encoder itself; transform() then applies it.
label_encoder = LabelEncoder().fit(values)
int_encoded = label_encoder.transform(values)
print(int_encoded)

[0 0 2 0 1 1 2 0 2 1]


In [4]:
# One-hot encode the integer labels as a dense array.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    onehot_encoder = OneHotEncoder(sparse=False)
# OneHotEncoder expects a 2-D (n_samples, n_features) input, so turn the
# 1-D label vector into a single column; -1 lets NumPy infer the row count.
int_encoded = int_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(int_encoded)
print(onehot_encoded)

[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 0.  1.  0.]]


In [8]:
# Decode one row of the one-hot matrix back to its original string label:
# argmax gives the index of the column holding the largest value in row 2,
# and that index is the integer code the label encoder assigned.
hot_column = argmax(onehot_encoded[2, :])
inverted = label_encoder.inverse_transform([hot_column])
print(inverted)

['warm']


## When dealing with sequences, we often need to pad our data

In [1]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
# Three integer sequences of different lengths (4, 3 and 1) to pad below.
sequences = [
    [1, 2, 3, 4],
    [1, 2, 3],
    [1],
]

In [3]:
# Pad every sequence with zeros up to the length of the longest one.
# 'pre' is the default mode: the zeros are inserted at the front.
padded = pad_sequences(sequences, padding='pre')
print(padded)

[[1 2 3 4]
 [0 1 2 3]
 [0 0 0 1]]


In [4]:
# With padding='post' the zeros are appended after the values instead.
padded = pad_sequences(
    sequences,
    padding='post',
)
print(padded)

[[1 2 3 4]
 [1 2 3 0]
 [1 0 0 0]]


In [6]:
# Sequences longer than maxlen are truncated. The default, truncating='pre',
# drops values from the front; truncating='post' would drop them from the end.
truncated = pad_sequences(sequences, maxlen=2, truncating='pre')
print(truncated)

[[3 4]
 [2 3]
 [0 1]]


In [7]:
## Pandas shift function
### Given a DataFrame, the shift() function can be used to create copies 
### of columns that are pushed forward (rows of NaN values added to the 
### front) or pulled back (rows of NaN values added to the end)

In [9]:
from pandas import DataFrame
# Build a one-column frame whose 't' column holds the time steps 0..9.
df = DataFrame()
# list(range(...)) materialises the values directly; an identity list
# comprehension over the range would only copy it element by element.
df['t'] = list(range(10))
print(df)

   t
0  0
1  1
2  2
3  3
4  4
5  5
6  6
7  7
8  8
9  9


In [11]:
# A positive shift pushes every observation down one row, so 't-1' holds the
# previous time step's value and the first row has nothing to show (NaN).
df['t-1'] = df['t'].shift(periods=1)
print(df)

   t  t-1
0  0  NaN
1  1  0.0
2  2  1.0
3  3  2.0
4  4  3.0
5  5  4.0
6  6  5.0
7  7  6.0
8  8  7.0
9  9  8.0


In [13]:
# A negative shift pulls values back up: 't+1' holds the next time step's
# value, leaving NaN in the last row.
df['t+1'] = df['t'].shift(periods=-1)
print(df)

   t  t-1  t+1
0  0  NaN  1.0
1  1  0.0  2.0
2  2  1.0  3.0
3  3  2.0  4.0
4  4  3.0  5.0
5  5  4.0  6.0
6  6  5.0  7.0
7  7  6.0  8.0
8  8  7.0  9.0
9  9  8.0  NaN
