In [1]:
import boto3
import pandas as pd

# On an EC2 Ubuntu machine, read the data directly from S3.
# Substitute these two values for your own S3 bucket name and object key.
# They are defined once so the resource handle and the download below cannot
# point at different buckets.
BUCKET_NAME = 'shuo-zhang-bb'
DATA_KEY = 'banking/banking.csv'

client = boto3.client('s3')  # low-level functional API

resource = boto3.resource('s3')  # high-level object-oriented API
my_bucket = resource.Bucket(BUCKET_NAME)

# Fetch the object with the low-level client and hand the 'Body' entry of the
# response straight to pandas.
obj = client.get_object(Bucket=BUCKET_NAME, Key=DATA_KEY)
df = pd.read_csv(obj['Body'])

print(df.head())

   age          job  marital          education  default housing loan  \
0   44  blue-collar  married           basic.4y  unknown     yes   no   
1   53   technician  married            unknown       no      no   no   
2   28   management   single  university.degree       no     yes   no   
3   39     services  married        high.school       no      no   no   
4   55      retired  married           basic.4y       no     yes   no   

    contact month day_of_week  ...  campaign  pdays  previous     poutcome  \
0  cellular   aug         thu  ...         1    999         0  nonexistent   
1  cellular   nov         fri  ...         1    999         0  nonexistent   
2  cellular   jun         thu  ...         3      6         2      success   
3  cellular   apr         fri  ...         2    999         0  nonexistent   
4  cellular   aug         fri  ...         1      3         1      success   

  emp_var_rate  cons_price_idx  cons_conf_idx  euribor3m  nr_employed  y  
0          1.4   

# ML toy example

https://mmuratarat.github.io/2019-06-12/embeddings-with-numeric-variables-Keras

Below I changed the target from continuous to binary categorical, and then changed the output activation (to softmax), the loss, the number of epochs, etc. so that the model performs classification.

In [32]:

#example of building ML model with categorical and numeric input features
import tensorflow as tf
from tensorflow import keras
import numpy as np

# Numeric block: 10 samples x 3 features.
num_data = np.random.random(size=(10, 3))

# Categorical block: one variable with 4 levels, stored as integer codes 0..3.
# In NLP terms this is a vocabulary of size 4, but the input length should be 1
# if we think in terms of the Embedding layer arguments. For categorical data
# that is not text we may therefore not want to copy the NLP paradigm, and may
# prefer something more efficient, such as:
# https://medium.com/@satnalikamayank12/on-learning-embeddings-for-categorical-data-using-keras-165ff2773fc9
cat_data = np.random.randint(low=0, high=4, size=10)

# One-hot encode the codes, since the tutorial model expects a categorical
# input of shape (4,): row i of the identity matrix is the one-hot vector for
# code i, so indexing the identity matrix with cat_data yields one row per sample.
n_levels_seen = cat_data.max() + 1
one_hot_encoded_cat_data = np.eye(n_levels_seen)[cat_data]

In [34]:
# Display the numeric feature matrix (one row per sample).
num_data


array([[0.8132253 , 0.90926582, 0.22090073],
       [0.89826491, 0.93630556, 0.66557909],
       [0.16787571, 0.38787317, 0.47307567],
       [0.80825125, 0.82991306, 0.98040159],
       [0.40792785, 0.10344957, 0.56987869],
       [0.62457835, 0.83833851, 0.50070103],
       [0.08522605, 0.95080124, 0.18189365],
       [0.56925521, 0.98577432, 0.48042421],
       [0.18712421, 0.76607558, 0.6437047 ],
       [0.12180947, 0.68582759, 0.43736882]])

In [35]:
# Shape check: (number of samples, number of numeric features).
num_data.shape

(10, 3)

In [4]:
# Display the one-hot matrix: each row holds a single 1, in the column given by
# that sample's category code.
one_hot_encoded_cat_data

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [10]:
# The identity matrix used as the one-hot lookup table: row i is the one-hot
# vector for category code i.
np.eye(cat_data.max()+1)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [7]:
# The raw integer category codes, one per sample.
cat_data

array([1, 0, 2, 3, 0, 2, 1, 3, 2, 0])

In [14]:
# Keep the identity lookup table in `a` so the one-hot trick can be inspected step by step.
a=np.eye(cat_data.max()+1)

In [15]:
# Square matrix: one row and one column per category code from 0 to cat_data.max().
a.shape

(4, 4)

In [33]:
# Integer-array indexing: for each sample, pick row cat_data[i] of `a`.
# The result is the one-hot encoding of cat_data.
a[cat_data]

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [37]:
# Number of distinct category codes that actually occur in cat_data.
len(np.unique(cat_data))

4

In [59]:

# Draw 10 random binary class labels, then one-hot encode them into a
# (10, num_classes) matrix of class indicators.
num_classes = 2
rng = np.random.default_rng()
target = keras.utils.to_categorical(rng.choice(2, 10), num_classes)

In [60]:
# Display the one-hot encoded target: one row per sample, one column per class.
target

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [39]:
# Model specification and compilation. No concrete input data is passed to the
# model here; the arrays are only consulted for their shapes.

# The tutorial's original target was continuous real numbers:
#target =  np.random.random(size=(10,1))
# Here we use a binary categorical target instead, and change the output
# activation accordingly.
rng = np.random.default_rng()
target = rng.choice(2, 10)
num_classes=2
# Convert class vectors to binary class matrices.
target = keras.utils.to_categorical(target, num_classes)

# Width of the categorical input. Take it from the one-hot matrix that is
# actually fed to fit(): len(np.unique(cat_data)) undercounts whenever the
# random draw misses a level below the maximum, and the Input layer would then
# no longer match the one-hot matrix (which has cat_data.max()+1 columns).
no_of_unique_cat  = one_hot_encoded_cat_data.shape[1]
#Jeremy Howard provides the following rule of thumb; embedding size = min(50, number of categories/2).
embedding_size = min(np.ceil((no_of_unique_cat)/2), 50 )
embedding_size = int(embedding_size) # with 4 categories: 4/2 = 2


# Numeric features
inp_num_data = keras.layers.Input(shape=(num_data.shape[1],)) #(None, 3) #doc on input layer: https://keras.io/layers/core/



######################## This doesn't make sense!!!!!!!!
# Categorical features -> embedding
# Use Input layers, specify input shape (all dimensions except the first/batch one)
inp_cat_data = keras.layers.Input(shape=(no_of_unique_cat,)) # 4 - so it assumes
# the one-hot encoded input, which doesn't make sense compared to the NLP example.
# In NLP, the Input dimension should be the input_length, i.e. the maximum number
# of words expected in a sentence. What makes this more obviously wrong is that the
# input length and the Embedding input_dim below are both equal to
# no_of_unique_cat, which looks like a misunderstanding.
# NOTE(review): Embedding presumably treats every input value as an integer index,
# so a 0/1 one-hot row would only ever look up rows 0 and 1 - confirm against the
# Keras Embedding documentation.


# Bind the one-hot input to the embedding layer
emb = keras.layers.Embedding(input_dim=no_of_unique_cat, output_dim=embedding_size)(inp_cat_data)  #emb size (None, 4, 2)
# The embedded output of shape (None, 4, 2) must be flattened to (None, 8) -
# otherwise it's not possible to concatenate it with inp_num_data
flatten = keras.layers.Flatten()(emb) # after flatten => (None, 8)


# Concatenate the two branches
conc = keras.layers.Concatenate()([flatten, inp_num_data])
dense1 = keras.layers.Dense(3, activation=tf.nn.relu, )(conc) #doc on dense layer: https://keras.io/layers/core/

# Output layer
# In the original regression tutorial this layer had no activation; for the
# two-class target it is a softmax over num_classes units.
# See the "dense vs activation layers" notes further down.
out = keras.layers.Dense(num_classes, activation='softmax')(dense1)

# Functional API: the model is only now strung together from its inputs and output.
# inputs is a list
# https://keras.io/models/model/
# In the case of multi-input or multi-output models, you can use lists as well:
# model = Model(inputs=[a1, a2], outputs=[b1, b2, b3])

model = keras.Model(inputs=[inp_cat_data, inp_num_data], outputs=out)

# for continuous targets
#model.compile(optimizer=tf.train.AdamOptimizer(0.01),
              #loss=keras.losses.mean_squared_error,
             # metrics=[keras.metrics.mean_squared_error])
opt = keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6)

# Categorical cross-entropy pairs with the one-hot target and softmax output.
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])


In [49]:
# The two-column one-hot target is well suited to a softmax output layer with
# two units. Alternatively, use a single output unit with a sigmoid activation
# and adjust the dimension of the target accordingly.
target

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [34]:
# Embedding width chosen by the min(50, n_categories/2) rule of thumb.
embedding_size

2

In [36]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 4, 2)         8           input_2[0][0]                    
__________________________________________________________________________________________________
flatten (Flatten)               (None, 8)            0           embedding[0][0]                  
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 3)]          0                                            
______________________________________________________________________________________________

In [40]:
# Train the two-input model. The input list order must match the `inputs`
# order given to keras.Model: categorical (one-hot) first, numeric second.
epochs=20
model.fit([one_hot_encoded_cat_data, num_data], target,epochs=epochs)
# (A pasted log from the earlier regression run - it reported mean_squared_error -
# was removed here because it no longer matches the accuracy metric compiled above.)

# Find the Embedding layer by type instead of relying on its position in
# model.layers, then show its weight matrix (one row per input index).
embedding_layer = next(
    layer for layer in model.layers
    if isinstance(layer, keras.layers.Embedding)
)
embedding_layer.get_weights()[0]


Train on 10 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


array([[-0.00913948, -0.05074099],
       [-0.01812365,  0.03708987],
       [-0.03673273, -0.04423362],
       [-0.04698031, -0.00359062]], dtype=float32)

In [43]:
# List the model's layers in order; the index of a layer here is what
# model.layers[i] refers to.
model.layers

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x1455f33d0>,
 <tensorflow.python.keras.layers.embeddings.Embedding at 0x1455f3d10>,
 <tensorflow.python.keras.layers.core.Flatten at 0x1455f3d90>,
 <tensorflow.python.keras.engine.input_layer.InputLayer at 0x1455ed110>,
 <tensorflow.python.keras.layers.merge.Concatenate at 0x145525f10>,
 <tensorflow.python.keras.layers.core.Dense at 0x1455edf90>,
 <tensorflow.python.keras.layers.core.Dense at 0x145644610>]

In [45]:
# Weights of the layer at index 1 (the Embedding layer in the model.layers
# listing). get_weights() returns a list of arrays.
w=model.layers[1].get_weights()
w

[array([[-0.00913948, -0.05074099],
        [-0.01812365,  0.03708987],
        [-0.03673273, -0.04423362],
        [-0.04698031, -0.00359062]], dtype=float32)]

In [48]:
# Shape of the embedding matrix: (input_dim, output_dim).
w[0].shape

(4, 2)

In [None]:
# NOTE(review): unfinished, never-executed cell. `w_` is not defined anywhere in
# the visible notebook, so running this raises NameError - complete or remove it.
w_

## Notes: dense vs activation layers


Dense: Just your regular densely-connected NN layer.

Dense implements the operation: ```output = activation(dot(input, kernel) + bias)``` where activation is the element-wise activation function passed as the activation argument, kernel is a weights matrix created by the layer, and bias is a bias vector created by the layer (only applicable if use_bias is True).

Note: if the input to the layer has a rank greater than 2, then it is flattened prior to the initial dot product with kernel.

example from above: ```dense1 = keras.layers.Dense(3, activation=tf.nn.relu, )(conc) ```

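However, if ```activation``` is left at its default of ```None```, no activation is applied and the layer is purely linear, as in the example below, where the activation is added as a separate layer.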

https://keras.io/layers/core/

### example using both

```
model.add(Dense(512)) # here activation is default to None

model.add(Activation('relu'))
```

https://keras.io/examples/cifar10_cnn/

# now apply on banking data

In [3]:
# Preview the first rows of the banking data loaded from S3.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [9]:
from pandas.api.types import is_numeric_dtype, is_string_dtype

# Sanity-check the dtype helpers on two columns of the banking frame before
# using them to split the features.
print(is_string_dtype(df['emp_var_rate']))
print(is_numeric_dtype(df['euribor3m']))


False
True


In [16]:

# Partition the columns of df by dtype: numeric columns go to num_features,
# everything else is treated as categorical. Each dict maps column name -> Series.
num_features = {col: df[col] for col in df if is_numeric_dtype(df[col])}
cat_features = {col: df[col] for col in df if not is_numeric_dtype(df[col])}
    

In [19]:
# Rebuild one DataFrame per feature group from the column-name -> Series dicts.
df_num = pd.DataFrame(num_features)
df_cat = pd.DataFrame(cat_features)

In [21]:
# Preview the non-numeric (categorical) columns.
df_cat.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,nonexistent
1,technician,married,unknown,no,no,no,cellular,nov,fri,nonexistent
2,management,single,university.degree,no,yes,no,cellular,jun,thu,success
3,services,married,high.school,no,no,no,cellular,apr,fri,nonexistent
4,retired,married,basic.4y,no,yes,no,cellular,aug,fri,success


## example for one hot encode

In [62]:
# Label encoding: map each category name to an integer code.
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Toy frame with a single categorical column.
bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
bridge_df = pd.DataFrame({'Bridge_Types': bridge_types})

# Fit the encoder on the names and store the resulting integer codes in a second
# column (in the output, the codes follow the alphabetical order of the names).
labelencoder = LabelEncoder()
bridge_type_codes = labelencoder.fit_transform(bridge_df['Bridge_Types'])
bridge_df['Bridge_Types_Cat'] = bridge_type_codes
bridge_df

Unnamed: 0,Bridge_Types,Bridge_Types_Cat
0,Arch,0
1,Beam,1
2,Truss,6
3,Cantilever,3
4,Tied Arch,5
5,Suspension,4
6,Cable,2


In [65]:
# Double brackets select a one-column DataFrame, so the resulting array is 2-D
# with a single column.
np.array(bridge_df[['Bridge_Types_Cat']])

array([[0],
       [1],
       [6],
       [3],
       [5],
       [4],
       [2]])

In [63]:
# One-hot encoding of the label-encoded bridge types.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Encoder instance.
enc = OneHotEncoder(handle_unknown='ignore')

# Encode the Bridge_Types_Cat column (kept as a one-column DataFrame) and
# convert the encoder's result to a dense array.
bridge_codes_frame = bridge_df[['Bridge_Types_Cat']]
enc_df = enc.fit_transform(bridge_codes_frame).toarray()

# Merging back into bridge_df is left disabled. Note that enc_df is an array
# (see the output), not a DataFrame.
#bridge_df = bridge_df.join(enc_df)
#bridge_df
enc_df


array([[1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.]])

## embedding layer example from Keras

In [53]:
from keras.layers import Embedding
from keras.models import Sequential

# NOTE: this rebinds the notebook-level name `model`, replacing the two-input
# classifier built earlier in the notebook.
# A single Embedding layer: vocabulary of 1000 indices, 64-dimensional vectors,
# sequences of length 10.
model = Sequential([Embedding(1000, 64, input_length=10)])
# The model takes as input an integer matrix of size (batch, input_length).
# The largest integer (i.e. word index) in the input should be
# no larger than 999 (vocabulary size).
# model.output_shape is then (None, 10, 64), where None is the batch dimension.

# An array of shape (32, 10) whose integer values lie in the vocabulary range [0, 1000).
input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
assert output_array.shape == (32, 10, 64)




In [55]:
# 32 sequences ("sentences"), each 10 integer word indices long.
input_array.shape

(32, 10)

In [56]:
# One 64-dimensional embedding for each of the 10 words in each of the 32 sentences.
output_array.shape

(32, 10, 64)

### thoughts

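In this case (the NLP case) the input is not one-hot encoded: each sample is a sequence of integer word indices. Our toy example above fed one-hot vectors into the Embedding layer instead, so that tutorial example may have some flaws. The output dimension of (None, 4, 2) only makes sense in the non-one-hot case, i.e. when the input is a length-4 sequence of integer indices.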

# let's use this new tutorial

https://github.com/mayanksatnalika/ipython/tree/master/embeddings%20project/cycle_sharing

we will talk through the other notebook: embeddings project/cycle_sharing

## back to banking.csv

We have seen the flaws of the toy example above and dug into how NLP embeddings work. For the banking data we therefore decided it is better to follow this tutorial instead: https://medium.com/@satnalikamayank12/on-learning-embeddings-for-categorical-data-using-keras-165ff2773fc9

# scrappy

In [48]:
# Scratch check: draw 10 integers uniformly from {0, 1} with NumPy's Generator API.
# (Doctest-style ">>>" prompts removed so the cell is plain Python; the values
# differ on every run because the generator is unseeded.)
rng = np.random.default_rng()
rng.choice(2, 10)

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0])