In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
data = pd.read_csv("./pqt.csv")
data.head()

Unnamed: 0,doctor,Gender,Age,st
0,ent,m,50,29
1,paediatrics,m,11,10
2,paediatrics,f,20,14
3,ophthalmology,f,50,11
4,o&g,f,34,27


In [3]:
len(data)

369

In [4]:
train, test = np.split(data, [int(len(data) * 0.8)])

In [5]:
print("training data: ", len(train))
print("test data: ", len(test))

training data:  295
test data:  74


In [6]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('st')
    df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [7]:
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


In [8]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of ages:', train_features['Age'])
print('A batch of st:', label_batch )

Every feature: ['doctor', 'Gender', 'Age', 'st']
A batch of ages: tf.Tensor(
[[20]
 [ 2]
 [20]
 [20]
 [62]], shape=(5, 1), dtype=int64)
A batch of st: tf.Tensor([59 23 40 21 24], shape=(5,), dtype=int64)


In [9]:
def get_normalization_layer(name, dataset):
    # Create a Normalization layer for the feature.
    normalizer = tf.keras.layers.Normalization(axis=None)

    # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)

    return normalizer

In [10]:
st_col = train_features['st']
layer = get_normalization_layer('st', train_ds)
layer(st_col)

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[ 1.9428859 ],
       [-0.30134344],
       [ 0.7584315 ],
       [-0.42602286],
       [-0.23900375]], dtype=float32)>

In [11]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a layer that turns strings into integer indices.
    if dtype == 'string':
        index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
    # Otherwise, create a layer that turns integer values into integer indices.
    else:
        index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

    # Prepare a `tf.data.Dataset` that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Encode the integer indices.
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())

    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoder(index(feature)) 

In [12]:
test_doctor_col = train_features['doctor']
test_doctor_layer = get_category_encoding_layer(name='doctor',
                                                dataset=train_ds,
                                                dtype='string')
test_doctor_layer(test_doctor_col)

<tf.Tensor: shape=(5, 10), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]], dtype=float32)>

In [13]:
test_age_col = train_features['Age']
test_age_layer = get_category_encoding_layer(name='Age',
                                            dataset=train_ds,
                                            dtype='int64',
                                            max_tokens=5)
test_age_layer(test_age_col)

<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)>

In [14]:
print("min age: ", min(data['Age']))
print("max age: ", max(data['Age']))

min age:  2
max age:  62


In [15]:
batch_size = 1
train_ds = df_to_dataset(train, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


In [16]:
all_inputs = []
encoded_features = []

In [17]:
age_col = tf.keras.Input(shape=(1,), name='Age', dtype='int64')

encoding_layer = get_category_encoding_layer(name='Age',
                                             dataset=train_ds,
                                             dtype='int64',
                                             max_tokens=5)
encoded_age_col = encoding_layer(age_col)
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)

In [18]:
categorical_cols = ['doctor', 'Gender']

for col in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=col, dtype='string')
    encoding_layer = get_category_encoding_layer(name=col,
                                                 dataset=train_ds,
                                                 dtype='string',
                                                 max_tokens=11)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

In [19]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(64, activation='relu')(all_features)
x = tf.keras.layers.Dense(32, activation='relu')(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

model.compile(optimizer='adam',
              loss='mae',
              metrics=["mae"])

In [20]:
model.fit(train_ds, epochs=10)

Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b43a57b040>

In [21]:
model.evaluate(test_ds)



[4.139502048492432, 4.139502048492432]

In [22]:
sample = {
    'doctor': 'paediatrics',
    'Gender': 'm',
    'Age': 11
}

In [23]:
sample = {
    'doctor': '0&g',
    'Gender': 'f',
    'Age': 34
}

In [24]:
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = model.predict(input_dict)
#prob = tf.nn.sigmoid(predictions[0])
print(predictions)

[[22.37985]]
