In [157]:
import ast
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, MultiLabelBinarizer
from tensorflow import keras, feature_column
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout

In [204]:
# Load the pre-transformed task-environment table. The first CSV row supplies
# the column names and the first CSV column becomes the row index.
df = pd.read_csv(
    'data/task_environments_2021-07-15_transformed.csv',
    header=0,
    index_col=0,
)

In [205]:
# Group the columns by dtype: each group gets its own missing-value sentinel.
float_cols = df.select_dtypes(include=['float64']).columns
str_cols = df.select_dtypes(include=['object']).columns

# Overwrite the gaps in place: -99 stands for a missing float value, 'u' for a
# missing value in an object (string) column.
df.loc[:, float_cols] = df[float_cols].fillna(value=-99)
df.loc[:, str_cols] = df[str_cols].fillna(value='u')

In [206]:
# Replacement text for genuine boolean cells.
bool_to_str = {True: 'TRUE', False: 'FALSE'}

# True for every cell whose value is NOT a Python bool, i.e. the cells that must
# be kept exactly as they are.
mask = df.applymap(lambda value: not isinstance(value, bool))

# Keep the masked cells; everywhere else take the value from the replaced frame.
df = df.where(mask, other=df.replace(bool_to_str))

In [207]:
# Display the frame after the sentinel fill and the 'TRUE'/'FALSE' conversion.
df

Unnamed: 0,sequence,seq1,seq2,seq3,seq4,seq5,seq6,seq7,seq8,seq9,...,coordinates_l.z,coordinates_a.x,coordinates_a.y,coordinates_a.z,coordinates_h.x,coordinates_h.y,coordinates_h.z,coordinates_q.x,coordinates_q.y,coordinates_q.z
0,pocgkr,p,o,c,g,k,r,u,u,u,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
1,cgwpcfks,c,g,w,p,c,f,k,s,u,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
2,kfsfkspwg,k,f,s,f,k,s,p,w,g,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
3,pfkswkfsococg,p,f,k,s,w,k,f,s,o,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
4,wptgkfsoc,w,p,t,g,k,f,s,o,c,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,hhsdbg,h,h,s,d,b,g,u,u,u,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
186,hhsdgb,h,h,s,d,g,b,u,u,u,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
187,hhsgdb,h,h,s,g,d,b,u,u,u,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
188,hhsgdb,h,h,s,g,d,b,u,u,u,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0


In [208]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# A fixed random_state makes the shuffle, and so every downstream metric,
# repeatable across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [209]:
# Row counts of the training and test partitions, space-separated.
print(*map(len, (train, test)))

121 38


In [210]:
# One label encoder fitted on the whole table. Re-fitting an encoder on every
# split would give each split its own set of label columns (different width and
# different column meaning whenever a split lacks one of the characters).
sequence_encoder = MultiLabelBinarizer().fit(list(df['sequence']))


def df_to_dataset(dataframe, shuffle=True, batch_size=32, encoder=None):
    """Turn a DataFrame split into a batched ``tf.data.Dataset`` of (features, labels).

    Each ``sequence`` string is treated as a collection of characters and is
    encoded as a multi-hot row with one column per character known to the
    encoder. Every other column is passed through as a per-column feature.

    Args:
        dataframe: Frame holding a ``sequence`` column plus the feature columns.
            The caller's frame is not modified.
        shuffle: When true, shuffle with a buffer that covers the whole frame.
        batch_size: Number of rows per batch.
        encoder: An already fitted ``MultiLabelBinarizer``. Defaults to
            ``sequence_encoder`` (fitted on the full table) so that every split
            shares identical label columns.

    Returns:
        A ``tf.data.Dataset`` yielding ``(feature_dict, multi_hot_labels)`` batches.
    """
    if encoder is None:
        encoder = sequence_encoder

    # transform (not fit_transform): the label layout must not depend on which
    # rows happen to be in this particular split.
    transformed_labels = encoder.transform(list(dataframe['sequence']))

    # NOTE(review): the seq1..seq17 columns stay in the features. In the rows
    # displayed above they spell out the same string as `sequence`, so the model
    # may be able to read the label from its inputs -- confirm this is intended.
    features = dataframe.drop(columns='sequence')

    dataset = tf.data.Dataset.from_tensor_slices((dict(features), transformed_labels))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))

    return dataset.batch(batch_size)

In [148]:
#batch_size = 10

In [193]:
#train_ds = df_to_dataset(train, batch_size=batch_size)

In [219]:
def create_input_data(dataframe):
    """Build the list of TensorFlow feature columns for ``dataframe``.

    Columns are selected by name:
      * names containing 'coord' become numeric columns;
      * otherwise, names containing 'containment', 'food', 'mid' or 'strong'
        become 10-dimensional embeddings over the vocabulary ['TRUE', 'FALSE'];
      * seq1 .. seq17 become 8-dimensional embeddings whose vocabulary is the
        set of values found in that column of ``dataframe``.

    Returns:
        The feature columns: name-matched ones first (in frame order), followed
        by the seq1 .. seq17 embeddings.
    """
    flag_keywords = ('containment', 'food', 'mid', 'strong')
    sequence_position_cols = ['seq%d' % position for position in range(1, 18)]

    feature_columns = []

    for header in dataframe.columns:
        if 'coord' in header:
            feature_columns.append(feature_column.numeric_column(header))
            continue
        if any(keyword in header for keyword in flag_keywords):
            flag_col = feature_column.categorical_column_with_vocabulary_list(
                header, ['TRUE', 'FALSE'])
            feature_columns.append(
                feature_column.embedding_column(flag_col, dimension=10))

    for name in sequence_position_cols:
        # Vocabulary = every distinct value present in this column.
        vocabulary = dataframe[name].unique()
        position_col = feature_column.categorical_column_with_vocabulary_list(
            name, vocabulary)
        feature_columns.append(
            feature_column.embedding_column(position_col, dimension=8))

    return feature_columns

In [220]:
# Derive the feature columns from the full table rather than from one split, so
# the seq-column vocabularies include the values found in every split.
feature_cols = create_input_data(df)

In [221]:
# DenseFeatures layer built from the feature columns; used as the model's first layer.
feature_layer = tf.keras.layers.DenseFeatures(feature_cols)

In [222]:
# Batch every split with the same batch size. Only the training data is
# shuffled (df_to_dataset's default); validation and test keep their row order.
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [223]:
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dropout(.1),
    layers.Dense(1)
])

In [224]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=['accuracy'])

In [225]:
# Train for 10 epochs on the training batches, validating on val_ds after each epoch.
model.fit(train_ds, validation_data=val_ds, epochs=10)

Train for 4 steps, validate for 1 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f563307e690>

In [226]:
# Score the held-out test batches. evaluate returns the loss followed by the
# compiled metric, hence the two-name unpacking.
loss, accuracy = model.evaluate(test_ds)

