In [54]:
from datetime import datetime
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, MultiLabelBinarizer
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout

In [2]:
df = pd.read_csv('data/task_environments_2021-07-15_transformed.csv', header=0, index_col=0)

In [3]:
### Select columns by type

In [4]:
float_cols = df.select_dtypes(include=['float64']).columns
str_cols = df.select_dtypes(include=['object']).columns

In [5]:
### Fill NAs

In [6]:
df.loc[:, float_cols] = df.loc[:, float_cols].fillna(-99)
#df.loc[:, str_cols] = df.loc[:, str_cols].fillna('u')

In [7]:
### Convert str to bool for true/false values

In [8]:
mask = df.applymap(type) != bool
bool_to_str = {True: 'TRUE', False: 'FALSE'}
df = df.where(mask, df.replace(bool_to_str))

In [9]:
df

Unnamed: 0,sequence,seq1,seq2,seq3,seq4,seq5,seq6,seq7,seq8,seq9,...,coordinates_l.z,coordinates_a.x,coordinates_a.y,coordinates_a.z,coordinates_h.x,coordinates_h.y,coordinates_h.z,coordinates_q.x,coordinates_q.y,coordinates_q.z
0,pocgkr,p,o,c,g,k,r,,,,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
1,cgwpcfks,c,g,w,p,c,f,k,s,,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
2,kfsfkspwg,k,f,s,f,k,s,p,w,g,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
3,pfkswkfsococg,p,f,k,s,w,k,f,s,o,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
4,wptgkfsoc,w,p,t,g,k,f,s,o,c,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,hhsdbg,h,h,s,d,b,g,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
186,hhsdgb,h,h,s,d,g,b,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
187,hhsgdb,h,h,s,g,d,b,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
188,hhsgdb,h,h,s,g,d,b,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0


In [10]:
single_step_df = pd.DataFrame()

In [48]:
list_dicts = []

for row in range(0, len(df)):
    for position, char in enumerate(df.loc[row, 'sequence'][:-1]):
        if position == 0:
            dict_row = {}
            input_value = '<start>'
            target_value = char
            dict_row['input'] = input_value
            dict_row['target'] = target_value
            
            start_coords_col = 'start_coords' + str(position+1)
            dict_row['start_coords_x'] = df.loc[row, str(start_coords_col + '.x')]
            dict_row['start_coords_y'] = df.loc[row, str(start_coords_col + '.y')]
            dict_row['start_coords_z'] = df.loc[row, str(start_coords_col + '.z')]
            
            char_coordinates_x = 'coordinates_' + char + '.x'
            char_coordinates_y = 'coordinates_' + char + '.y'
            char_coordinates_z = 'coordinates_' + char + '.z'
            dict_row[char_coordinates_x] = df.loc[row, char_coordinates_x]
            dict_row[char_coordinates_y] = df.loc[row, char_coordinates_y]
            dict_row[char_coordinates_z] = df.loc[row, char_coordinates_z]
            
            char_containment = char + '.containment'
            char_strong_k = char + '.strong_k'
            char_mid_k = char + '.mid_k'
            char_food_k = char + '.food_k'
            dict_row[char_containment] = df.loc[row, char_containment]
            dict_row[char_food_k] = df.loc[row, char_food_k]
            dict_row[char_strong_k] = df.loc[row, char_strong_k]
            dict_row[char_mid_k] = df.loc[row, char_mid_k]
            
            list_dicts.append(dict_row)
        else:
            dict_row = {}
            input_value = char
            target_value = df.loc[row, 'sequence'][position+1]
            dict_row['input'] = input_value
            dict_row['target'] = target_value
            
            start_coords_col = 'start_coords' + str(position+1)
            dict_row['start_coords_x'] = df.loc[row, str(start_coords_col + '.x')]
            dict_row['start_coords_y'] = df.loc[row, str(start_coords_col + '.y')]
            dict_row['start_coords_z'] = df.loc[row, str(start_coords_col + '.z')]
            
            char_coordinates_x = 'coordinates_' + char + '.x'
            char_coordinates_y = 'coordinates_' + char + '.y'
            char_coordinates_z = 'coordinates_' + char + '.z'
            dict_row[char_coordinates_x] = df.loc[row, char_coordinates_x]
            dict_row[char_coordinates_y] = df.loc[row, char_coordinates_y]
            dict_row[char_coordinates_z] = df.loc[row, char_coordinates_z]
            
            char_containment = char + '.containment'
            char_strong_k = char + '.strong_k'
            char_mid_k = char + '.mid_k'
            char_food_k = char + '.food_k'
            dict_row[char_containment] = df.loc[row, char_containment]
            dict_row[char_food_k] = df.loc[row, char_food_k]
            dict_row[char_strong_k] = df.loc[row, char_strong_k]
            dict_row[char_mid_k] = df.loc[row, char_mid_k]
            
            list_dicts.append(dict_row)
            
    # for coords cols:
    # check if object in sequence, if yes, add to new df
print(list_dicts)

[{'input': '<start>', 'target': 'p', 'start_coords_x': -0.451354, 'start_coords_y': -0.413918, 'start_coords_z': 0.156247, 'coordinates_p.x': 0.008034, 'coordinates_p.y': 0.957082, 'coordinates_p.z': 0.6890539999999999, 'p.containment': 'FALSE', 'p.food_k': 'FALSE', 'p.strong_k': 'FALSE', 'p.mid_k': 'TRUE'}, {'input': 'o', 'target': 'c', 'start_coords_x': 0.513, 'start_coords_y': -0.531, 'start_coords_z': 0.74, 'coordinates_o.x': -0.185, 'coordinates_o.y': 0.928, 'coordinates_o.z': 0.481035, 'o.containment': 'FALSE', 'o.food_k': 'FALSE', 'o.strong_k': 'FALSE', 'o.mid_k': 'TRUE'}, {'input': 'c', 'target': 'g', 'start_coords_x': 0.513, 'start_coords_y': -0.531, 'start_coords_z': 0.74, 'coordinates_c.x': -0.525007, 'coordinates_c.y': 0.923283, 'coordinates_c.z': 0.438, 'c.containment': 'FALSE', 'c.food_k': 'FALSE', 'c.strong_k': 'FALSE', 'c.mid_k': 'FALSE'}, {'input': 'g', 'target': 'k', 'start_coords_x': 0.513, 'start_coords_y': -0.531, 'start_coords_z': 0.74, 'coordinates_g.x': -0.59500

In [49]:
single_step_df = pd.DataFrame(list_dicts)

In [50]:
single_step_df

Unnamed: 0,input,target,start_coords_x,start_coords_y,start_coords_z,coordinates_p.x,coordinates_p.y,coordinates_p.z,p.containment,p.food_k,...,h.food_k,h.strong_k,h.mid_k,coordinates_q.x,coordinates_q.y,coordinates_q.z,q.containment,q.food_k,q.strong_k,q.mid_k
0,<start>,p,-0.451354,-0.413918,0.156247,0.008034,0.957082,0.689054,FALSE,FALSE,...,,,,,,,,,,
1,o,c,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
2,c,g,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
3,g,k,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
4,k,r,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,<start>,h,1.000000,3.000000,2.000000,,,,,,...,FALSE,FALSE,FALSE,,,,,,,
1268,h,s,0.000000,4.000000,1.000000,,,,,,...,FALSE,FALSE,FALSE,,,,,,,
1269,s,d,0.000000,4.000000,1.000000,,,,,,...,,,,,,,,,,
1270,d,g,0.000000,2.000000,2.000000,,,,,,...,,,,,,,,,,


In [57]:
date = datetime.today().strftime('%Y-%m-%d')
filename = 'data/single_step_df_' + str(date) + '.csv'

filename

'data/single_step_df_2021-12-16.csv'

In [56]:
single_step_df.to_csv(filename, index=True, header=True)