In [1]:
from collections import defaultdict
from datetime import datetime
import pandas as pd

In [2]:
# Load the transformed task-environment table. The first CSV row is used as
# the header and the first CSV column becomes the DataFrame index.
df = pd.read_csv('data/task_environments_2021-07-15_transformed.csv', header=0, index_col=0)

In [6]:
### Select columns by type

In [3]:
# Column labels grouped by dtype: float64 columns and object-dtype columns
# (presumably the string-valued ones, going by the variable name).
float_cols = df.select_dtypes(include=['float64']).columns
str_cols = df.select_dtypes(include=['object']).columns

In [21]:
### Fill NAs

In [4]:
# Replace missing values in the float64 columns with the sentinel value -99.
df.loc[:, float_cols] = df.loc[:, float_cols].fillna(-99)
# Disabled: filling missing values in the object columns with 'u'. While this
# stays commented out, NaN values in those columns are left untouched.
#df.loc[:, str_cols] = df.loc[:, str_cols].fillna('u')

In [23]:
### Convert boolean True/False values to integers 1/0

In [5]:
# mask is True for every cell whose Python type is NOT bool.
mask = df.applymap(type) != bool
# NOTE: despite its name, this mapping turns booleans into the integers 1 / 0.
bool_to_str = {True: 1, False: 0}
# Keep the original value where mask is True; everywhere else (the bool cells)
# take the value from the replaced copy, i.e. 1 for True and 0 for False.
df = df.where(mask, df.replace(bool_to_str))

In [6]:
# Display the frame after the fill and boolean-to-integer steps.
df

Unnamed: 0,sequence,seq1,seq2,seq3,seq4,seq5,seq6,seq7,seq8,seq9,...,coordinates_l.z,coordinates_a.x,coordinates_a.y,coordinates_a.z,coordinates_h.x,coordinates_h.y,coordinates_h.z,coordinates_q.x,coordinates_q.y,coordinates_q.z
0,pocgkr,p,o,c,g,k,r,,,,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
1,cgwpcfks,c,g,w,p,c,f,k,s,,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
2,kfsfkspwg,k,f,s,f,k,s,p,w,g,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
3,pfkswkfsococg,p,f,k,s,w,k,f,s,o,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
4,wptgkfsoc,w,p,t,g,k,f,s,o,c,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,hhsdbg,h,h,s,d,b,g,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
186,hhsdgb,h,h,s,d,g,b,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
187,hhsgdb,h,h,s,g,d,b,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0
188,hhsgdb,h,h,s,g,d,b,,,,...,-99.0,-99.0,-99.0,-99.0,0.0,4.0,1.0,-99.0,-99.0,-99.0


In [7]:
def get_unique_values(df):
    """Return the set of distinct characters found in the 'sequence' column.

    Every value of ``df['sequence']`` is iterated element by element (for a
    string: character by character) and all elements are pooled into one set.
    """
    unique_items = set()

    for sequence in df['sequence']:
        # list(...) splits the value into its individual elements.
        unique_items.update(list(sequence))

    return unique_items

In [8]:
# Distinct characters that occur in any value of the 'sequence' column.
unique_items = get_unique_values(df)

In [64]:
def get_sequence_info(df, unique_items):
    """Expand every sequence in ``df`` into one record per character (step).

    Rows are addressed by the integer labels ``0 .. len(df) - 1``. For the
    character at 0-based ``position`` in a row's sequence, the start
    coordinates are read from the columns ``start_coords<position + 1>.x``,
    ``.y`` and ``.z`` of that row.

    Args:
        df: DataFrame with a 'sequence' column and the ``start_coords<k>.x/.y/.z``
            columns for every position that occurs.
        unique_items: iterable of characters for which already-seen flags are
            produced (``None`` is treated as empty).

    Returns:
        A 4-tuple ``(list_dicts, input_target_values, list_already_seen,
        dict_already_seen)`` where, for step ``i`` (steps are numbered across
        all rows in order):

        * ``list_dicts[i]`` holds 'start_coords.x', 'start_coords.y',
          'start_coords.z' and the source 'row' label;
        * ``input_target_values[i]`` is ``[input, target]``; the input is
          '<start>' for the first character of a sequence, otherwise the
          preceding character;
        * ``list_already_seen[i]`` is ``[]`` for the first character of a
          sequence and otherwise the sequence prefix before the character
          (a slice of the sequence, i.e. a ``str`` for string sequences);
        * ``dict_already_seen[i]`` maps ``'<item>.already_seen'`` to 1 if the
          item occurs in that prefix, else 0, for every item of ``unique_items``.
    """
    list_dicts = []
    input_target_values = []
    list_already_seen = []
    # This name used to be returned without ever being assigned here, so the
    # function raised NameError unless a global of that name happened to exist.
    # NOTE(review): the content below is a reconstruction - confirm it matches
    # what callers expect from the fourth return value.
    dict_already_seen = {}

    items = () if unique_items is None else tuple(unique_items)

    for row in range(len(df)):
        sequence = df.loc[row, 'sequence']

        for position, char in enumerate(sequence):
            # Coordinate columns are numbered from 1, positions from 0.
            start_coords_col = 'start_coords' + str(position + 1)
            char_dict = {
                'start_coords.x': df.loc[row, start_coords_col + '.x'],
                'start_coords.y': df.loc[row, start_coords_col + '.y'],
                'start_coords.z': df.loc[row, start_coords_col + '.z'],
                'row': row,
            }

            if position == 0:
                input_value = '<start>'
                already_seen = []
            else:
                input_value = sequence[position - 1]
                already_seen = sequence[:position]

            # The step index is the number of records emitted so far.
            dict_already_seen[len(list_dicts)] = {
                item + '.already_seen': int(item in already_seen)
                for item in items
            }

            input_target_values.append([input_value, char])
            list_already_seen.append(already_seen)
            list_dicts.append(char_dict)

    return list_dicts, input_target_values, list_already_seen, dict_already_seen

In [25]:
def get_row_info(df):
    """Collect, per row of ``df``, the columns belonging to its sequence's characters.

    Rows are addressed by the integer labels ``0 .. len(df) - 1``. For every
    character ``c`` in ``df.loc[row, 'sequence']`` the row's values of
    ``coordinates_c.x``, ``coordinates_c.y``, ``coordinates_c.z``,
    ``c.containment``, ``c.food_k``, ``c.strong_k`` and ``c.mid_k`` are copied.

    Returns:
        dict mapping row label -> {column name: value}. A row whose sequence
        is empty gets no entry.
    """
    axis_suffixes = ('.x', '.y', '.z')
    property_suffixes = ('.containment', '.food_k', '.strong_k', '.mid_k')
    dicts_row = {}

    for row in range(len(df)):
        sequence = df.loc[row, 'sequence']
        row_dict = {}

        for char in sequence:
            # Coordinates first, then the per-character properties.
            wanted = ['coordinates_' + char + suffix for suffix in axis_suffixes]
            wanted += [char + suffix for suffix in property_suffixes]

            for column in wanted:
                row_dict[column] = df.loc[row, column]

            # Registered inside the character loop, so empty sequences are skipped.
            dicts_row[row] = row_dict

    return dicts_row

In [65]:
# Expand the sequences into per-step records, [input, target] pairs and
# already-seen prefixes (one entry per character across all rows).
list_dicts, input_target_values, list_already_seen, dict_already_seen = get_sequence_info(df, unique_items)

In [12]:
# Per-row lookup of the coordinate and property columns for the characters
# that occur in each row's sequence.
dicts_row = get_row_info(df)

In [66]:
# Sanity check: number of source rows that produced an entry.
len(dicts_row)

190

In [72]:
# Sanity check: number of per-step entries (one per character over all sequences).
len(list_already_seen)

1462

In [73]:
# One row per step, built from the per-step records.
single_step_df = pd.DataFrame(list_dicts)
# Add 'input' and 'target' as the first two columns, initialised to 0.
single_step_df.insert(loc=0, column='input', value=0)
single_step_df.insert(loc=1, column='target', value=0)

In [74]:
# Fill 'input' and 'target' with one whole-column assignment each instead of
# writing cell by cell: input_target_values holds one [input, target] pair per
# row of single_step_df, in row order. Assigning the full column also avoids
# pushing strings one at a time into the integer-initialised columns.
single_step_df['input'] = [pair[0] for pair in input_target_values]
single_step_df['target'] = [pair[1] for pair in input_target_values]

In [76]:
# Add one '<item>.already_seen' column per known item: 1.0 where the item
# occurs in that step's already-seen prefix, 0.0 otherwise. Each column is
# assigned in one go (list_already_seen has one entry per row, in row order).
# Values are floats to match what the former cell-by-cell .loc writes produced.
for item in unique_items:
    single_step_df[item + '.already_seen'] = [
        float(item in elem) for elem in list_already_seen
    ]

In [79]:
# Display the per-step table.
single_step_df

Unnamed: 0,input,target,start_coords.x,start_coords.y,start_coords.z,row,r.already_seen,q.already_seen,n.already_seen,l.already_seen,...,h.food_k,h.strong_k,h.mid_k,coordinates_q.x,coordinates_q.y,coordinates_q.z,q.containment,q.food_k,q.strong_k,q.mid_k
0,<start>,p,-0.451354,-0.413918,0.156247,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,p,o,0.513000,-0.531000,0.740000,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,o,c,0.513000,-0.531000,0.740000,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,c,g,0.513000,-0.531000,0.740000,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,g,k,0.513000,-0.531000,0.740000,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,h,h,0.000000,4.000000,1.000000,189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
1458,h,s,0.000000,4.000000,1.000000,189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
1459,s,d,0.000000,2.000000,2.000000,189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
1460,d,g,0.000000,1.000000,3.000000,189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,,,,


In [78]:
# Copy the per-row coordinate/property values onto every step that came from
# that source row. dicts_row is keyed by the source row label, so a direct
# dictionary lookup replaces scanning all of its keys for each step.
for row in range(len(single_step_df)):
    values = dicts_row.get(single_step_df.loc[row, 'row'])
    if values is None:
        # No row info was collected for this source row.
        continue
    for k, v in values.items():
        single_step_df.loc[row, k] = v

  self.obj[key] = infer_fill_value(value)


In [81]:
# Build the output path, stamped with today's date in YYYY-MM-DD form.
date = datetime.today().strftime('%Y-%m-%d')
filename = ''.join(['data/single_step_df_ints_', str(date), '.csv'])

filename

'data/single_step_df_ints_2022-03-15.csv'

In [82]:
# Write the per-step table to CSV with a header row and without the index.
single_step_df.to_csv(filename, index=False, header=True)