In [1]:
from collections import defaultdict
from datetime import datetime
import pandas as pd

In [2]:
def read_data(path_to_csv):
    """Load a CSV file and replace missing floats and boolean cells by numbers.

    Parameters
    ----------
    path_to_csv : str or path-like
        CSV file to read. Its first line is used as the header and its first
        column as the index of the returned frame.

    Returns
    -------
    pandas.DataFrame
        The loaded frame in which missing values of ``float64`` columns are
        replaced by ``-99`` and every cell holding a Python ``bool`` is
        replaced by ``1`` (``True``) or ``0`` (``False``). All other cells are
        left as read.
    """
    df = pd.read_csv(path_to_csv, header=0, index_col=0)

    # Only float columns receive the -99 sentinel; missing values in columns
    # of any other dtype are kept as they are.
    float_cols = df.select_dtypes(include=['float64']).columns
    df.loc[:, float_cols] = df.loc[:, float_cols].fillna(-99)

    # Cell-wise mask that is True wherever the cell is NOT a bool. It is built
    # column by column with Series.map, which works on old and new pandas
    # alike (DataFrame.applymap is deprecated since pandas 2.1).
    not_bool = df.apply(
        lambda column: column.map(lambda value: type(value) != bool)
    )

    # Keep non-bool cells untouched and take the 1/0 replacement elsewhere.
    # The mask matters: True == 1 and False == 0, so an unmasked replace
    # would also match numeric cells.
    bool_to_int = {True: 1, False: 0}
    df = df.where(not_bool, df.replace(bool_to_int))

    return df

In [3]:
def get_unique_values(df):
    """Return the set of distinct elements found in the ``sequence`` column.

    Every entry of ``df['sequence']`` is iterated (a string contributes its
    individual characters) and all elements are pooled into one set.
    """
    return {item for sequence in df['sequence'] for item in sequence}

In [9]:
def get_sequence_info(df, unique_items):
    """Expand every row's ``sequence`` into one record per sequence position.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``sequence`` column and, for every position ``p`` (1-based)
        that occurs in a sequence, the columns ``start_coords<p>.x``,
        ``start_coords<p>.y`` and ``start_coords<p>.z``.
    unique_items : any
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple of three parallel lists, one entry per (row, position):
        list_dicts
            Dicts with the keys ``start_coords.x``, ``start_coords.y``,
            ``start_coords.z`` (read from the position-specific columns) and
            ``row`` (the index label of the originating row).
        input_target_values
            ``[input, target]`` pairs: the target is the element at the
            position, the input is the preceding element or ``'<start>'`` at
            position 0.
        list_already_seen
            The part of the sequence before the position; an empty list at
            position 0.

    Notes
    -----
    Rows are visited through ``df.index`` and read with label-based ``.loc``,
    so the frame does not need a 0..n-1 index. For such a default index the
    result is the same as when iterating ``range(len(df))``.
    NOTE(review): lookups assume unique index labels -- confirm for the data.
    """
    list_dicts = []
    input_target_values = []
    list_already_seen = []

    for row in df.index:
        # Read the sequence once per row instead of once per access.
        sequence = df.loc[row, 'sequence']

        for position, char in enumerate(sequence):
            # Coordinate columns are numbered from 1, positions from 0.
            start_coords_col = 'start_coords' + str(position + 1)
            list_dicts.append({
                'start_coords.x': df.loc[row, start_coords_col + '.x'],
                'start_coords.y': df.loc[row, start_coords_col + '.y'],
                'start_coords.z': df.loc[row, start_coords_col + '.z'],
                'row': row,
            })

            if position == 0:
                # Nothing precedes the first element.
                input_value = '<start>'
                already_seen = []
            else:
                input_value = sequence[position - 1]
                already_seen = sequence[:position]

            input_target_values.append([input_value, char])
            list_already_seen.append(already_seen)

    return list_dicts, input_target_values, list_already_seen

In [5]:
def get_row_info(df):
    """Collect, per row, the attribute columns of the elements in its sequence.

    For every element ``c`` of a row's ``sequence`` the values of the columns
    ``coordinates_<c>.x``, ``coordinates_<c>.y``, ``coordinates_<c>.z``,
    ``<c>.containment``, ``<c>.food_k``, ``<c>.strong_k`` and ``<c>.mid_k``
    are copied from that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``sequence`` column plus the columns listed above for every
        element that occurs in a sequence.

    Returns
    -------
    dict
        Maps the index label of a row to a dict ``{column name: value}``.
        Rows whose sequence is empty have no entry.

    Notes
    -----
    Rows are visited through ``df.index`` and read with label-based ``.loc``,
    so the frame does not need a 0..n-1 index. For such a default index the
    result is the same as when iterating ``range(len(df))``.
    NOTE(review): lookups assume unique index labels -- confirm for the data.
    """
    coordinate_axes = ('x', 'y', 'z')
    # Listed in the order in which the keys are inserted into each row dict.
    attribute_suffixes = ('containment', 'food_k', 'strong_k', 'mid_k')

    dicts_row = {}

    for row in df.index:
        row_dict = {}

        for char in df.loc[row, 'sequence']:
            for axis in coordinate_axes:
                column = 'coordinates_' + char + '.' + axis
                row_dict[column] = df.loc[row, column]

            for suffix in attribute_suffixes:
                column = char + '.' + suffix
                row_dict[column] = df.loc[row, column]

        # Only rows that contributed at least one element are recorded.
        if row_dict:
            dicts_row[row] = row_dict

    return dicts_row

In [6]:
def _notebook_global(name):
    """Return the module-level variable ``name``; raise NameError if it is unset."""
    try:
        return globals()[name]
    except KeyError:
        raise NameError("name '" + name + "' is not defined") from None


def create_singlestep_df(list_dicts, input_target_values, list_already_seen,
                         unique_items=None, dicts_row=None):
    """Assemble the single-step table from the per-position records.

    Parameters
    ----------
    list_dicts : list of dict
        One record per output row; must provide a ``row`` key, which is
        looked up in ``dicts_row``.
    input_target_values : list
        Entry ``i`` supplies ``[input, target]`` for output row ``i``.
    list_already_seen : list
        Entry ``i`` is the container of elements already seen at row ``i``.
    unique_items : iterable of str, optional
        Elements for which an ``<item>.already_seen`` column is written.
        When omitted, the module-level variable ``unique_items`` is used.
    dicts_row : dict, optional
        Maps a ``row`` value to a dict ``{column name: value}`` whose entries
        are copied into every output row carrying that ``row`` value. When
        omitted, the module-level variable ``dicts_row`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``input`` and ``target`` first, then the keys of
        ``list_dicts``, then the ``.already_seen`` flags (1 if the item is
        contained in the row's already-seen container, otherwise 0), then the
        columns taken from ``dicts_row``.

    Raises
    ------
    NameError
        If ``unique_items`` / ``dicts_row`` is needed, was not passed and is
        not defined at module level.
    """
    single_step_df = pd.DataFrame(list_dicts)

    # The placeholder columns are created with object dtype so that the
    # strings written below do not force a dtype change of an integer column
    # (such upcasting on assignment is deprecated since pandas 2.1).
    single_step_df.insert(
        loc=0, column='input',
        value=pd.Series(0, index=single_step_df.index, dtype=object))
    single_step_df.insert(
        loc=1, column='target',
        value=pd.Series(0, index=single_step_df.index, dtype=object))

    for row, pair in enumerate(input_target_values):
        single_step_df.loc[row, 'input'] = pair[0]
        single_step_df.loc[row, 'target'] = pair[1]

    # The globals are only consulted when the corresponding loop has work to
    # do, exactly when the previous implementation would have read them.
    if unique_items is None and len(list_already_seen) > 0:
        unique_items = _notebook_global('unique_items')

    # NOTE(review): cells are still written one at a time with .loc so that
    # the dtype pandas gives the newly created columns stays as before; this
    # is slow for large tables.
    for row, elem in enumerate(list_already_seen):
        for item in unique_items:
            single_step_df.loc[row, item + '.already_seen'] = 1 if item in elem else 0

    if dicts_row is None and len(single_step_df) > 0:
        dicts_row = _notebook_global('dicts_row')

    for row in range(0, len(single_step_df)):
        # Direct lookup instead of comparing against every key of dicts_row.
        values = dicts_row.get(single_step_df.loc[row, 'row'])
        if values is None:
            continue
        for k, v in values.items():
            single_step_df.loc[row, k] = v

    return single_step_df

In [10]:
# Load the transformed task-environment table and collect the distinct
# sequence elements.
df = read_data('data/task_environments_2021-07-15_transformed.csv')
unique_items = get_unique_values(df)

# Per-row attribute lookup and per-position records; both only read df, so
# neither call depends on the other.
dicts_row = get_row_info(df)
list_dicts, input_target_values, list_already_seen = get_sequence_info(df, unique_items)

# create_singlestep_df reads the module-level names unique_items and
# dicts_row, so both have to be defined before this call.
single_step_df = create_singlestep_df(list_dicts, input_target_values, list_already_seen)

  self.obj[key] = infer_fill_value(value)


In [11]:
# Name the export after today's date in YYYY-MM-DD form; strftime already
# returns a string, so the pieces can be joined directly.
date = datetime.today().strftime('%Y-%m-%d')
filename = ''.join(['data/single_step_df_ints_', date, '.csv'])

filename

'data/single_step_df_ints_2022-03-15.csv'

In [12]:
# Write the single-step table to the dated CSV file: header row included,
# index column left out.
single_step_df.to_csv(path_or_buf=filename, header=True, index=False)