In [1]:
# Import core libraries
import pandas as pd
import numpy as np
import datetime
# Seed NumPy's global random generator so that any random draws are reproducible
SEED = 42
np.random.seed(SEED)

### Helper Functions


In [3]:
#Helper functions
def gen_id(data):
    """Add an 'id' column built as '<Group>_<NR>_<Generation>'.

    The column is written onto the frame that is passed in (no copy is made)
    and that same frame is returned.

    Args:
        data (pd.DataFrame): raw data with 'Group', 'NR' and 'Generation' columns
    Returns:
        pd.DataFrame: the input frame, now carrying the additional 'id' column
    """
    # Cast every key component to text first so they can be concatenated
    group, number, generation = (
        data[column].astype(str) for column in ('Group', 'NR', 'Generation')
    )
    data['id'] = group + '_' + number + '_' + generation
    return data

def make_rat_one_row(data, no_merge_cols=None):
    """Collapse the four trial rows of each rat into a single wide row.

    For every distinct 'id' the trial-1 row is used as the base, and the rows
    of trials 2, 3 and 4 are inner-merged onto it on 'id'. Columns whose names
    clash with the base receive the suffix '_2', '_3' or '_4'. Because the
    merges are inner joins, a rat that lacks any of the four trials produces
    no output row.

    Args:
        data (pd.DataFrame): long-format data with an 'id' and a 'Trials' column
        no_merge_cols (list[str] | None): columns removed from the trial 2-4
            rows before merging. When omitted, the notebook-level
            params['no_merge_cols'] is used.
    Returns:
        pd.DataFrame: data with each rat as a single row; an empty frame when
            `data` contains no ids
    Raises:
        KeyError: if one of `no_merge_cols` is not a column of `data`
    """
    if no_merge_cols is None:
        # Fall back to the notebook configuration (original behaviour)
        no_merge_cols = params['no_merge_cols']
    no_merge_cols = list(no_merge_cols)

    # Collect one merged frame per rat and concatenate once at the end;
    # growing a DataFrame with concat inside the loop is quadratic.
    per_rat_frames = []
    for rat_id in data['id'].unique():
        rat_trials = data[data['id'] == rat_id]
        merged = rat_trials.loc[rat_trials['Trials'] == 1]
        for trial in range(2, 5):
            trial_rows = rat_trials.loc[rat_trials['Trials'] == trial].drop(columns=no_merge_cols)
            merged = pd.merge(merged, trial_rows, how='inner', on="id", suffixes=('', f'_{trial}'))
        per_rat_frames.append(merged)

    if not per_rat_frames:
        # pd.concat raises on an empty list; keep returning an empty frame
        return pd.DataFrame()
    return pd.concat(per_rat_frames, ignore_index=True)
        

### Configuration Parameters
This dictionary defines:
- The path to the raw dataset
- Columns to drop during the cleaning process
- Columns that are removed from the trial 2–4 rows before they are merged onto the trial-1 row (`no_merge_cols`)

These parameters make the cleaning functions reusable and adaptable to other datasets.

In [13]:
params = dict(
    # Path of the original (raw) dataset
    data='raw/ambitus_0_15_log_24_07_2025.parquet',
    # Columns of the original dataset that are not needed for the analysis
    to_drop_cols=[
        'Separation', 'Date_Ambitus', 'GR_Gender', 'G_S', 'Animal',
        'EAT_E_Nr', 'EAT_I_Nr', 'EAT_TOT_Nr',
        'Expl_E_BEF_Nr', 'Expl_I_BEF_Nr', 'Expl_E_I_BEF_Nr',
        'Expl_E_AFT_Nr', 'Expl_I_AFT_Nr', 'Expl_E_I_AFT_Nr',
        'Expl_E_BEF_T', 'Expl_I_BEF_T', 'Expl_E_I_BEF_T',
        'Expl_E_AFT_T', 'Expl_I_AFT_T', 'Expl_E_I_AFT_T',
        'LAT_E', 'LAT_I', 'LAT_E_I',
        'Expl_REP_E_BEF_Nr', 'Expl_REP_I_BEF_Nr', 'Expl_REP_BEF_Nr',
        'Expl_REP_E_AFT_Nr', 'Expl_REP_I_AFT_Nr', 'Expl_REP_AFT_Nr',
        'LOCO_BEF', 'L_C_Tot',
        'Expl_E_AFT_T_Calc', 'Expl_I_AFT_T_Calc', 'Expl_E_I_AFT_T_Calc',
        'Expl_E_BEF_Calc_T', 'Expl_I_BEF_Calc_T', 'Expl_E_I_BEF_Calc_T',
        'Expl_E_AFT_Calc', 'Expl_I_AFT_Calc', 'Expl_E_I_AFT_Calc',
        'Expl_REP_E_AFT_Nr_Calc', 'Expl_REP_I_AFT_Nr_Calc', 'Expl_REP_AFT_Nr_Calc',
    ],
    # Columns removed from the trial 2-4 rows before merging them onto trial 1
    no_merge_cols=['Group', 'NR', 'Generation', 'Trials', 'Paradigm'],
)

### Load and Annotate the Raw Dataset
We load the raw data from the specified parquet file and apply the gen_id() function to add a unique identifier (id) for each animal. This step prepares the data for downstream cleaning and ensures consistent identification of records.

In [16]:
# Read the raw parquet file and attach the per-animal 'id' column in one step
parquet_data = pd.read_parquet(params['data']).pipe(gen_id)
parquet_data

Unnamed: 0,Animal,Generation,Season,Separation,G_S,Paradigm,Date_Ambitus,Year,NR,Group,...,Expl_I_BEF_Loco_ratio,Expl_EI_BEF_Loco_ratio,Expl_E_TOT_Loco_ratio,Expl_I_TOT_Loco_ratio,Expl_E_I_TOT_Loco_ratio,Eff_Expl_E,Eff_Expl_I,Eff_Expl_EI,E_E,id
0,LE2F1_1,0,Autumn,2019-08-26,0_Autumn,1,2019-10-07,0,1,Lisket,...,1.076923,2.923077,1.846154,1.076923,2.923077,1.000,0.75,0.875000,0.8750,Lisket_1_0
1,LE2F2_1,0,Autumn,2019-08-26,0_Autumn,1,2019-10-07,0,1,Lisket,...,1.857143,3.428571,1.500000,2.100000,3.600000,1.000,1.00,1.000000,1.0000,Lisket_1_0
2,LE2F3_1,0,Autumn,2019-08-26,0_Autumn,2,2019-10-07,0,1,Lisket,...,1.800000,2.800000,0.769231,1.000000,1.769231,,1.00,0.533333,1.0000,Lisket_1_0
3,LE2F4_1,0,Autumn,2019-08-26,0_Autumn,2,2019-10-07,0,1,Lisket,...,3.166667,4.833333,1.857143,2.357143,4.214286,,1.00,0.500000,1.0000,Lisket_1_0
4,LE2F1_2,0,Autumn,2019-08-26,0_Autumn,1,2019-10-07,0,2,Lisket,...,0.916667,2.250000,1.333333,0.916667,2.250000,1.000,1.00,1.000000,1.0000,Lisket_2_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5363,Rat28,15,Winter,2024-12-30,15_Winter,2,2025-02-18,6,98,LE,...,1.600000,2.500000,0.968750,1.343750,2.312500,,1.00,0.533333,1.0000,LE_98_15
5364,Rat11,15,Winter,2024-12-30,15_Winter,1,2025-02-18,6,99,LE,...,0.520000,1.120000,0.600000,0.520000,1.120000,0.875,1.00,0.937500,0.9375,LE_99_15
5365,Rat12,15,Winter,2024-12-30,15_Winter,1,2025-02-18,6,99,LE,...,1.200000,2.400000,0.846154,0.807692,1.653846,1.000,1.00,1.000000,1.0000,LE_99_15
5366,Rat33,15,Winter,2024-12-30,15_Winter,2,2025-02-18,6,99,LE,...,1.750000,2.875000,0.971429,0.800000,1.771429,,1.00,0.533333,1.0000,LE_99_15


## Reduce and Restructure Dataset
To prepare the dataset for machine learning:
- We remove unused metadata columns.
- We restructure the data so that each rat is represented by a single row using make_rat_one_row().
- We standardize values (capitalizing the `male`/`female` labels) and fill missing values with `-1`.

This format is suitable for classical machine learning algorithms that expect tabular inputs.

In [None]:
# Discard the columns listed in params['to_drop_cols'].
# NOTE: this rebinds parquet_data, so running the cell a second time raises a
# KeyError because the columns are already gone.
parquet_data = parquet_data.drop(columns=params['to_drop_cols'])

# Flatten to one row per rat, capitalise the gender labels and mark every
# missing value with the sentinel -1
one_rowed_data = (
    make_rat_one_row(parquet_data)
    .replace('male', 'Male')
    .replace('female', 'Female')
    .fillna(-1)
)
one_rowed_data

### Drop Redundant or Non-Feature Columns
We automatically remove columns that are not needed for machine learning, including:
- Redundant group/gender/year fields (duplicated across time)
- Intermediate or repeated trial-level metadata
- Columns with placeholder names like Unnamed:

This keeps only relevant, flat (one-row-per-animal) behavioral features in the dataset.

In [None]:
# Remove the suffixed per-trial copies of metadata columns and any 'Unnamed' index columns
columns_to_drop = one_rowed_data.filter(
    regex='Group_|Gender_|Season_|Trials_|NR_|Year_|Paradigm_|Generation_|Unnamed'
).columns
one_rowed_data = one_rowed_data.drop(columns=columns_to_drop)

# Keep only the columns that hold at least one value other than the -1 sentinel
has_real_value = one_rowed_data.ne(-1).any(axis=0)
one_rowed_data = one_rowed_data.loc[:, has_real_value]
one_rowed_data

### Save the Cleaned Dataset
After removing the final unused column (Trials), we export the cleaned and restructured dataset to the processed/ directory.
The filename includes the current date, allowing for versioning and traceability.

Resulting file example:
 - ambitus_0_15_ml_ready_25-07-2025.csv

In [None]:
# Drop the 'Trials' column: after flattening it only carries the trial-1 marker.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run once the column is already gone.
one_rowed_data = one_rowed_data.drop(columns=['Trials'], errors='ignore')

# Build the date stamp as day-month-year. '%m' is the month; the previous '%M'
# inserted the current minute. Computing it outside the f-string also avoids
# nesting double quotes, which is a SyntaxError before Python 3.12.
export_date = datetime.datetime.now().strftime('%d-%m-%Y')

# Save the cleaned and flattened dataset to CSV, e.g. ambitus_0_15_ml_ready_25-07-2025.csv
one_rowed_data.to_csv(f"processed/ambitus_0_15_ml_ready_{export_date}.csv", index=False)