In [173]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [174]:
# Load the baseline and survey tables from the data_clean/ directory
# (presumably written by an earlier cleaning notebook -- verify provenance).
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files produced by this project's own pipeline.
baseline_clean = pd.read_pickle('data_clean/baseline_clean.pkl')
survey_clean = pd.read_pickle('data_clean/survey_clean.pkl')

In [175]:
# Left-join the surveys onto the baseline table: every baseline row is kept,
# and trips without a matching survey get missing values in the survey columns.
itit_df = baseline_clean.merge(
    survey_clean,
    how='left',
    on=['trip_id', 'user_id'],
)

In [176]:
def add_missing_surveys(df):
    """Insert a placeholder row for every expected survey day that has no survey.

    For each (trip_id, user_id) pair one survey per day is expected for
    ``travel_duration`` consecutive days starting at ``travel_date``. Every
    expected day on which no row of the pair has a ``finished`` timestamp gets
    a new row with ``finished`` set to that day and the trip's baseline values
    copied over; columns not set here are left missing in those rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``trip_id``, ``user_id``, ``travel_date``,
        ``travel_duration``, a datetime-typed ``finished`` column and every
        column listed in ``baseline_vars`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by trip_id, user_id, finished, holding at most one
        row per (trip_id, user_id, finished) combination. The input frame is
        not modified.
    """
    keys = ['trip_id', 'user_id']
    # Baseline columns that are constant within a trip and copied to new rows.
    baseline_vars = ['baseline_date', 'age', 'gender', 'country_iso2c', 'country_clean', 'continent_clean',
                     'health_chronic', 'latitude', 'longitude', 'smoking_status', 'travel_purpose', 'trip_number']
    survey_list = []

    for (trip_id, user_id), group in df.groupby(keys):
        travel_date = group['travel_date'].iloc[0]
        duration = int(group['travel_duration'].iloc[0])

        # All calendar days on which a survey is expected for this trip.
        all_days = pd.date_range(start=travel_date, periods=duration, freq='D')

        # Calendar days that already have a survey. A set gives O(1) lookups;
        # NaT placeholders (trips without any survey) are ignored.
        existing_days = set(group['finished'].dropna().dt.date)

        missing_days = [day for day in all_days if day.date() not in existing_days]

        if missing_days:
            missing_surveys = pd.DataFrame({
                'trip_id': trip_id,
                'user_id': user_id,
                'travel_date': travel_date,
                'finished': pd.to_datetime(missing_days),
                'travel_duration': duration
            })

            # Copy constant baseline variables onto the generated rows.
            for var in baseline_vars:
                missing_surveys[var] = group[var].iloc[0]

            survey_list.append(missing_surveys)

        # Always keep the rows that were already present.
        survey_list.append(group)

    # pd.concat raises on an empty list; an input without any group simply
    # yields an empty frame with the same columns.
    if not survey_list:
        return df.iloc[0:0].copy()

    df_concat = (
        pd.concat(survey_list)
        .sort_values(by=keys + ['finished'])
        .reset_index(drop=True)
    )

    # Rows coming from the left merge without a survey carry finished = NaT and
    # sort last within their trip. Fill them from the same trip only: a global
    # ffill/bfill would leak the timestamp of a neighbouring trip into a trip
    # that has no generated days (e.g. travel_duration == 0).
    df_concat['finished'] = df_concat.groupby(keys)['finished'].ffill()
    df_concat['finished'] = df_concat.groupby(keys)['finished'].bfill()

    # The filled placeholder now duplicates the trip's last day; drop it.
    df_concat = df_concat.drop_duplicates(subset=keys + ['finished'])

    return df_concat

# Expand every trip with placeholder rows for the expected survey days that
# have no survey.
itit_df = itit_df.pipe(add_missing_surveys)

In [177]:
# Keep only trips for which at least one row has a non-missing `nausea`
# answer, i.e. drop trips whose `nausea` column is missing on every row.
itit_df_1plus_survey = itit_df.groupby('trip_id').filter(
    lambda trip_rows: not trip_rows['nausea'].isna().all()
)


In [178]:
def fill_columns_forward(data, columns, max_repetition=4):
    """Carry earlier answers forward into missing rows, trip by trip.

    For every name in ``columns`` a ``'<name>_filled'`` column is (re)created
    as a copy of the original. Within each ``trip_id``, rows are visited in
    ``finished`` order. A missing value looks at up to ``max_repetition``
    earlier rows of the ORIGINAL column, nearest first: the first non-missing
    value found becomes the candidate, and the scan stops at the first
    non-missing value that differs from it. The candidate is written to the
    filled column unless it is the string ``'none'`` or it occurs
    ``max_repetition`` times in that window.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``trip_id``, ``finished`` and every column in ``columns``.
        The frame is modified in place.
    columns : iterable of str
        Columns to impute.
    max_repetition : int, default 4
        Size of the look-back window and repetition limit.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the ``*_filled`` columns added.
    """
    for column in columns:
        target = f'{column}_filled'
        data[target] = data[column].copy()

        for _, trip_rows in data.groupby('trip_id'):
            # Original (unfilled) answers of this trip in chronological order.
            ordered = trip_rows.sort_values(by='finished')[column]
            labels = ordered.index

            for pos, label in enumerate(labels):
                observed = ordered.at[label]

                if not pd.isna(observed):
                    # Present answers are copied through unchanged.
                    data.loc[label, target] = observed
                    continue

                # Scan the preceding rows, nearest first, limited to the window.
                carried = None
                streak = 0
                window_start = max(pos - max_repetition, 0)
                for earlier in range(pos - 1, window_start - 1, -1):
                    candidate = ordered.at[labels[earlier]]
                    if pd.isna(candidate):
                        continue
                    if carried is None:
                        carried = candidate
                    if candidate == carried:
                        streak += 1
                    else:
                        break

                # When nothing was found `carried` is None, so the cell stays missing.
                if streak < max_repetition and carried != 'none':
                    data.loc[label, target] = carried

    return data

def fill_columns_backward(data, columns, max_repetition=4):
    """Carry later answers backward into missing rows, trip by trip.

    Works on the ``'<name>_filled'`` column of every name in ``columns``,
    creating it as a copy of the original column when it does not exist yet.
    Within each ``trip_id``, rows are visited from the latest ``finished`` to
    the earliest. A missing row is filled with the value of the row visited
    just before it (the next one in time) when that value is not ``'none'``
    and its current run is shorter than ``max_repetition``. A missing row that
    directly follows another missing row in visiting order is never filled,
    so at most one row per gap is filled.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``trip_id``, ``finished`` and every column in ``columns``.
        The frame is modified in place.
    columns : iterable of str
        Columns to impute.
    max_repetition : int, default 4
        Run length at which a value is no longer propagated.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for column in columns:
        target = f'{column}_filled'

        # Start from the original answers when no filled column exists yet.
        if target not in data.columns:
            data[target] = data[column].copy()

        for _, trip_rows in data.groupby('trip_id'):
            # Snapshot of the filled column, latest row first.
            newest_first = trip_rows.sort_values(by='finished', ascending=False)[target]

            anchor = None          # value currently eligible for propagation
            run = 0                # length of the current run of `anchor`
            prev_missing = False   # whether the previously visited row was missing

            for label in newest_first.index:
                value = newest_first.at[label]

                if not pd.isna(value):
                    prev_missing = False
                    if value == anchor:
                        run += 1
                    else:
                        anchor = value
                        run = 1
                    continue

                fillable = (
                    not prev_missing
                    and anchor is not None
                    and anchor != 'none'
                    and run < max_repetition
                )
                if fillable:
                    data.loc[label, target] = anchor
                    run += 1
                else:
                    # Either a second consecutive gap or nothing eligible: start over.
                    anchor = None
                    run = 0
                prev_missing = True

    return data

# Symptom columns to impute. The group comments are for readability only.
# NOTE(review): names must match the columns in the data exactly; 'musle_pain'
# presumably mirrors the spelling used in the survey table -- verify before renaming.
columns_to_fill = [
    # gastrointestinal
    'nausea', 'vomiting', 'stomach_pain', 'diarrhea', 'constipation',
    # respiratory
    'cough', 'sore_throat', 'runny_nose',
    'out_of_breath_resting', 'out_of_breath_running',
    # skin and eyes
    'rash', 'itchy_insect_bite', 'itchy_other', 'sunburn', 'itchy_red_eyes',
    # general
    'fever', 'dizziness', 'ear_ache', 'headache', 'pain_eyes',
    # muscles and joints
    'musle_pain', 'aching_limbs', 'pain_joint',
    'swelling_joint', 'location_swelling',
]

# Forward imputation first, then backward imputation on the resulting
# '<column>_filled' columns.
filled_df = (
    itit_df_1plus_survey
    .pipe(fill_columns_forward, columns_to_fill)
    .pipe(fill_columns_backward, columns_to_fill)
)

In [180]:
# Replace the remaining missing values in the ORIGINAL symptom columns with
# the string 'none'. The '<column>_filled' columns are not touched here.
# NOTE(review): confirm that the original columns (rather than the *_filled
# ones) are the intended target. Re-running the imputation cell after this one
# would find no missing values left in these columns.
filled_df[columns_to_fill] = filled_df[columns_to_fill].fillna('none')