In [479]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [480]:
# Load the two cleaned input tables from the local data_clean/ directory.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source (presumably an upstream cleaning notebook — confirm).
baseline_clean = pd.read_pickle('data_clean/baseline_clean.pkl')
survey_clean = pd.read_pickle('data_clean/survey_clean.pkl')

In [481]:
# Left join on (trip_id, user_id): every baseline row is kept, and baseline rows
# without a matching survey get missing values in the survey columns.
itit_df = pd.merge(baseline_clean, survey_clean, on=['trip_id', 'user_id'], how='left')

In [482]:
# Add missing surveys
def add_missing_surveys(df):
    """Insert a placeholder row for every trip day that has no survey.

    For each (trip_id, user_id) group the expected survey days are the
    ``travel_duration`` consecutive days starting at ``travel_date`` (both read
    from the group's first row). A day counts as covered when any ``finished``
    timestamp of the group falls on that calendar date. Each uncovered day gets
    a new row whose ``finished`` is that day at midnight and whose baseline
    columns are copied from the group's first row; all other columns are left
    missing by the concatenation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``trip_id``, ``user_id``, ``travel_date``,
        ``travel_duration``, a datetime ``finished`` column and the baseline
        columns listed in ``baseline_vars`` below.

    Returns
    -------
    pandas.DataFrame
        Original and placeholder rows, sorted by trip_id, user_id, finished,
        with duplicate (trip_id, user_id, finished) rows removed. An input
        without any group is returned as an unchanged copy.

    Raises
    ------
    ValueError
        If ``travel_duration`` of a group's first row cannot be converted to a
        valid number of periods (e.g. it is missing).
    """
    group_keys = ['trip_id', 'user_id']
    # Constant per-trip attributes copied onto every placeholder row.
    baseline_vars = ['baseline_date', 'age', 'gender', 'country_iso2c', 'country_clean', 'continent_clean',
                     'health_chronic', 'latitude', 'longitude', 'smoking_status', 'travel_purpose', 'trip_number']
    survey_list = []

    for (trip_id, user_id), group in df.groupby(group_keys):
        travel_date = group['travel_date'].iloc[0]
        duration = int(group['travel_duration'].iloc[0])

        # Every calendar day on which a survey is expected for this trip.
        all_days = pd.date_range(start=travel_date, periods=duration, freq='D')
        # Calendar dates that already have a survey; a set gives constant-time
        # membership tests and NaT placeholders are excluded up front.
        existing_days = set(group['finished'].dropna().dt.date)

        missing_days = [day for day in all_days if day.date() not in existing_days]

        if missing_days:
            missing_surveys = pd.DataFrame({
                'trip_id': trip_id,
                'user_id': user_id,
                'travel_date': travel_date,
                'finished': pd.to_datetime(missing_days),
                'travel_duration': duration
            })

            for var in baseline_vars:
                missing_surveys[var] = group[var].iloc[0]

            survey_list.append(missing_surveys)

        # Existing rows are always kept, including rows without a 'finished' value.
        survey_list.append(group)

    # pd.concat raises on an empty list; an input without groups has nothing to add.
    if not survey_list:
        return df.copy()

    df_concat = pd.concat(survey_list).sort_values(by=group_keys + ['finished']).reset_index(drop=True)

    # Rows without a 'finished' value sort last inside their trip. Fill them from
    # the same trip only: a frame-wide ffill/bfill would hand a trip that has no
    # dated row at all (e.g. duration 0 and no survey) a timestamp that belongs
    # to a neighbouring trip.
    df_concat['finished'] = df_concat.groupby(group_keys)['finished'].ffill()
    df_concat['finished'] = df_concat.groupby(group_keys)['finished'].bfill()

    # A filled placeholder now duplicates the last dated row of its trip; keep
    # the first occurrence, which is the dated row.
    df_concat = df_concat.drop_duplicates(subset=group_keys + ['finished'])

    return df_concat

# Rebinds itit_df to the expanded frame. Re-running only this cell feeds the
# already expanded frame back into add_missing_surveys.
itit_df = add_missing_surveys(itit_df)

In [483]:
# Keep only the trips in which at least one row has a non-missing 'nausea' answer
def _has_any_nausea_answer(trip_rows):
    """Return True when the trip has at least one non-missing 'nausea' value."""
    return trip_rows['nausea'].notna().any()


itit_df_1plus_survey = itit_df.groupby('trip_id').filter(_has_any_nausea_answer)


In [484]:
# Forward fill of missing survey answers, limited by how long an answer has already persisted
def fill_columns_forward(data, columns, baselevel, max_repetition=4):
    """Carry the most recent answer forward into missing rows, trip by trip.

    For every name in ``columns`` a ``f'{column}_filled'`` column is created as
    a copy of the original. Within each ``trip_id`` group, ordered by
    ``finished``, a missing cell looks at up to ``max_repetition`` preceding
    rows of the ORIGINAL column (missing rows in that window are skipped). The
    nearest non-missing value is written into the filled column unless it
    equals ``baselevel[column]`` or it already occupies ``max_repetition``
    rows of that window.

    ``data`` is modified in place and also returned.
    """
    for column in columns:
        target = f'{column}_filled'
        data[target] = data[column].copy()

        for _, trip in data.groupby('trip_id'):
            ordered = trip.sort_values(by='finished')
            labels = ordered.index

            for pos, label in enumerate(labels):
                observed = ordered.at[label, column]

                if not pd.isna(observed):
                    # Reported answers are written through unchanged.
                    data.loc[label, target] = observed
                    continue

                carried = None   # nearest non-missing earlier answer
                streak = 0       # rows in the window that repeat that answer
                window = min(max_repetition, pos)
                for step in range(1, window + 1):
                    earlier = ordered.at[labels[pos - step], column]
                    if pd.isna(earlier):
                        continue
                    if carried is None:
                        carried = earlier
                    if earlier == carried:
                        streak += 1
                    else:
                        break

                # With no earlier answer in the window, `carried` is None and the
                # assignment below leaves the cell missing.
                if streak < max_repetition and carried != baselevel[column]:
                    data.loc[label, target] = carried

    return data

def fill_columns_backward(data, columns,baselevel, max_repetition=4):
    """Back-fill the missing row that directly precedes a reported answer, per trip.

    For every name in ``columns`` the function works on ``f'{column}_filled'``
    (created as a copy of ``column`` when it does not exist yet). Each
    ``trip_id`` group is walked from the latest ``finished`` value to the
    earliest. The first missing cell met after a non-missing cell receives that
    cell's value, provided the value differs from ``baselevel[column]`` and the
    running repetition counter is below ``max_repetition``. Every further
    missing cell of the same run is left untouched, because a continued run
    resets the tracked value.

    Values are read from the per-trip snapshot ``tdata`` and written to
    ``data``, so cells filled during the walk are not re-read.

    ``data`` is modified in place and also returned.
    """
    # Loop through each column provided
    for column in columns:
        fill_column = f'{column}_filled'
        
        # Ensure the '<column>_filled' column exists, copying the original data if not
        if fill_column not in data.columns:
            data[fill_column] = data[column].copy()

        # Loop through each trip_id
        for tid, tdata in data.groupby('trip_id'):
            # Sort the grouped data by 'finished' in descending order for backward filling
            tdata = tdata.sort_values(by='finished', ascending=False)

            # Row labels in reverse chronological order
            indices = tdata.index
            last_valid = None   # most recent non-missing value seen in this walk
            count_back = 0      # how many rows in a row carry last_valid (filled rows included)
            continuous_na = False  # Track if we are in a continuous NA segment

            for i in range(len(indices)):
                idx = indices[i]
                current_value = tdata.at[idx, fill_column]

                if pd.isna(current_value):
                    if continuous_na:
                        # Second or later missing row of a run: forget the tracked value,
                        # so nothing beyond the first missing row is filled
                        last_valid = None
                        count_back = 0
                    elif last_valid is not None and last_valid != baselevel[column] and count_back < max_repetition:
                        # First missing row after a non-baseline value: copy that value
                        data.loc[idx, fill_column] = last_valid
                        count_back += 1
                    else:
                        # Reset the tracking variables if conditions to fill are not met
                        last_valid = None
                        count_back = 0
                    continuous_na = True  # Mark that we are in a continuous NA segment
                else:
                    # Reset continuous NA tracking as we hit a non-NA value
                    continuous_na = False
                    if current_value == last_valid:
                        # Same value again: extend the repetition count
                        count_back += 1
                    else:
                        # New value: start counting from this row
                        last_valid = current_value
                        count_back = 1

    return data

# Survey answer columns that receive a '<column>_filled' counterpart
columns_to_fill = [
    'nausea',
    'vomiting',
    'stomach_pain',
    'diarrhea',
    'constipation',
    'cough',
    'sore_throat',
    'runny_nose',
    'out_of_breath_resting',
    'out_of_breath_running',
    'rash',
    'itchy_insect_bite',
    'itchy_other',
    'sunburn',
    'itchy_red_eyes',
    'fever',
    'dizziness',
    'ear_ache',
    'headache',
    'pain_eyes',
    'musle_pain',
    'aching_limbs',
    'pain_joint',
    'swelling_joint',
    'location_swelling',
    'body_other',
    'impact',
    'rating_day',
]

# Answer per column that is never propagated by the fill functions and that is
# used as the default for cells still missing afterwards. Every column defaults
# to 'none'; the three columns below use a different wording.
base_level = dict.fromkeys(columns_to_fill, 'none')
base_level.update({
    'body_other': 'No',
    'impact': 'Did not affect my activities',
    'rating_day': 'It was a great day',
})

# Forward pass first, then the backward pass on the columns it produced.
# Both functions add/modify '<column>_filled' columns on the frame they receive
# and return that same frame, so itit_df_1plus_survey is modified as well.
itit_filled_df = fill_columns_forward(itit_df_1plus_survey, columns_to_fill, baselevel=base_level)
itit_filled_df = fill_columns_backward(itit_filled_df, columns_to_fill, baselevel=base_level)

In [485]:
# '<column>_filled' names, in the same order as columns_to_fill
filled_columns = [f'{name}_filled' for name in columns_to_fill]

# Default answer for each filled column, taken from base_level of the original column
filled_base_level = dict(zip(filled_columns, (base_level[name] for name in columns_to_fill)))

# Cells still missing after both fill passes receive the column's default answer
itit_filled_df[filled_columns] = itit_filled_df[filled_columns].fillna(value=filled_base_level)

In [486]:
def fill_numeric_columns(data, columns, priority='last'):
    """Fill runs of missing values from the neighbouring values of the same trip.

    For every name in ``columns`` a ``f'{column}_filled'`` column is created as
    a copy of the original. Within each ``trip_id`` group, ordered by
    ``finished``, every run of consecutive missing values is filled as follows:

    * run at the start of the trip: the value that follows it;
    * run at the end of the trip: the value that precedes it;
    * run between two values: the first ``(length + 1) // 2`` rows take one
      neighbour and the remaining rows the other. With ``priority='last'`` the
      leading part takes the preceding value, with ``priority='first'`` it
      takes the following value. Any other ``priority`` leaves such runs
      unfilled.

    ``data`` is modified in place and also returned.
    """
    for column in columns:
        target = f'{column}_filled'
        data[target] = data[column].copy()

        for _, trip in data.groupby('trip_id'):
            trip = trip.sort_values(by='finished')
            labels = trip.index.tolist()
            total = len(labels)

            pos = 0
            while pos < total:
                if not pd.isna(trip.at[labels[pos], column]):
                    pos += 1
                    continue

                # Advance to the first row after this run of missing values.
                gap_start = pos
                while pos < total and pd.isna(trip.at[labels[pos], column]):
                    pos += 1
                gap_labels = labels[gap_start:pos]

                # Neighbouring values; None marks a run touching the trip boundary.
                before = trip.at[labels[gap_start - 1], column] if gap_start > 0 else None
                after = trip.at[labels[pos], column] if pos < total else None

                if before is not None and after is not None:
                    if priority == 'last':
                        head, tail = before, after
                    elif priority == 'first':
                        head, tail = after, before
                    else:
                        continue
                    split = (len(gap_labels) + 1) // 2
                    for offset, label in enumerate(gap_labels):
                        data.loc[label, target] = head if offset < split else tail
                elif before is not None:
                    for label in gap_labels:
                        data.loc[label, target] = before
                elif after is not None:
                    for label in gap_labels:
                        data.loc[label, target] = after

    return data
# Columns handed to fill_numeric_columns; each gets a '<column>_filled' counterpart.
# Every name appears exactly once: a repeated entry makes the fill run twice for
# that column and produces a duplicate label in lists derived from this one.
columns_to_fill_numeric = [
    # Trip identifiers and the attributes add_missing_surveys copies per trip
    'trip_id',
    'user_id',
    'travel_date',
    'finished',
    'travel_duration',
    'baseline_date',
    'age',
    'gender',
    'country_iso2c',
    'country_clean',
    'continent_clean',
    'health_chronic',
    'latitude',
    'longitude',
    'smoking_status',
    'travel_purpose',
    'trip_number',
    # Survey position, weather and air-quality columns. 'clouds' must stay the
    # first entry of this part and 'air_quality_main' the last: the no-GPS
    # dataset is built by slicing this list between those two names.
    'clouds',
    'survey_latitude',
    'survey_longitude',
    'dew_point',
    'feels_like',
    'humidity',
    'pressure',
    'rain_1h',
    'snow_1h',
    'sunrise',
    'sunset',
    'temp',
    'uvi',
    'visibility',
    'main_weather',
    'description_weather',
    'air_quality_components_co',
    'air_quality_components_nh_3',
    'air_quality_components_no',
    'air_quality_components_no_2',
    'air_quality_components_o_3',
    'air_quality_components_pm_10',
    'air_quality_components_pm_2_5',
    'air_quality_components_so_2',
    'wind_deg',
    'wind_gust',
    'wind_speed',
    'air_quality_main',
]
# Adds a '<column>_filled' column for every listed column. With priority='last',
# a gap between two values takes the preceding value for its first
# (length + 1) // 2 rows and the following value for the rest.
itit_filled_df = fill_numeric_columns(itit_filled_df, columns_to_fill_numeric, priority='last')

In [487]:
# Drop the unfilled source columns and the '*_any' columns, keeping the '<column>_filled' versions
itit_filled_df = (
    itit_filled_df
    .drop(columns=columns_to_fill)
    .drop(columns=columns_to_fill_numeric)
    .drop(columns=['respi_any','gastro_any','skin_any','body_any','joint_any'])
)

In [488]:
# Create the dataset without GPS position: drop every '<column>_filled' column
# listed from 'clouds' up to and including 'air_quality_main'
first_gps_linked = columns_to_fill_numeric.index('clouds')
last_gps_linked = columns_to_fill_numeric.index('air_quality_main')
filled_columns_numeric = [
    f'{column}_filled'
    for column in columns_to_fill_numeric[first_gps_linked:last_gps_linked + 1]
]
itit_nogps_filled_df = itit_filled_df.copy().drop(columns=filled_columns_numeric)

In [489]:
# Create the dataset with GPS position: keep only the trips (grouped by
# trip_id_filled) that have at least one non-missing air_quality_main_filled value
itit_filled_df=itit_filled_df.groupby('trip_id_filled').filter(lambda x: x['air_quality_main_filled'].notna().any())

In [490]:

# Values still missing in these three columns are replaced by 0
zero_default_columns = ['rain_1h_filled','snow_1h_filled','wind_gust_filled']
itit_filled_df[zero_default_columns] = itit_filled_df[zero_default_columns].fillna(0)

In [491]:
# Persist both variants: the frame without the columns dropped for the no-GPS
# dataset, and the frame restricted to trips with air-quality data.
itit_nogps_filled_df.to_pickle('data_clean/itit_nogps_filled_df.pkl')
itit_filled_df.to_pickle('data_clean/itit_filled_df.pkl')

In [478]:
# Column overview of the final frame.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, so the output shown below may come from an earlier run — re-run
# after Restart & Run All.
itit_filled_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 22149 entries, 11 to 39345
Data columns (total 73 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   nausea_filled                         22149 non-null  object        
 1   vomiting_filled                       22149 non-null  object        
 2   stomach_pain_filled                   22149 non-null  object        
 3   diarrhea_filled                       22149 non-null  object        
 4   constipation_filled                   22149 non-null  object        
 5   cough_filled                          22149 non-null  object        
 6   sore_throat_filled                    22149 non-null  object        
 7   runny_nose_filled                     22149 non-null  object        
 8   out_of_breath_resting_filled          22149 non-null  object        
 9   out_of_breath_running_filled          22149 non-null  object        
 10  ra