In [192]:
import pandas as pd
import numpy as np

In [193]:
# Read the two pickled tables; the left-hand names match the order of the paths.
baseline_clean, survey_clean = map(
    pd.read_pickle,
    ('data_clean/baseline_clean.pkl', 'data_clean/survey_clean.pkl'),
)

In [195]:
# Left join: every row of baseline_clean is kept; rows without a matching
# (trip_id, user_id) in survey_clean get missing values in the survey columns.
itit_df = baseline_clean.merge(
    survey_clean,
    how='left',
    on=['trip_id', 'user_id'],
)

In [197]:
def add_missing_surveys(df):
    """Add a placeholder row for every expected survey day that has no survey.

    Rows are grouped by ``(trip_id, user_id)``. For each group the expected
    survey days are ``travel_duration`` consecutive calendar days starting at
    ``travel_date`` (both taken from the group's first row). Every expected
    day that does not appear among the calendar dates of ``finished`` gets a
    synthetic row whose ``finished`` is that day at midnight and whose
    baseline variables are copied from the group's first row; all other
    columns are left missing on synthetic rows.

    Rows with a missing ``finished`` are folded into the last dated row of
    the *same* trip and then removed as duplicates. A trip that ends up with
    no dated row at all (for example ``travel_duration == 0`` and no survey)
    keeps a single row with ``finished`` = NaT instead of inheriting a date
    from a neighbouring trip.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``trip_id``, ``user_id``, ``travel_date``,
        ``travel_duration``, a datetime ``finished`` column and the baseline
        columns listed in ``baseline_vars`` below.

    Returns
    -------
    pandas.DataFrame
        Original and synthetic rows sorted by ``trip_id``, ``user_id``,
        ``finished``, de-duplicated on those three columns. An input with no
        groups yields an empty frame with the input's columns.

    Raises
    ------
    ValueError
        If a group's ``travel_duration`` is NaN (``int`` cannot convert it)
        or negative (``pd.date_range`` rejects it).

    Notes
    -----
    ``groupby`` skips rows whose ``trip_id`` or ``user_id`` is null by
    default, so such rows do not appear in the result.
    """
    keys = ['trip_id', 'user_id']
    # Columns copied onto synthetic rows from the first row of each trip.
    baseline_vars = ['baseline_date', 'age', 'gender', 'country_iso2c', 'country_clean', 'continent_clean',
                     'health_chronic', 'latitude', 'longitude', 'smoking_status', 'travel_purpose', 'trip_number']
    survey_list = []

    for (trip_id, user_id), group in df.groupby(keys):
        travel_date = group['travel_date'].iloc[0]
        duration = int(group['travel_duration'].iloc[0])

        # All calendar days on which a survey is expected for this trip
        all_days = pd.date_range(start=travel_date, periods=duration, freq='D')
        # Calendar dates that already have a survey (NaT carries no date)
        existing_days = set(group['finished'].dropna().dt.date)

        missing_days = [day for day in all_days if day.date() not in existing_days]

        if missing_days:
            # Scalars broadcast to the length of the 'finished' column
            missing_surveys = pd.DataFrame({
                'trip_id': trip_id,
                'user_id': user_id,
                'travel_date': travel_date,
                'finished': pd.to_datetime(missing_days),
                'travel_duration': duration
            })

            for var in baseline_vars:
                missing_surveys[var] = group[var].iloc[0]

            # Synthetic rows go first so their column order leads the concat
            survey_list.append(missing_surveys)

        # Existing rows are always kept
        survey_list.append(group)

    # pd.concat raises on an empty list; return an empty frame instead
    if not survey_list:
        return df.iloc[0:0].copy()

    df_concat = (
        pd.concat(survey_list)
        .sort_values(by=keys + ['finished'])
        .reset_index(drop=True)
    )

    # Fill undated rows from dated rows of the SAME trip only. A frame-wide
    # ffill/bfill would leak a survey date across trip boundaries whenever a
    # trip has no dated row of its own.
    df_concat['finished'] = df_concat.groupby(keys)['finished'].ffill()
    df_concat['finished'] = df_concat.groupby(keys)['finished'].bfill()

    # Filled rows now duplicate a dated row of their trip; keep the first one
    df_concat = df_concat.drop_duplicates(subset=keys + ['finished'])

    return df_concat

# Insert placeholder rows for the survey days each trip is missing
itit_df = itit_df.pipe(add_missing_surveys)

In [94]:
# Print the frame summary: index range, column dtypes and non-null counts.
itit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38962 entries, 0 to 38961
Data columns (total 81 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   trip_id                        38962 non-null  object        
 1   user_id                        38962 non-null  object        
 2   travel_date                    38962 non-null  datetime64[ns]
 3   finished                       38455 non-null  datetime64[ns]
 4   travel_duration                38962 non-null  float64       
 5   baseline_date                  38962 non-null  datetime64[ns]
 6   age                            38962 non-null  float64       
 7   gender                         38886 non-null  object        
 8   country_iso2c                  38962 non-null  object        
 9   country_clean                  26770 non-null  object        
 10  continent_clean                38915 non-null  object        
 11  health_chronic 