In [1]:
import sys
import os
import importlib
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
import pandas as pd
import numpy as np
from pydantic import ValidationError
import data_model_v2
import enums as e
from utils import extract_base_type, add_enum_label_columns, add_list_objects, add_synthetic_records, map_zones
import datetime

In [3]:
importlib.reload(data_model_v2)
importlib.reload(e)
from data_model_v2 import Respondent, Employee, AirPassenger, Trip, DepartingPassengerResident, DepartingPassengerVisitor, ArrivingPassengerResident, ArrivingPassengerVisitor, DepartingAirPassenger, ArrivingAirPassenger, Resident, Visitor

In [4]:
external_dir = "../data/external"
interim_dir = "../data/interim"
processed_dir = "../data/processed"

input_file1 = os.path.join(external_dir, "etc/od_20241121_sandag_airport_draftfinal.xlsx") # latest
input_file2 = os.path.join(external_dir, "etc/od_20241015_sandag_airport_pilot_4.xlsx") #older version but records needed
variable_map_file = os.path.join(processed_dir, "revised_names.csv")
clean_survey_file = os.path.join(interim_dir, "survey_data_clean.csv")
output_csv_filename = os.path.join(processed_dir, "data_model_output.csv")
#summary_csv_filename = os.path.join(processed_dir, "data_model_output_summary.csv")

### Clean Data , Rename fields

In [5]:
in_df_complete1 = pd.read_excel(input_file1, sheet_name = 0)
in_df_incomplete1 = pd.read_excel(input_file1, sheet_name = 1)

in_df_complete2 = pd.read_excel(input_file2, sheet_name = 0)
in_df_incomplete2 = pd.read_excel(input_file2, sheet_name = 1)

in_df_complete = pd.concat([in_df_complete1, in_df_complete2], ignore_index = True)
in_df_incomplete = pd.concat([in_df_incomplete1, in_df_incomplete2], ignore_index = True)

in_df_complete['is_completed'] = 1
in_df_incomplete['is_completed'] = 0

in_df_complete['weight'] = 1
in_df_incomplete['weight'] = 0

in_df = pd.concat([in_df_complete, in_df_incomplete], ignore_index = True)
header_df = pd.read_csv(variable_map_file)[['ETC_name','WSP_name']]
header_dict = pd.Series(header_df.WSP_name.values,index=header_df.ETC_name).to_dict()
clean_df = in_df.rename(columns=header_dict).copy().drop(columns=["delete"])

  in_df_complete['is_completed'] = 1
  in_df_complete['weight'] = 1


In [6]:
print("Complete Records: ", in_df_complete.shape)
print("Incomplete Records: ", in_df_incomplete.shape)

Complete Records:  (7532, 372)
Incomplete Records:  (9069, 29)


In [7]:
clean_df.shape

(16601, 315)

In [8]:
len(clean_df['respondentid'].unique())

5104

In [9]:
#Remove the duplicate respondentids
clean_df.drop_duplicates('respondentid', keep = 'first', inplace = True)
clean_df.shape

(5104, 315)

In [10]:
clean_df[clean_df['is_completed']==1].shape

(4731, 315)

#### Add Zones Mapping

In [11]:
#PMSA
pmsa_zones_shapefile = "../data/external/geometry/pmsa_geoms/pmsa_geoms.shp"
clean_df['origin_pmsa'] = map_zones(clean_df, 'origin_latitude', 'origin_longitude', pmsa_zones_shapefile, 'pseudomsa', 99)
clean_df['destination_pmsa'] = map_zones(clean_df, 'destination_latitude', 'destination_longitude', pmsa_zones_shapefile, 'pseudomsa', 99)
clean_df['origin_pmsa'].value_counts(), clean_df['destination_pmsa'].value_counts()

(origin_pmsa
 2     1351
 3     1154
 1      668
 99     433
 6      426
 4      263
 5      249
 7      180
 8        7
 Name: count, dtype: int64,
 destination_pmsa
 2     4595
 99      38
 3       34
 1       34
 6       15
 5       12
 4        3
 Name: count, dtype: int64)

In [12]:
#Municipal Zones
municipal_zones_shapefile = "../data/external/geometry/Municipal_Boundaries/Municipal_Boundaries.shp"
clean_df['origin_municipal_zone'] = map_zones(clean_df, 'origin_latitude', 'origin_longitude', municipal_zones_shapefile, 'name', 'EXTERNAL')
clean_df['destination_municipal_zone'] = map_zones(clean_df, 'destination_latitude', 'destination_longitude', municipal_zones_shapefile, 'name', 'EXTERNAL')
clean_df['origin_municipal_zone'].value_counts(), clean_df['destination_municipal_zone'].value_counts()

(origin_municipal_zone
 SAN DIEGO         2982
 EXTERNAL           433
 S.D. COUNTY        217
 CHULA VISTA        184
 CARLSBAD           152
 OCEANSIDE          135
 CORONADO           109
 ESCONDIDO           71
 ENCINITAS           67
 LA MESA             62
 EL CAJON            55
 NATIONAL CITY       53
 POWAY               42
 SAN MARCOS          39
 VISTA               36
 DEL MAR             29
 IMPERIAL BEACH      23
 LEMON GROVE         18
 SOLANA BEACH        14
 SANTEE              10
 Name: count, dtype: int64,
 destination_municipal_zone
 SAN DIEGO        4655
 EXTERNAL           38
 EL CAJON            7
 OCEANSIDE           5
 CARLSBAD            5
 CORONADO            4
 S.D. COUNTY         4
 NATIONAL CITY       3
 CHULA VISTA         3
 LA MESA             2
 ENCINITAS           2
 POWAY               2
 SOLANA BEACH        1
 Name: count, dtype: int64)

In [13]:
clean_df['passenger_type'].value_counts()

passenger_type
2.0    4226
1.0     187
3.0      93
Name: count, dtype: int64

In [14]:
4226+187

4413

### Add passenger Segment

In [15]:
# Add the `passenger_segment` column based on the updated logic
clean_df["passenger_segment"] = np.where(
    # Resident Arriving
    (clean_df["passenger_type"] == e.PassengerType.ARRIVING) & 
    ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.COMING_HOME) |
     (clean_df["resident_visitor_followup"] == e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT)),
    e.PassengerSegment.RESIDENT_ARRIVING,  # Resident Arriving
    np.where(
        # Visitor Arriving
        (clean_df["passenger_type"] == e.PassengerType.ARRIVING),
        e.PassengerSegment.VISITOR_ARRIVING,  # Visitor Arriving
        np.where(
            # Resident Departing
            (clean_df["passenger_type"] == e.PassengerType.DEPARTING) & 
            ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.LEAVING_HOME) |
             (clean_df["resident_visitor_followup"] == e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT)),
            e.PassengerSegment.RESIDENT_DEPARTING,  # Resident Departing
            np.where(
                # Visitor Departing
                (clean_df["passenger_type"] == e.PassengerType.DEPARTING),
                e.PassengerSegment.VISITOR_DEPARTING,  # Visitor Departing
                None  # Default case (if no conditions match)
            )
        )
    )
)

In [16]:
clean_df.head()

Unnamed: 0,respondentid,submit,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,inbound_or_outbound_label,marketsegment,marketsegment_label,...,survey_language,survey_language_label,survey_language_other,is_completed,weight,origin_pmsa,destination_pmsa,origin_municipal_zone,destination_municipal_zone,passenger_segment
0,5385,YES,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,ENGLISH,ENGLISH,,1,1,6,2,ENCINITAS,SAN DIEGO,4
1,5386,YES,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,ENGLISH,ENGLISH,,1,1,2,2,SAN DIEGO,SAN DIEGO,2
2,5387,NO,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,ENGLISH,ENGLISH,,1,1,2,2,SAN DIEGO,SAN DIEGO,4
3,5388,YES,2024-10-04 00:00:00,SDA_1_FLYER,San Diego Flyer/Old Town Shuttle,,IN,INBOUND,1.0,Air passenger,...,ENGLISH,ENGLISH,,1,1,6,2,CARLSBAD,SAN DIEGO,2
4,5389,YES,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,ENGLISH,ENGLISH,,1,1,2,2,SAN DIEGO,SAN DIEGO,2


In [None]:
## Explicit Visitor Check
clean_df["qualified_visitor"] = np.where(
    # Arriving and visiting or neither, and does not live in the same region traveled
    (clean_df["passenger_type"] == e.PassengerType.ARRIVING) & 
    ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.VISITING) | 
     (clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.NEITHER)) &
    (clean_df["resident_visitor_followup"] != e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT),
    1,  # Qualified visitor
    np.where(
        # Departing and going home or neither, and does not live in the same region traveled
        (clean_df["passenger_type"] == e.PassengerType.DEPARTING) &
        ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.GOING_HOME) | 
         (clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.NEITHER)) &
        (clean_df["resident_visitor_followup"] != e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT),
        1,  # Qualified visitor
        0  # Not a qualified visitor
    )
)

### Commonly occuring invalid values

In [17]:
# Get the list of columns that contain '-oth-' as a value
columns_with_oth_value = [col for col in clean_df.columns if clean_df[col].eq('-oth-').any()]

print(columns_with_oth_value)

['interview_location']


In [18]:
columns_with_dash_value = [col for col in clean_df.columns if clean_df[col].eq('-').any()]

print(columns_with_dash_value)

['flight_number', 'origin_city', 'origin_state', 'origin_zip', 'destination_city', 'destination_state', 'destination_zip', 'transit_boarding_stop_name', 'transit_boarding_latitude', 'transit_boarding_longitude', 'transit_alighting_stop_name', 'transit_alighting_latitude', 'transit_alighting_longitude', 'home_location_city', 'home_location_state', 'home_location_zip', 'home_location_latitude', 'home_location_longitude']


### Making all modes consistent

In [19]:
clean_df['egress_mode_label'].value_counts()

egress_mode_label
Walk                                 25
Picked up by car by family/friend    12
Drive alone and park                  2
Uber/Lyft                             2
Other shared van (please specify)     1
Taxi                                  1
Name: count, dtype: int64

In [20]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
DROPPED OFF BY CAR BY FRIEND FAMILY               242
UBER LYFT                                         107
DROVE ALONE AND PARKED                             42
DROVE WITH OTHERS AND PARKED                       28
OTHER PUBLIC TRANSIT                               24
RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY     22
TAXI                                               14
WHEELCHAIR OR OTHER MOBILITY DEVICE                11
RENTAL CAR AND PARKED IT                           11
WALK                                                9
CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR            4
HOTEL SHUTTLE VAN                                   3
RODE WITH OTHER TRAVELER AND PARKED                 2
NON ELECTRIC BIKESHARE                              2
PERSONAL NON ELECTRIC BICYCLE                       1
OTHER SHARED RIDE VAN SERVICE                       1
ELECTRIC BIKESHARE                                  1
EMPLOYEE SHUTTLE                                   

In [21]:
other_airport_accessmode_label_map = {
    'Walk': 'Walk',
    'Wheelchair or other mobility device': 'Wheelchair or other mobility device',
    'ELECTRIC BIKESHARE': 'Bicycle: electric bikeshare',
    'NON ELECTRIC BIKESHARE': 'Bicycle: non-electric bikeshare',
    'E SCOOTER SHARE': 'E-scooter: shared',
    'PERSONAL ELECTRIC BICYCLE': 'Bicycle: personal electric bicycle',
    'PERSONAL NON ELECTRIC BICYCLE': 'Bicycle: personal non-electric bicycle',
    'PERSONAL E SCOOTER': 'E-scooter: personal',
    'Taxi': 'Taxi',
    'UBER LYFT': 'Uber/Lyft',
    'CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR': 'Car service/black car/limo/executive car',
    'DROPPED OFF BY CAR BY FRIEND FAMILY': 'Dropped off by car by family/friend',
    'Drove alone and parked': 'Drove alone and parked',
    'Drove with others and parked': 'Drove with others and parked',
    'RODE WITH OTHER TRAVELER AND PARKED': 'Rode with other traveler(s) and parked',
    'Other public transit': 'Other public transit',
    'Chartered tour bus': 'Chartered tour bus',
    'Employee shuttle': 'Employee shuttle',
    'RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY': 'Rental car: Dropped off at rental agency',
    'RENTAL CAR AND PARKED IT': 'Rental car: parked rental car',
    'Hotel shuttle van': 'Hotel shuttle van',
    'OTHER SHARED RIDE VAN SERVICE': 'Other shared van (please specify)',
    'Other': 'Other',
    'Refused/No Answer': 'Refused/No Answer'
}
clean_df['other_airport_accessmode_label'] = clean_df['other_airport_accessmode_label'].map(other_airport_accessmode_label_map)

In [22]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         242
Uber/Lyft                                   107
Rental car: Dropped off at rental agency     22
Rental car: parked rental car                11
Car service/black car/limo/executive car      4
Rode with other traveler(s) and parked        2
Bicycle: non-electric bikeshare               2
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Bicycle: electric bikeshare                   1
Name: count, dtype: int64

In [23]:
travel_mode_dict = {
    'Walk': 1,
    'Wheelchair or other mobility device': 2,
    'Bicycle: electric bikeshare': 3,
    'Bicycle: non-electric bikeshare': 4,
    'E-scooter: shared': 5,
    'Bicycle: personal electric bicycle': 6,
    'Bicycle: personal non-electric bicycle': 7,
    'E-scooter: personal': 8,
    'Taxi': 9,
    'Uber/Lyft': 10,
    'Car service/black car/limo/executive car': 11,
    'Dropped off by car by family/friend': 12,
    'Drove alone and parked': 13,
    'Drove with others and parked': 14,
    'MTS Route 992': 15,
    'Airport flyer shuttle': 16,
    'Chartered tour bus': 17,
    'Employee shuttle': 18,
    'Rental car: Dropped off at rental agency': 19,
    'Rental car: parked rental car': 20,
    'Hotel shuttle van': 21,
    'Other shared van (please specify)': 22,
    'Picked up by car by family/friend': 23,
    'Get in a parked vehicle and drive alone': 24,
    'Get in a parked vehicle and drive with others': 25,
    'Get in a parked vehicle and ride with other traveler(s)': 26,
    'Rental car: Picked up at rental agency': 27,
    'Rental car: get in a parked rental car': 28,
    'Rode with other traveler(s) and parked': 29,
    'Other public transit': 30,
    'Other': 98,
    'Refused/No Answer': 99,
    'None of the above': 98
}

### Modes to fix

In [24]:
mode_code_columns = ['main_transit_mode', 'main_mode', 'access_mode', 'egress_mode', 'reverse_mode', 'reverse_mode_predicted', 'other_airport_accessmode', 'reverse_commute_mode']
mode_label_columns = ['main_transit_mode_label', 'main_mode_label', 'access_mode_label', 'egress_mode_label', 'reverse_mode_label', 'reverse_mode_predicted_label', 'other_airport_accessmode_label', 'reverse_commute_mode_label']

In [25]:
#Remapping codes using label strings
for mode_code_col, mode_label_col in zip(mode_code_columns, mode_label_columns):
    # Apply the mapping for each pair of columns
    clean_df[mode_code_col] = clean_df[mode_label_col].map(travel_mode_dict)

In [26]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         242
Uber/Lyft                                   107
Rental car: Dropped off at rental agency     22
Rental car: parked rental car                11
Car service/black car/limo/executive car      4
Rode with other traveler(s) and parked        2
Bicycle: non-electric bikeshare               2
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Bicycle: electric bikeshare                   1
Name: count, dtype: int64

In [27]:
clean_df['other_airport_accessmode'].value_counts()

other_airport_accessmode
12.0    242
10.0    107
19.0     22
20.0     11
11.0      4
29.0      2
4.0       2
7.0       1
22.0      1
3.0       1
Name: count, dtype: int64

In [28]:
clean_df['main_transit_mode'].value_counts()

main_transit_mode
98.0    4361
16.0     214
15.0     156
Name: count, dtype: int64

### Pre-processing of some fields

In [29]:
clean_df['date_completed'] = pd.to_datetime(clean_df['date_completed'])
clean_df['is_pilot'] = np.where(clean_df['date_completed'].dt.date<=datetime.date(2024, 9, 30), 1, 0)
clean_df['record_type_synthetic'] = 0
clean_df.replace('-oth-', 98, inplace=True)
clean_df.replace('-', None, inplace = True )
clean_df['is_income_below_poverty'] = np.where(clean_df['is_income_below_poverty'] == 0, 2, clean_df['is_income_below_poverty'])
clean_df['stay_informed'] = np.where(clean_df['stay_informed'] == 0, 2, clean_df['stay_informed'])
#Maps
interview_location_map = {'Term1' : 1, 'Term2': 2, 'MTS_1_992': 3, 'SDA_1_FLYER': 4, 'ConracShuttle': 5, 'ParkingShuttle': 6, 'EmplParking': 7, '-oth-':98} 
inbound_outbound_map = {'IN':1, 'OUT':2}

#route_fields:
route_fields = ['to_airport_transit_route_1', 'to_airport_transit_route_2', 'to_airport_transit_route_3', 'to_airport_transit_route_4',
                'from_airport_transit_route_1', 'from_airport_transit_route_2', 'from_airport_transit_route_3', 'from_airport_transit_route_4']

#Replacement
clean_df['interview_location'] = clean_df['interview_location'].map(interview_location_map)
clean_df['inbound_or_outbound'] = clean_df['inbound_or_outbound'].map(inbound_outbound_map)
clean_df['main_mode'] = np.where(clean_df['main_transit_mode'].isin([15,16]), clean_df['main_transit_mode'], clean_df['main_mode'])

clean_df[route_fields] = clean_df[route_fields].replace(98, 'OTHER')
clean_df['nights_visited'] = clean_df['nights_visited'] - 1

clean_df['same_commute_mode'] = np.where(clean_df['same_commute_mode'] == 0, 2, clean_df['same_commute_mode'])
clean_df['resident_visitor_followup'] = np.where(clean_df['resident_visitor_followup'] == 0, 2, clean_df['resident_visitor_followup'])

#activity_type
clean_df['origin_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.OUTBOUND_FROM_AIRPORT, e.ActivityType.SAN_DIEGO_AIRPORT, clean_df['origin_activity_type'])
clean_df['destination_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.INBOUND_TO_AIRPORT, e.ActivityType.SAN_DIEGO_AIRPORT, clean_df['destination_activity_type'])

#For incomplete records:
clean_df['marketsegment'] = clean_df['marketsegment'].fillna(99)


  clean_df.replace('-oth-', 98, inplace=True)


In [30]:
clean_df.to_csv(clean_survey_file, index = False)

In [31]:
clean_df.head()

Unnamed: 0,respondentid,submit,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,inbound_or_outbound_label,marketsegment,marketsegment_label,...,survey_language_other,is_completed,weight,origin_pmsa,destination_pmsa,origin_municipal_zone,destination_municipal_zone,passenger_segment,is_pilot,record_type_synthetic
0,5385,YES,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,,1,1,6,2,ENCINITAS,SAN DIEGO,4,0,0
1,5386,YES,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,,1,1,2,2,SAN DIEGO,SAN DIEGO,2,0,0
2,5387,NO,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,,1,1,2,2,SAN DIEGO,SAN DIEGO,4,0,0
3,5388,YES,2024-10-04,4.0,San Diego Flyer/Old Town Shuttle,,1.0,INBOUND,1.0,Air passenger,...,,1,1,6,2,CARLSBAD,SAN DIEGO,2,0,0
4,5389,YES,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,,1,1,2,2,SAN DIEGO,SAN DIEGO,2,0,0


### Select Variables to verify for the survey

In [32]:
respondent_variables = [field_name for field_name, field_info in Respondent.__fields__.items()]


trip_variables = [field_name for field_name, field_info in Trip.__fields__.items()]
trip_variables.append('respondentid')
trip_variables.remove('valid_record')
trip_variables.remove('validation_error')
trip_variables.remove('validation_severity')
trip_variables.remove('validation_num_errors')

employee_variables = [field_name for field_name, field_info in Employee.__fields__.items()]
employee_variables.remove('trip')

#air_passenger_variables = [field_name for field_name, field_info in AirPassenger.__fields__.items()]
#air_passenger_variables.remove('trip')

air_passenger_departing_resident_variables = [field_name for field_name, field_info in DepartingPassengerResident.__fields__.items()] 
air_passenger_departing_visitor_variables = [field_name for field_name, field_info in DepartingPassengerVisitor.__fields__.items()]
air_passenger_arriving_resident_variables = [field_name for field_name, field_info in ArrivingPassengerResident.__fields__.items()]
air_passenger_arriving_visitor_variables = [field_name for field_name, field_info in ArrivingPassengerVisitor.__fields__.items()]
air_passenger_variables = list(set(air_passenger_departing_resident_variables + air_passenger_departing_visitor_variables + air_passenger_arriving_resident_variables +air_passenger_arriving_visitor_variables))

variables_to_verify = list(set(air_passenger_variables + respondent_variables + trip_variables + employee_variables))
variables_to_verify.remove('trip')
variables_to_verify.remove('valid_record')
variables_to_verify.remove('validation_error')
variables_to_verify.remove('validation_severity')
variables_to_verify.remove('validation_num_errors')
variables_to_verify.remove('is_self_administered')

working_df = clean_df.copy()
working_df = working_df[variables_to_verify].copy()
working_df = working_df.loc[working_df['marketsegment'].notna()].copy()
working_df.head()

Unnamed: 0,general_modes_used_visitor_e_scooter_personal,sdia_accessmode_split_other_public_transit,convention_center_activity_other,final_flight_destination,sdia_accessmode_split_other_shared_van,party_includes_coworker,origin_state,race_hp,general_use_transit_resident,household_income,...,reasons_no_transit_dislike_public_transport,origin_activity_type_other,main_mode,from_airport_transit_route_1,is_completed,general_modes_used_visitor_dropped_off_by_family_friend,general_modes_used_visitor_coaster,marketsegment,inbound_or_outbound,access_mode
0,No,,,,,Yes,CA,No,,13.0,...,No,,10.0,,1,No,No,1.0,1.0,
1,,No,,,No,No,CA,No,0.0,15.0,...,No,,10.0,,1,,,1.0,1.0,
2,No,,,,,No,CA,No,,11.0,...,,,16.0,,1,No,No,1.0,1.0,11.0
3,,,,,,,CA,No,0.0,14.0,...,,,16.0,,1,,,1.0,1.0,12.0
4,,,,,,No,CA,No,7.0,12.0,...,No,,10.0,,1,,,1.0,1.0,


In [33]:
working_df.shape

(5104, 252)

### Serialize the data

In [34]:
trips_df = working_df[trip_variables].copy()
persons_df = working_df[list[set(employee_variables + respondent_variables + air_passenger_variables)]].copy()

In [35]:
# combined
respondent_list = add_list_objects(
        trips_df.to_dict(orient="records"),  #child list
        "respondentid", # child key
        persons_df.to_dict(orient="records"), # parent list
        "respondentid", # parent key
        "trip", # parent var
    )

In [36]:
len(respondent_list)

5104

In [37]:
# employee_list = []
# air_passenger_list = []
# other_list = []
# failed_records = []

# for respondent in respondent_list:
#     market_segment = respondent["marketsegment"]
#     try:
#         if market_segment == e.Type.EMPLOYEE:
#             ev = Employee(** respondent)
#             employee_list.append(ev)
#         elif market_segment == e.Type.PASSENGER:
#              av = AirPassenger(** respondent)
#              air_passenger_list.append(av)
#         else:
#             rv = Respondent(** respondent)
#             other_list.append(rv)
#     except ValidationError as err:
#             respondent['error_flag'] = 'failed'
#             respondent['error_message'] = str(err)
#             failed_records.append(respondent) 


# failed_df = pd.DataFrame(failed_records)
# failed_df.head()

In [38]:
employee_list = []
arriving_air_passenger_resident_list = []
arriving_air_passenger_visitor_list = []
departing_air_passenger_resident_list = []
departing_air_passenger_visitor_list = []
other_list = []
failed_records = []

for respondent in respondent_list:
     market_segment = respondent["marketsegment"]
     try:
        if market_segment == e.Type.EMPLOYEE:
            ev = Employee(** respondent)
            employee_list.append(ev)
        elif market_segment == e.Type.PASSENGER:
             passenger_segment= respondent["passenger_segment"]
             if passenger_segment == e.PassengerSegment.RESIDENT_ARRIVING:
                    apr = ArrivingPassengerResident(** respondent)
                    arriving_air_passenger_resident_list.append(apr)
             elif passenger_segment == e.PassengerSegment.VISITOR_ARRIVING:
                    apv = ArrivingPassengerVisitor(** respondent)
                    arriving_air_passenger_visitor_list.append(apv)
             elif passenger_segment == e.PassengerSegment.RESIDENT_DEPARTING:
                    dpr = DepartingPassengerResident(** respondent)
                    departing_air_passenger_resident_list.append(dpr)
             elif passenger_segment == e.PassengerSegment.VISITOR_DEPARTING:
                    dpv = DepartingPassengerVisitor(** respondent)
                    departing_air_passenger_visitor_list.append(dpv)
             else:
                    rv = Respondent(** respondent)
                    other_list.append(rv)

        else:
            rv = Respondent(** respondent)
            other_list.append(rv)
            
     except ValidationError as err:
            respondent['error_flag'] = 'failed'
            respondent['error_message'] = str(err)
            failed_records.append(respondent) 


failed_df = pd.DataFrame(failed_records)
failed_df.head()

  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed for variable: party_size_ground_access
Critical check failed fo

  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [39]:
failed_df.shape

(0, 0)

In [40]:
#failed_df['error_message'].unique()

In [41]:
#failed_df['is_completed'].value_counts()

In [42]:
#failed_df['error_message'][0]

In [43]:
failed_df.to_csv('../data/processed/failed_records.csv', index = False)

In [44]:
len(failed_df)

0

In [45]:
print("Arriving Air Passengers Residents:", len(arriving_air_passenger_resident_list))
print("Arriving Air Passengers Visitors:", len(arriving_air_passenger_visitor_list))
print("Departing Air Passengers Residents:", len(departing_air_passenger_resident_list))
print("Departing Air Passengers Visitors:", len(departing_air_passenger_visitor_list))
print("Total Air Passengers:", len(arriving_air_passenger_resident_list) + len(arriving_air_passenger_visitor_list) + len(departing_air_passenger_resident_list) + len(departing_air_passenger_visitor_list))

Arriving Air Passengers Residents: 37
Arriving Air Passengers Visitors: 150
Departing Air Passengers Residents: 1566
Departing Air Passengers Visitors: 2660
Total Air Passengers: 4413


### Make Data

In [46]:
employee_df = pd.DataFrame([Employee.model_dump() for Employee in employee_list])       

  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [47]:
#passenger_df = pd.DataFrame([AirPassenger.model_dump() for AirPassenger in air_passenger_list])

In [48]:
arriving_passenger_resident_df = pd.DataFrame([ArrivingPassengerResident.model_dump() for ArrivingPassengerResident in arriving_air_passenger_resident_list])
arriving_passenger_visitor_df = pd.DataFrame([ArrivingPassengerVisitor.model_dump() for ArrivingPassengerVisitor in arriving_air_passenger_visitor_list])
departing_passenger_resident_df = pd.DataFrame([DepartingPassengerResident.model_dump() for DepartingPassengerResident in departing_air_passenger_resident_list])
departing_passenger_visitor_df = pd.DataFrame([DepartingPassengerVisitor.model_dump() for DepartingPassengerVisitor in departing_air_passenger_visitor_list])

  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expec

In [49]:
other_df = pd.DataFrame([Respondent.model_dump() for Respondent in other_list])
# other_df = add_enum_label_columns(other_df, Respondent)

  Expected `str` but got `list` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [50]:
len(other_list)

233

In [51]:
trip_list = []
id_list = []
for record in employee_list + arriving_air_passenger_resident_list + arriving_air_passenger_visitor_list + departing_air_passenger_resident_list + departing_air_passenger_visitor_list  + other_list:
    trip_list.append(record.trip)
    id_list.append(record.respondentid)

trip_df = pd.DataFrame([Trip.model_dump() for Trip in trip_list])
id_df = pd.DataFrame(id_list, columns=["respondentid"])

trip_df = pd.concat([id_df, trip_df], axis=1)
trip_df = add_enum_label_columns(trip_df,Trip)


  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [52]:
output_df = pd.concat([employee_df, arriving_passenger_resident_df, arriving_passenger_visitor_df, departing_passenger_resident_df, departing_passenger_visitor_df , other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])
output_df = pd.merge(output_df, trip_df, on="respondentid", how="left", suffixes = ('_person', '_trip'))

  output_df = pd.concat([employee_df, arriving_passenger_resident_df, arriving_passenger_visitor_df, departing_passenger_resident_df, departing_passenger_visitor_df , other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])


In [53]:
output_df.shape

(5104, 284)

In [54]:
output_df = add_synthetic_records(output_df)

In [55]:
output_df.head()

Unnamed: 0,valid_record_person,validation_error_person,validation_severity_person,validation_num_errors_person,is_self_administered,respondentid,is_completed,is_pilot,record_type_synthetic,date_completed,...,main_mode_label,trip_start_time_label,trip_arrival_time_label,number_transit_vehicles_to_airport_label,access_mode_label,parking_location_label,parking_cost_frequency_label,reimbursement_label,number_transit_vehicles_from_airport_label,egress_mode_label
0,True,[],,0,False,5473,True,False,0,2024-10-04,...,DROVE_ALONE_AND_PARKED,FIVE_TO_FIVE_THIRTY,FIVE_TO_FIVE_THIRTY,,,EMPLOYEE_LOT_3665_ADMIRAL_BOLAND_WAY,MONTHLY,NOT_REIMBURSED,,
1,True,[],,0,False,5476,True,False,0,2024-10-04,...,MTS_ROUTE_992,EIGHT_THIRTY_TO_NINE,NINE_THIRTY_TO_TEN,ONE,WALK,,,NOT_REIMBURSED,,
2,True,[],,0,False,5489,True,False,0,2024-10-04,...,DROVE_ALONE_AND_PARKED,SIX_TO_SIX_THIRTY,SIX_TO_SIX_THIRTY,,,EMPLOYEE_LOT_3665_ADMIRAL_BOLAND_WAY,OTHER_SEPCIFY,NOT_REIMBURSED,,
3,True,[],,0,False,5558,True,False,0,2024-10-04,...,MTS_ROUTE_992,ELEVEN_TO_ELEVEN_THIRTY,ELEVEN_THIRTY_TO_NOON,NONE,WALK,,,NOT_REIMBURSED,,
4,True,[],,0,False,5593,True,False,0,2024-10-04,...,MTS_ROUTE_992,NOON_TO_TWELVE_THIRTY,NOON_TO_TWELVE_THIRTY,ONE,WALK,,,REIMBURSED_EMPLOYER_CLIENT,,


In [56]:
output_df =  add_enum_label_columns(output_df, Respondent)

output_df = add_enum_label_columns(output_df, AirPassenger)
output_df = add_enum_label_columns(output_df, DepartingAirPassenger)
output_df =  add_enum_label_columns(output_df, ArrivingAirPassenger)
output_df =  add_enum_label_columns(output_df, Resident)
output_df =  add_enum_label_columns(output_df, Visitor)

output_df = add_enum_label_columns(output_df, DepartingPassengerResident)
output_df =  add_enum_label_columns(output_df, ArrivingPassengerResident)

output_df = add_enum_label_columns(output_df, DepartingPassengerVisitor)
output_df =  add_enum_label_columns(output_df, ArrivingPassengerVisitor)

output_df =  add_enum_label_columns(output_df, Trip)
output_df =  add_enum_label_columns(output_df, Employee)

  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum

In [59]:
# Define the list of important columns
important_columns = ['is_completed', 'is_pilot', 'is_self_administered', 'record_type_synthetic']

# Separate important columns and the rest of the columns
remaining_columns = [col for col in output_df.columns if col not in important_columns]

# Create the new column order
new_column_order = important_columns + sorted(remaining_columns)

# Reorder the DataFrame
output_df = output_df[new_column_order]

# Display the updated DataFrame
output_df.head()

Unnamed: 0,is_completed,is_pilot,is_self_administered,record_type_synthetic,access_mode,access_mode_label,access_mode_other,age,age_label,airline,...,trip_start_time_label,valid_record_person,valid_record_trip,validation_error_person,validation_error_trip,validation_num_errors_person,validation_num_errors_trip,validation_severity_person,validation_severity_trip,weight
0,True,False,False,0,,,,7.0,AGE_45_49,,...,FIVE_TO_FIVE_THIRTY,True,True,[],[],0,0,,,1.0
1,True,False,False,0,1.0,WALK,,3.0,AGE_25_29,,...,EIGHT_THIRTY_TO_NINE,True,True,[],[],0,0,,,1.0
2,True,False,False,0,,,,5.0,AGE_35_39,,...,SIX_TO_SIX_THIRTY,True,True,[],[],0,0,,,1.0
3,True,False,False,0,1.0,WALK,,2.0,AGE_20_24,,...,ELEVEN_TO_ELEVEN_THIRTY,True,True,[],[],0,0,,,1.0
4,True,False,False,0,1.0,WALK,,8.0,AGE_50_54,,...,NOON_TO_TWELVE_THIRTY,True,True,[],[],0,0,,,1.0


In [60]:
output_df.index = output_df.index + 1
output_df.to_csv(output_csv_filename, index_label = 'unique_id')

In [61]:
output_df[(output_df['is_pilot']==0) & (output_df['is_completed']==1) ].to_csv('../data/processed/data_model_non_pilot.csv')