In [1]:
import sys
import os
import importlib
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
import pandas as pd
import numpy as np
from pydantic import ValidationError
import data_model
import enums as e
from utils import extract_base_type, add_enum_label_columns, add_list_objects  

In [3]:
importlib.reload(data_model)
importlib.reload(e)
from data_model import Respondent, Employee, AirPassenger, Trip

In [4]:
external_dir = "../data/external"
interim_dir = "../data/interim"
processed_dir = "../data/processed"

input_file = os.path.join(external_dir, "etc/od_20241015_sandag_airport_pilot_4.xlsx") #pilot survey 3, latest
variable_map_file = os.path.join(processed_dir, "revised_names.csv")
clean_survey_file = os.path.join(interim_dir, "survey_data_clean.csv")
output_csv_filename = os.path.join(processed_dir, "data_model_output.csv")
#summary_csv_filename = os.path.join(processed_dir, "data_model_output_summary.csv")

### Clean Data , Rename fields

In [5]:
in_df = pd.read_excel(input_file)
header_df = pd.read_csv(variable_map_file)[['ETC_name','WSP_name']]
header_dict = pd.Series(header_df.WSP_name.values,index=header_df.ETC_name).to_dict()
clean_df = in_df.rename(columns=header_dict).copy().drop(columns=["delete"])

In [6]:
clean_df.shape

(3597, 302)

In [7]:
clean_df.head()

Unnamed: 0,respondentid,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,inbound_or_outbound_label,marketsegment,marketsegment_label,is_qualified_age,...,household_income_label,is_income_below_poverty,number_workers,number_workers_label,sp_invitation,sp_invitation_label,stay_informed,survey_language,survey_language_label,survey_language_other
0,4273,9/30/2024,Term1,Terminal 1,,OUT,OUTBOUND,1,Air passenger,YES,...,"$75,000-$99,999",,2,TWO (2),2.0,No,NO,ENGLISH,ENGLISH,
1,4282,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$60,000-$74,999",,6,SIX (6),1.0,Yes,,SPANI,SPANISH,
2,4283,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$60,000-$74,999",,0,NONE (0),2.0,No,NO,ENGLISH,ENGLISH,
3,4286,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$150,000 or more",,2,TWO (2),1.0,Yes,,ENGLISH,ENGLISH,
4,4290,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,Prefer not to say,No,0,NONE (0),2.0,No,NO,ENGLISH,ENGLISH,


### Commonly occuring invalid values

In [8]:
# Get the list of columns that contain '-oth-' as a value
columns_with_oth_value = [col for col in clean_df.columns if clean_df[col].eq('-oth-').any()]

print(columns_with_oth_value)

['interview_location', 'flight_purpose', 'shift_start_airport_building', 'employer', 'occupation', 'origin_activity_type', 'main_mode', 'access_mode', 'parking_location', 'parking_cost_frequency', 'car_available', 'reverse_mode_predicted', 'reverse_commute_mode', 'same_commute_mode', 'gender']


In [9]:
columns_with_dash_value = [col for col in clean_df.columns if clean_df[col].eq('-').any()]

print(columns_with_dash_value)

['flight_number', 'origin_city', 'origin_state', 'origin_zip', 'destination_city', 'destination_zip', 'transit_boarding_stop_name', 'transit_boarding_latitude', 'transit_boarding_longitude', 'transit_alighting_stop_name', 'transit_alighting_latitude', 'transit_alighting_longitude', 'home_location_city', 'home_location_zip']


### Making all modes consistent

In [10]:
clean_df['egress_mode_label'].value_counts()

egress_mode_label
Walk                                 20
Picked up by car by family/friend     5
Drive alone and park                  1
Other shared van (please specify)     1
Name: count, dtype: int64

In [11]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
DROPPED OFF BY CAR BY FRIEND FAMILY               190
UBER LYFT                                          76
DROVE ALONE AND PARKED                             32
DROVE WITH OTHERS AND PARKED                       23
OTHER PUBLIC TRANSIT                               17
RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY     17
TAXI                                               12
WHEELCHAIR OR OTHER MOBILITY DEVICE                11
RENTAL CAR AND PARKED IT                            8
WALK                                                7
CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR            3
HOTEL SHUTTLE VAN                                   3
CHARTERED TOUR BUS                                  1
PERSONAL NON ELECTRIC BICYCLE                       1
OTHER SHARED RIDE VAN SERVICE                       1
RODE WITH OTHER TRAVELER AND PARKED                 1
ELECTRIC BIKESHARE                                  1
EMPLOYEE SHUTTLE                                   

In [12]:
other_airport_accessmode_label_map = {
    'Walk': 'Walk',
    'Wheelchair or other mobility device': 'Wheelchair or other mobility device',
    'ELECTRIC BIKESHARE': 'Bicycle: electric bikeshare',
    'NON ELECTRIC BIKESHARE': 'Bicycle: non-electric bikeshare',
    'E SCOOTER SHARE': 'E-scooter: shared',
    'PERSONAL ELECTRIC BICYCLE': 'Bicycle: personal electric bicycle',
    'PERSONAL NON ELECTRIC BICYCLE': 'Bicycle: personal non-electric bicycle',
    'PERSONAL E SCOOTER': 'E-scooter: personal',
    'Taxi': 'Taxi',
    'UBER LYFT': 'Uber/Lyft',
    'CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR': 'Car service/black car/limo/executive car',
    'DROPPED OFF BY CAR BY FRIEND FAMILY': 'Dropped off by car by family/friend',
    'Drove alone and parked': 'Drove alone and parked',
    'Drove with others and parked': 'Drove with others and parked',
    'RODE WITH OTHER TRAVELER AND PARKED': 'Rode with other traveler(s) and parked',
    'Other public transit': 'Other public transit',
    'Chartered tour bus': 'Chartered tour bus',
    'Employee shuttle': 'Employee shuttle',
    'RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY': 'Rental car: Dropped off at rental agency',
    'RENTAL CAR AND PARKED IT': 'Rental car: parked rental car',
    'Hotel shuttle van': 'Hotel shuttle van',
    'OTHER SHARED RIDE VAN SERVICE': 'Other shared van (please specify)',
    'Other': 'Other',
    'Refused/No Answer': 'Refused/No Answer'
}
clean_df['other_airport_accessmode_label'] = clean_df['other_airport_accessmode_label'].map(other_airport_accessmode_label_map)

In [13]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         190
Uber/Lyft                                    76
Rental car: Dropped off at rental agency     17
Rental car: parked rental car                 8
Car service/black car/limo/executive car      3
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Rode with other traveler(s) and parked        1
Bicycle: electric bikeshare                   1
Name: count, dtype: int64

In [14]:
travel_mode_dict = {
    'Walk': 1,
    'Wheelchair or other mobility device': 2,
    'Bicycle: electric bikeshare': 3,
    'Bicycle: non-electric bikeshare': 4,
    'E-scooter: shared': 5,
    'Bicycle: personal electric bicycle': 6,
    'Bicycle: personal non-electric bicycle': 7,
    'E-scooter: personal': 8,
    'Taxi': 9,
    'Uber/Lyft': 10,
    'Car service/black car/limo/executive car': 11,
    'Dropped off by car by family/friend': 12,
    'Drove alone and parked': 13,
    'Drove with others and parked': 14,
    'Chartered tour bus': 17,
    'Employee shuttle': 18,
    'Rental car: Dropped off at rental agency': 19,
    'Rental car: parked rental car': 20,
    'Hotel shuttle van': 21,
    'Other shared van (please specify)': 22,
    'Picked up by car by family/friend': 23,
    'Get in a parked vehicle and drive alone': 24,
    'Get in a parked vehicle and drive with others': 25,
    'Get in a parked vehicle and ride with other traveler(s)': 26,
    'Rental car: Picked up at rental agency': 27,
    'Rental car: get in a parked rental car': 28,
    'Rode with other traveler(s) and parked': 29,
    'Other public transit': 30,
    'Other': 98,
    'Refused/No Answer': 99
}

### Modes to fix

In [15]:
exclude_substrings = ('general', 'alt', 'split', 'same', 'sdia', '_other')
mode_columns = [col for col in clean_df.columns if 'mode' in col and all(sub not in col for sub in exclude_substrings)]
print(mode_columns)

#Separating codes and labels
mode_code_columns = [col for col in mode_columns if 'label' not in col]
print(mode_code_columns)
mode_label_columns = [col for col in mode_columns if 'label' in col]
print(mode_label_columns)

['main_transit_mode', 'main_transit_mode_label', 'main_mode', 'main_mode_label', 'access_mode', 'access_mode_label', 'egress_mode', 'egress_mode_label', 'reverse_mode', 'reverse_mode_label', 'reverse_mode_predicted', 'reverse_mode_predicted_label', 'other_airport_accessmode', 'other_airport_accessmode_label', 'reverse_commute_mode', 'reverse_commute_mode_label']
['main_transit_mode', 'main_mode', 'access_mode', 'egress_mode', 'reverse_mode', 'reverse_mode_predicted', 'other_airport_accessmode', 'reverse_commute_mode']
['main_transit_mode_label', 'main_mode_label', 'access_mode_label', 'egress_mode_label', 'reverse_mode_label', 'reverse_mode_predicted_label', 'other_airport_accessmode_label', 'reverse_commute_mode_label']


In [16]:
#Remapping codes using label strings
for mode_code_col, mode_label_col in zip(mode_code_columns, mode_label_columns):
    # Apply the mapping for each pair of columns
    clean_df[mode_code_col] = clean_df[mode_label_col].map(travel_mode_dict)

In [17]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         190
Uber/Lyft                                    76
Rental car: Dropped off at rental agency     17
Rental car: parked rental car                 8
Car service/black car/limo/executive car      3
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Rode with other traveler(s) and parked        1
Bicycle: electric bikeshare                   1
Name: count, dtype: int64

In [18]:
clean_df['other_airport_accessmode'].value_counts()

other_airport_accessmode
12.0    190
10.0     76
19.0     17
20.0      8
11.0      3
7.0       1
22.0      1
29.0      1
3.0       1
Name: count, dtype: int64

### Pre-processing of some fields

In [19]:
clean_df['date_completed'] = pd.to_datetime(clean_df['date_completed'])
clean_df.replace('-oth-', 98, inplace=True)
clean_df.replace('-', None, inplace = True )

#Maps
interview_location_map = {'Term1' : 1, 'Term2': 2, 'MTS_1_992': 3, 'SDA_1_Flyer': 4, 'ConracShuttle': 5, 'ParkingShuttle': 6, 'EmplParking': 7, '-oth-':98} 
inbound_outbound_map = {'IN':1, 'OUT':2}
main_transit_mode_map = {'SDA_1_FLYER': 16, 'MTS_1_992': 15, 3: None}

all_modes_map = {}
#route_fields:
route_fields = ['to_airport_transit_route_1', 'to_airport_transit_route_2', 'to_airport_transit_route_3', 'to_airport_transit_route_4',
                'from_airport_transit_route_1', 'from_airport_transit_route_2', 'from_airport_transit_route_3', 'from_airport_transit_route_4']

#Replacement
clean_df['interview_location'] = clean_df['interview_location'].map(interview_location_map)
clean_df['inbound_or_outbound'] = clean_df['inbound_or_outbound'].map(inbound_outbound_map)
clean_df['main_transit_mode'] = clean_df['main_transit_mode'].map(main_transit_mode_map)
clean_df[route_fields] = clean_df[route_fields].replace(98, 'OTHER')
clean_df['nights_visited'] = clean_df['nights_visited'] - 1

clean_df['household_income'] = np.where(clean_df['household_income'] >= 15, 13, clean_df['household_income'])
clean_df['same_commute_mode'] = np.where(clean_df['same_commute_mode'] == 0, 2, clean_df['same_commute_mode'])
clean_df['resident_visitor_followup'] = np.where(clean_df['resident_visitor_followup'] == 0, 2, clean_df['resident_visitor_followup'])

#activity_type
clean_df['origin_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.OUTBOUND_FROM_AIRPORT, None, clean_df['origin_activity_type'])
clean_df['destination_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.INBOUND_TO_AIRPORT, None, clean_df['destination_activity_type'])


  clean_df.replace('-oth-', 98, inplace=True)


In [20]:
clean_df['interview_location']

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
3592    2.0
3593    1.0
3594    1.0
3595    2.0
3596    1.0
Name: interview_location, Length: 3597, dtype: float64

In [21]:
clean_df.to_csv(clean_survey_file, index = False)

### Select Variables to verify for the survey

In [22]:
respondent_variables = [field_name for field_name, field_info in Respondent.__fields__.items()]
respondent_variables.remove('trip')

trip_variables = [field_name for field_name, field_info in Trip.__fields__.items()]
trip_variables.append('respondentid')

employee_variables = [field_name for field_name, field_info in Employee.__fields__.items()]
employee_variables.remove('trip')

air_passenger_variables = [field_name for field_name, field_info in AirPassenger.__fields__.items()]
air_passenger_variables.remove('trip')

variables_to_verify = list(set(air_passenger_variables + respondent_variables + trip_variables + employee_variables))

working_df = clean_df.copy()
working_df = working_df[variables_to_verify].copy()
working_df = working_df.loc[working_df['marketsegment'].notna()].copy()
working_df.head()

Unnamed: 0,reasons_no_transit_too_complicated,number_vehicles,alt_commute_mode_other_public_transit,reasons_no_transit_not_convenient,alt_commute_mode_other_shared_van,transit_alighting_longitude,general_modes_used_visitor_e_scooter_personal,other_airport_accessmode,destination_state,date_completed,...,sdia_accessmode_split_flyer_shuttle,sdia_accessmode_split_rental_car_dropped_off,state_of_residence,shift_start_airport_building_other,general_modes_used_visitor_employee_shuttle,reasons_no_transit_ride_too_long,alt_commute_mode_mts_route_992,origin_activity_type,party_size_flight,sdia_accessmode_split_other_public_transit
0,,3,,No,,,,,CA,2024-09-30,...,,,,,,,,,0.0,
1,,1,,,,-117.198122,No,,,2024-09-30,...,No,No,,,No,,,2.0,0.0,No
2,,2,,No,,,No,,,2024-09-30,...,,,,,No,,,2.0,0.0,
3,,2,,Yes,,,No,,,2024-09-30,...,,,,,No,,,98.0,4.0,
4,,2,,Yes,,,,,,2024-09-30,...,,,,,,,,3.0,0.0,


In [23]:
working_df.shape

(3597, 237)

### Serialize the data

In [24]:
trips_df = working_df[trip_variables].copy()
persons_df = working_df[list[set(employee_variables + respondent_variables + air_passenger_variables)]].copy()

In [25]:
# combined
respondent_list = add_list_objects(
        trips_df.to_dict(orient="records"),  #child list
        "respondentid", # child key
        persons_df.to_dict(orient="records"), # parent list
        "respondentid", # parent key
        "trip", # parent var
    )

In [26]:
len(respondent_list)

3597

In [27]:
employee_list = []
air_passenger_list = []
other_list = []
failed_records = []

for respondent in respondent_list:
    market_segment = respondent["marketsegment"]
    try:
        if market_segment == e.Type.EMPLOYEE:
            ev = Employee(** respondent)
            employee_list.append(ev)
        elif market_segment == e.Type.PASSENGER:
             av = AirPassenger(** respondent)
             air_passenger_list.append(av)
        else:
            rv = Respondent(** respondent)
            other_list.append(rv)
    except ValidationError as err:
            respondent['error_flag'] = 'failed'
            respondent['error_message'] = str(err)
            failed_records.append(respondent) 

failed_df = pd.DataFrame(failed_records)
failed_df.head()

Unnamed: 0,reasons_no_transit_too_complicated,number_vehicles,alt_commute_mode_other_public_transit,reasons_no_transit_not_convenient,alt_commute_mode_other_shared_van,transit_alighting_longitude,general_modes_used_visitor_e_scooter_personal,other_airport_accessmode,destination_state,date_completed,...,shift_start_airport_building_other,general_modes_used_visitor_employee_shuttle,reasons_no_transit_ride_too_long,alt_commute_mode_mts_route_992,origin_activity_type,party_size_flight,sdia_accessmode_split_other_public_transit,trip,error_flag,error_message
0,,0,,No,,,No,,,2024-10-01,...,,No,,,6,5.0,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,2 validation errors for AirPassenger\ntrip.tax...
1,,3,,No,,,No,,,2024-10-02,...,,No,,,3,0.0,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,2 validation errors for AirPassenger\ntrip.tax...
2,,3,,No,,,No,,,2024-10-02,...,,No,,,3,1.0,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,2 validation errors for AirPassenger\ntrip.tax...
3,,3,,,,-117.197756,,,,2024-10-04,...,,,,,2,,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,1 validation error for Employee\n Value error...
4,,1,,No,,,No,,,2024-10-04,...,,No,,,1,0.0,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,1 validation error for AirPassenger\n Value e...


In [28]:
failed_df.shape

(10, 240)

In [29]:
failed_df['error_message'].head()

0    2 validation errors for AirPassenger\ntrip.tax...
1    2 validation errors for AirPassenger\ntrip.tax...
2    2 validation errors for AirPassenger\ntrip.tax...
3    1 validation error for Employee\n  Value error...
4    1 validation error for AirPassenger\n  Value e...
Name: error_message, dtype: object

In [30]:
failed_df.to_csv('../data/processed/failed_records.csv', index = False)

In [31]:
len(failed_df)

10

### Make Data

In [32]:
employee_df = pd.DataFrame([Employee.model_dump() for Employee in employee_list])       
employee_df = add_enum_label_columns(employee_df,Employee)
employee_df = add_enum_label_columns(employee_df,Respondent)

In [33]:
passenger_df = pd.DataFrame([AirPassenger.model_dump() for AirPassenger in air_passenger_list])
passenger_df = add_enum_label_columns(passenger_df,AirPassenger)
passenger_df = add_enum_label_columns(passenger_df,Respondent)

In [34]:
# other_df = pd.DataFrame([Respondent.model_dump() for Respondent in other_list])
# other_df = add_enum_label_columns(other_df, Respondent)

KeyError: 'interview_location'

In [35]:
other_list

[]

In [36]:
trip_list = []
id_list = []
for record in employee_list + air_passenger_list + other_list:
    trip_list.append(record.trip)
    id_list.append(record.respondentid)

trip_df = pd.DataFrame([Trip.model_dump() for Trip in trip_list])
id_df = pd.DataFrame(id_list, columns=["respondentid"])

trip_df = pd.concat([id_df, trip_df], axis=1)
trip_df = add_enum_label_columns(trip_df,Trip)


In [37]:
output_df = pd.concat([employee_df, passenger_df, other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])
output_df = pd.merge(output_df, trip_df, on="respondentid", how="left")

  output_df = pd.concat([employee_df, passenger_df, other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])


In [38]:
output_df.shape

(3587, 298)

In [39]:
clean_df.shape

(3597, 302)

In [40]:
output_df.to_csv(output_csv_filename, index = False)