In [1]:
import sys
import os
import importlib
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
import pandas as pd
import numpy as np
from pydantic import ValidationError
import data_model
import enums as e
from utils import extract_base_type, add_enum_label_columns, add_list_objects  

In [3]:
# Re-execute the local modules so that edits made on disk are picked up without
# restarting the kernel, then re-bind the model classes from the reloaded module.
importlib.reload(data_model)
importlib.reload(e)
from data_model import (
    AirPassenger,
    Employee,
    Respondent,
    Trip,
)

In [4]:
# Data directories, relative to the notebook's working directory.
external_dir, interim_dir, processed_dir = (
    "../data/external",
    "../data/interim",
    "../data/processed",
)

# Inputs.
# NOTE(review): the previous comment on this path read "pilot survey 3, latest"
# while the filename says pilot_4 -- confirm which one is intended.
input_file = os.path.join(
    external_dir, "etc/od_20241015_sandag_airport_pilot_4.xlsx"
)
variable_map_file = os.path.join(processed_dir, "revised_names.csv")

# Outputs.
clean_survey_file = os.path.join(interim_dir, "survey_data_clean.csv")
output_csv_filename = os.path.join(processed_dir, "data_model_output.csv")
#summary_csv_filename = os.path.join(processed_dir, "data_model_output_summary.csv")

### Clean data and rename fields

In [5]:
# Read the raw survey workbook and the column-name crosswalk.
in_df = pd.read_excel(input_file)
header_df = pd.read_csv(variable_map_file)[['ETC_name', 'WSP_name']]

# {ETC_name: WSP_name} lookup used to rename the raw columns.
header_dict = header_df.set_index('ETC_name')['WSP_name'].to_dict()

# Rename, then discard every column whose new name is "delete".
# rename() and drop() each return a new frame, so ``in_df`` is left untouched.
clean_df = in_df.rename(columns=header_dict).drop(columns=["delete"])

In [6]:
# Sanity check: (rows, columns) of the renamed frame.
clean_df.shape

(3597, 302)

In [7]:
# Preview the first rows of the renamed frame.
clean_df.head()

Unnamed: 0,respondentid,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,inbound_or_outbound_label,marketsegment,marketsegment_label,is_qualified_age,...,household_income_label,is_income_below_poverty,number_workers,number_workers_label,sp_invitation,sp_invitation_label,stay_informed,survey_language,survey_language_label,survey_language_other
0,4273,9/30/2024,Term1,Terminal 1,,OUT,OUTBOUND,1,Air passenger,YES,...,"$75,000-$99,999",,2,TWO (2),2.0,No,NO,ENGLISH,ENGLISH,
1,4282,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$60,000-$74,999",,6,SIX (6),1.0,Yes,,SPANI,SPANISH,
2,4283,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$60,000-$74,999",,0,NONE (0),2.0,No,NO,ENGLISH,ENGLISH,
3,4286,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$150,000 or more",,2,TWO (2),1.0,Yes,,ENGLISH,ENGLISH,
4,4290,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,Prefer not to say,No,0,NONE (0),2.0,No,NO,ENGLISH,ENGLISH,


In [8]:
# Collect the names of all columns in which the literal value '-oth-' occurs
# at least once.
columns_with_oth_value = []
for column_name in clean_df.columns:
    if (clean_df[column_name] == '-oth-').any():
        columns_with_oth_value.append(column_name)

print(columns_with_oth_value)

['interview_location', 'flight_purpose', 'shift_start_airport_building', 'employer', 'occupation', 'origin_activity_type', 'main_mode', 'access_mode', 'parking_location', 'parking_cost_frequency', 'car_available', 'reverse_mode_predicted', 'reverse_commute_mode', 'same_commute_mode', 'gender']


In [9]:
# Same scan as for '-oth-', but for cells that hold a bare '-'.
# NOTE: this reuses (and overwrites) the name ``columns_with_oth_value``.
columns_with_oth_value = [
    column_name
    for column_name in clean_df.columns
    if (clean_df[column_name] == '-').any()
]

print(columns_with_oth_value)

['flight_number', 'origin_city', 'origin_state', 'origin_zip', 'destination_city', 'destination_zip', 'transit_boarding_stop_name', 'transit_boarding_latitude', 'transit_boarding_longitude', 'transit_alighting_stop_name', 'transit_alighting_latitude', 'transit_alighting_longitude', 'home_location_city', 'home_location_zip']


### Pre-processing of some fields

In [10]:
# Pre-processing: normalise sentinel values and recode coded fields on ``clean_df``.
# NOTE: this cell edits ``clean_df`` in place and is NOT idempotent (for example,
# ``nights_visited`` is decremented on every run). Re-run the load cell before
# re-running this one.

clean_df['date_completed'] = pd.to_datetime(clean_df['date_completed'])

# Frame-wide sentinel clean-up: '-oth-' becomes the numeric code 98 and a bare
# '-' becomes a missing value.
# NOTE(review): older pandas releases treat an explicit ``value=None`` with a
# scalar ``to_replace`` as "no value given" and pad-fill instead -- confirm the
# behaviour against the installed pandas version.
clean_df.replace('-oth-', 98, inplace=True)
clean_df.replace('-', None, inplace=True)

#Maps
# By the time this map is applied, '-oth-' has already been rewritten to 98 by the
# frame-wide replace above. Series.map() turns any value that is not a key into
# NaN, so 98 must map to itself or every "other" location would be lost.
interview_location_map = {
    'Term1': 1,
    'Term2': 2,
    'MTS_1_992': 3,
    'SDA_1_Flyer': 4,
    'ConracShuttle': 5,
    'ParkingShuttle': 6,
    'EmplParking': 7,
    '-oth-': 98,
    98: 98,
}
inbound_outbound_map = {'IN': 1, 'OUT': 2}
main_transit_mode_map = {'SDA_1_FLYER': 16, 'MTS_1_992': 15, 3: None}
#route_fields:
route_fields = ['to_airport_transit_route_1', 'to_airport_transit_route_2', 'to_airport_transit_route_3', 'to_airport_transit_route_4',
                'from_airport_transit_route_1', 'from_airport_transit_route_2', 'from_airport_transit_route_3', 'from_airport_transit_route_4']

#Replacement
# Values that are not keys of the respective map become NaN.
clean_df['interview_location'] = clean_df['interview_location'].map(interview_location_map)
clean_df['inbound_or_outbound'] = clean_df['inbound_or_outbound'].map(inbound_outbound_map)
clean_df['main_transit_mode'] = clean_df['main_transit_mode'].map(main_transit_mode_map)
# In the route fields the code 98 is spelled out as the label 'OTHER'.
clean_df[route_fields] = clean_df[route_fields].replace(98, 'OTHER')
clean_df['nights_visited'] = clean_df['nights_visited'] - 1

# Collapse household_income codes >= 15 into code 13, and recode
# same_commute_mode 0 -> 2; all other values are kept as they are.
clean_df['household_income'] = np.where(clean_df['household_income'] >= 15, 13, clean_df['household_income'])
clean_df['same_commute_mode'] = np.where(clean_df['same_commute_mode'] == 0, 2, clean_df['same_commute_mode'])


In [11]:
# Inspect the recoded interview_location column.
clean_df['interview_location']

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
3592    2.0
3593    1.0
3594    1.0
3595    2.0
3596    1.0
Name: interview_location, Length: 3597, dtype: float64

In [12]:
# Persist the pre-processed frame to ``clean_survey_file`` without the index column.
clean_df.to_csv(clean_survey_file, index = False)

### Select Variables to verify for the survey

In [13]:
def _model_field_names(model):
    """Return the declared field names of a pydantic model class as a new list.

    Prefers ``model_fields`` (pydantic v2) and falls back to ``__fields__``,
    the pydantic v1 accessor that v2 keeps only as a deprecated alias.
    """
    declared = getattr(model, "model_fields", None)
    if declared is None:
        declared = model.__fields__
    return list(declared)


# Field names per model. 'trip' is removed from the person-level models (it is
# re-attached later from the trip records), and 'respondentid' is added to the
# trip fields so trips can be matched back to their respondent.
respondent_variables = _model_field_names(Respondent)
respondent_variables.remove('trip')

trip_variables = _model_field_names(Trip)
trip_variables.append('respondentid')

employee_variables = _model_field_names(Employee)
employee_variables.remove('trip')

air_passenger_variables = _model_field_names(AirPassenger)
air_passenger_variables.remove('trip')

# Order-preserving de-duplication: unlike list(set(...)), the column order is the
# same on every kernel start, so displayed frames and exported CSVs are reproducible.
variables_to_verify = list(dict.fromkeys(
    air_passenger_variables + respondent_variables + trip_variables + employee_variables
))

# Keep only the model columns, and only rows that have a market segment.
working_df = clean_df.loc[clean_df['marketsegment'].notna(), variables_to_verify].copy()
working_df.head()

Unnamed: 0,reasons_no_transit_too_complicated,destination_name,general_modes_used_visitor_bicycle_personal_non_electric_bicycle,alt_commute_mode_uber_lyft,alt_commute_mode_walk,airport_access_transit_use_elsewhere,household_income,origin_city,transit_boarding_latitude,number_persons_in_household,...,inbound_or_outbound,party_includes_child_aged06to17,transit_alighting_longitude,general_modes_used_visitor_walk,transit_alighting_latitude,race_aian,sdia_accessmode_split_bicycle_non_electric_bikeshare,number_commute_days,general_modes_used_visitor_drove_alone_and_parked,sdia_accessmode_split_rental_car_parked
0,,,,,,,11,,,3,...,2,,,,,No,,,,
1,,,No,,,1.0,10,Mexicali,32.732843,6,...,1,,-117.198122,No,32.732,No,No,,No,No
2,,,No,,,6.0,10,Encinitas,,3,...,1,,,No,,No,,,No,
3,,,No,,,2.0,13,Solana Beach,,4,...,1,No,,No,,No,,,No,
4,,,,,,,14,San Diego,,2,...,1,,,,,No,,,,


In [14]:
# Sanity check: (rows, columns) after column selection and the marketsegment filter.
working_df.shape

(3597, 237)

### Serialize the data

In [15]:
# Split the working frame into trip-level and person-level columns.
trips_df = working_df[trip_variables].copy()

# The three person-level models share fields, so de-duplicate while keeping a
# stable column order. (The previous ``list[set(...)]`` used square brackets,
# which builds a generic-alias object rather than a list of column names.)
person_variables = list(dict.fromkeys(
    employee_variables + respondent_variables + air_passenger_variables
))
persons_df = working_df[person_variables].copy()

In [16]:
# Combine the two record sets: trip records are the child list and person records
# the parent list, both keyed on "respondentid"; "trip" is the parent variable.
trip_records = trips_df.to_dict(orient="records")
person_records = persons_df.to_dict(orient="records")

respondent_list = add_list_objects(
    trip_records,      # child list
    "respondentid",    # child key
    person_records,    # parent list
    "respondentid",    # parent key
    "trip",            # parent var
)

In [17]:
# Number of combined respondent records.
len(respondent_list)

3597

In [18]:
# Validate each respondent record against the pydantic model that matches its
# market segment. Records that raise a ValidationError are tagged with the error
# text and collected separately instead of stopping the run.
employee_list = []
air_passenger_list = []
other_list = []
failed_records = []

# marketsegment code -> (model class, list that collects validated instances)
segment_dispatch = {
    2: (Employee, employee_list),            # e.Type.EMPLOYEE
    1: (AirPassenger, air_passenger_list),   # e.Type.PASSENGER
}
fallback_dispatch = (Respondent, other_list)

for respondent in respondent_list:
    market_segment = respondent["marketsegment"]
    model_cls, validated_bucket = segment_dispatch.get(market_segment, fallback_dispatch)
    try:
        validated_bucket.append(model_cls(**respondent))
    except ValidationError as err:
        # Annotate the raw record itself so the failure reason travels with it.
        respondent['error_flag'] = 'failed'
        respondent['error_message'] = str(err)
        failed_records.append(respondent)

failed_df = pd.DataFrame(failed_records)
failed_df.head()

Unnamed: 0,reasons_no_transit_too_complicated,destination_name,general_modes_used_visitor_bicycle_personal_non_electric_bicycle,alt_commute_mode_uber_lyft,alt_commute_mode_walk,airport_access_transit_use_elsewhere,household_income,origin_city,transit_boarding_latitude,number_persons_in_household,...,general_modes_used_visitor_walk,transit_alighting_latitude,race_aian,sdia_accessmode_split_bicycle_non_electric_bikeshare,number_commute_days,general_modes_used_visitor_drove_alone_and_parked,sdia_accessmode_split_rental_car_parked,trip,error_flag,error_message
0,,,,No,No,,11,San Diego,,3,...,,,No,,6.0,,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,1 validation error for Employee\nsame_commute_...
1,,,No,,,2.0,12,San Diego,,1,...,No,,No,,,No,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,2 validation errors for AirPassenger\ntrip.tax...
2,,,,No,No,,14,Los Angeles,,3,...,,,No,,2.0,,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,1 validation error for Employee\nsame_commute_...
3,,,,No,No,,14,San Diego,,6,...,,,No,,5.0,,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,1 validation error for Employee\nsame_commute_...
4,,,No,,,5.0,14,San Diego,,4,...,No,,No,,,No,,"{'inbound_or_outbound': 1, 'origin_activity_ty...",failed,2 validation errors for AirPassenger\ntrip.tax...


In [19]:
# (rows, columns) of the records that failed validation.
failed_df.shape

(47, 240)

In [20]:
# Peek at the first few validation error messages.
failed_df['error_message'].head()

0    1 validation error for Employee\nsame_commute_...
1    2 validation errors for AirPassenger\ntrip.tax...
2    1 validation error for Employee\nsame_commute_...
3    1 validation error for Employee\nsame_commute_...
4    2 validation errors for AirPassenger\ntrip.tax...
Name: error_message, dtype: object

In [22]:
# Export the failed records for review. The path is built from ``processed_dir``
# (set in the configuration cell) rather than repeating the directory literal,
# so moving the data folder only needs one change.
failed_records_file = os.path.join(processed_dir, "failed_records.csv")
failed_df.to_csv(failed_records_file, index=False)

In [23]:
# Number of records that failed validation.
len(failed_df)

47