In [1]:
import sys
import os
import importlib
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
import pandas as pd
import numpy as np
from pydantic import ValidationError
import data_model
import enums as e
from utils import extract_base_type, add_enum_label_columns, add_list_objects, add_synthetic_records
import datetime

In [3]:
importlib.reload(data_model)
importlib.reload(e)
from data_model import Respondent, Employee, AirPassenger, Trip

In [4]:
external_dir = "../data/external"
interim_dir = "../data/interim"
processed_dir = "../data/processed"

input_file1 = os.path.join(external_dir, "etc/od_20241121_sandag_airport_draftfinal.xlsx") # latest
input_file2 = os.path.join(external_dir, "etc/od_20241015_sandag_airport_pilot_4.xlsx") #older version but records needed
variable_map_file = os.path.join(processed_dir, "revised_names.csv")
clean_survey_file = os.path.join(interim_dir, "survey_data_clean.csv")
output_csv_filename = os.path.join(processed_dir, "data_model_output.csv")
#summary_csv_filename = os.path.join(processed_dir, "data_model_output_summary.csv")

### Clean Data , Rename fields

In [5]:
in_df_complete1 = pd.read_excel(input_file1, sheet_name = 0)
in_df_incomplete1 = pd.read_excel(input_file1, sheet_name = 1)

in_df_complete2 = pd.read_excel(input_file2, sheet_name = 0)
in_df_incomplete2 = pd.read_excel(input_file2, sheet_name = 1)

in_df_complete = pd.concat([in_df_complete1, in_df_complete2], ignore_index = True)
in_df_incomplete = pd.concat([in_df_incomplete1, in_df_incomplete2], ignore_index = True)

in_df_complete['is_completed'] = 1
in_df_incomplete['is_completed'] = 0

in_df_complete['weight'] = 1
in_df_incomplete['weight'] = 0

in_df = pd.concat([in_df_complete, in_df_incomplete], ignore_index = True)
header_df = pd.read_csv(variable_map_file)[['ETC_name','WSP_name']]
header_dict = pd.Series(header_df.WSP_name.values,index=header_df.ETC_name).to_dict()
clean_df = in_df.rename(columns=header_dict).copy().drop(columns=["delete"])

  in_df_complete['is_completed'] = 1
  in_df_complete['weight'] = 1


In [6]:
print("Complete Records: ", in_df_complete.shape)
print("Incomplete Records: ", in_df_incomplete.shape)

Complete Records:  (7532, 372)
Incomplete Records:  (9069, 29)


In [7]:
clean_df.shape

(16601, 315)

In [8]:
len(clean_df['respondentid'].unique())

5104

In [9]:
#Remove the duplicate respondentids
clean_df.drop_duplicates('respondentid', keep = 'first', inplace = True)
clean_df.shape

(5104, 315)

In [10]:
clean_df[clean_df['is_completed']==1].shape

(4731, 315)

In [11]:
clean_df.head()

Unnamed: 0,respondentid,submit,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,inbound_or_outbound_label,marketsegment,marketsegment_label,...,number_workers_label,sp_invitation,sp_invitation_label,stay_informed,stay_informed_label,survey_language,survey_language_label,survey_language_other,is_completed,weight
0,5385,YES,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,THREE (3),2.0,No,0.0,NO,ENGLISH,ENGLISH,,1,1
1,5386,YES,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,TWO (2),1.0,Yes,,,ENGLISH,ENGLISH,,1,1
2,5387,NO,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,NONE (0),2.0,No,0.0,NO,ENGLISH,ENGLISH,,1,1
3,5388,YES,2024-10-04 00:00:00,SDA_1_FLYER,San Diego Flyer/Old Town Shuttle,,IN,INBOUND,1.0,Air passenger,...,TWO (2),2.0,No,0.0,NO,ENGLISH,ENGLISH,,1,1
4,5389,YES,2024-10-04 00:00:00,Term2,Terminal 2,,IN,INBOUND,1.0,Air passenger,...,THREE (3),1.0,Yes,,,ENGLISH,ENGLISH,,1,1


### Commonly occuring invalid values

In [12]:
# Get the list of columns that contain '-oth-' as a value
columns_with_oth_value = [col for col in clean_df.columns if clean_df[col].eq('-oth-').any()]

print(columns_with_oth_value)

['interview_location']


In [13]:
columns_with_dash_value = [col for col in clean_df.columns if clean_df[col].eq('-').any()]

print(columns_with_dash_value)

['flight_number', 'origin_city', 'origin_state', 'origin_zip', 'destination_city', 'destination_state', 'destination_zip', 'transit_boarding_stop_name', 'transit_boarding_latitude', 'transit_boarding_longitude', 'transit_alighting_stop_name', 'transit_alighting_latitude', 'transit_alighting_longitude', 'home_location_city', 'home_location_state', 'home_location_zip', 'home_location_latitude', 'home_location_longitude']


### Making all modes consistent

In [14]:
clean_df['egress_mode_label'].value_counts()

egress_mode_label
Walk                                 25
Picked up by car by family/friend    12
Drive alone and park                  2
Uber/Lyft                             2
Other shared van (please specify)     1
Taxi                                  1
Name: count, dtype: int64

In [15]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
DROPPED OFF BY CAR BY FRIEND FAMILY               242
UBER LYFT                                         107
DROVE ALONE AND PARKED                             42
DROVE WITH OTHERS AND PARKED                       28
OTHER PUBLIC TRANSIT                               24
RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY     22
TAXI                                               14
WHEELCHAIR OR OTHER MOBILITY DEVICE                11
RENTAL CAR AND PARKED IT                           11
WALK                                                9
CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR            4
HOTEL SHUTTLE VAN                                   3
RODE WITH OTHER TRAVELER AND PARKED                 2
NON ELECTRIC BIKESHARE                              2
PERSONAL NON ELECTRIC BICYCLE                       1
OTHER SHARED RIDE VAN SERVICE                       1
ELECTRIC BIKESHARE                                  1
EMPLOYEE SHUTTLE                                   

In [16]:
other_airport_accessmode_label_map = {
    'Walk': 'Walk',
    'Wheelchair or other mobility device': 'Wheelchair or other mobility device',
    'ELECTRIC BIKESHARE': 'Bicycle: electric bikeshare',
    'NON ELECTRIC BIKESHARE': 'Bicycle: non-electric bikeshare',
    'E SCOOTER SHARE': 'E-scooter: shared',
    'PERSONAL ELECTRIC BICYCLE': 'Bicycle: personal electric bicycle',
    'PERSONAL NON ELECTRIC BICYCLE': 'Bicycle: personal non-electric bicycle',
    'PERSONAL E SCOOTER': 'E-scooter: personal',
    'Taxi': 'Taxi',
    'UBER LYFT': 'Uber/Lyft',
    'CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR': 'Car service/black car/limo/executive car',
    'DROPPED OFF BY CAR BY FRIEND FAMILY': 'Dropped off by car by family/friend',
    'Drove alone and parked': 'Drove alone and parked',
    'Drove with others and parked': 'Drove with others and parked',
    'RODE WITH OTHER TRAVELER AND PARKED': 'Rode with other traveler(s) and parked',
    'Other public transit': 'Other public transit',
    'Chartered tour bus': 'Chartered tour bus',
    'Employee shuttle': 'Employee shuttle',
    'RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY': 'Rental car: Dropped off at rental agency',
    'RENTAL CAR AND PARKED IT': 'Rental car: parked rental car',
    'Hotel shuttle van': 'Hotel shuttle van',
    'OTHER SHARED RIDE VAN SERVICE': 'Other shared van (please specify)',
    'Other': 'Other',
    'Refused/No Answer': 'Refused/No Answer'
}
clean_df['other_airport_accessmode_label'] = clean_df['other_airport_accessmode_label'].map(other_airport_accessmode_label_map)

In [17]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         242
Uber/Lyft                                   107
Rental car: Dropped off at rental agency     22
Rental car: parked rental car                11
Car service/black car/limo/executive car      4
Rode with other traveler(s) and parked        2
Bicycle: non-electric bikeshare               2
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Bicycle: electric bikeshare                   1
Name: count, dtype: int64

In [18]:
travel_mode_dict = {
    'Walk': 1,
    'Wheelchair or other mobility device': 2,
    'Bicycle: electric bikeshare': 3,
    'Bicycle: non-electric bikeshare': 4,
    'E-scooter: shared': 5,
    'Bicycle: personal electric bicycle': 6,
    'Bicycle: personal non-electric bicycle': 7,
    'E-scooter: personal': 8,
    'Taxi': 9,
    'Uber/Lyft': 10,
    'Car service/black car/limo/executive car': 11,
    'Dropped off by car by family/friend': 12,
    'Drove alone and parked': 13,
    'Drove with others and parked': 14,
    'MTS Route 992': 15,
    'Airport flyer shuttle': 16,
    'Chartered tour bus': 17,
    'Employee shuttle': 18,
    'Rental car: Dropped off at rental agency': 19,
    'Rental car: parked rental car': 20,
    'Hotel shuttle van': 21,
    'Other shared van (please specify)': 22,
    'Picked up by car by family/friend': 23,
    'Get in a parked vehicle and drive alone': 24,
    'Get in a parked vehicle and drive with others': 25,
    'Get in a parked vehicle and ride with other traveler(s)': 26,
    'Rental car: Picked up at rental agency': 27,
    'Rental car: get in a parked rental car': 28,
    'Rode with other traveler(s) and parked': 29,
    'Other public transit': 30,
    'Other': 98,
    'Refused/No Answer': 99,
    'None of the above': 98
}

### Modes to fix

In [19]:
mode_code_columns = ['main_transit_mode', 'main_mode', 'access_mode', 'egress_mode', 'reverse_mode', 'reverse_mode_predicted', 'other_airport_accessmode', 'reverse_commute_mode']
mode_label_columns = ['main_transit_mode_label', 'main_mode_label', 'access_mode_label', 'egress_mode_label', 'reverse_mode_label', 'reverse_mode_predicted_label', 'other_airport_accessmode_label', 'reverse_commute_mode_label']

In [20]:
#Remapping codes using label strings
for mode_code_col, mode_label_col in zip(mode_code_columns, mode_label_columns):
    # Apply the mapping for each pair of columns
    clean_df[mode_code_col] = clean_df[mode_label_col].map(travel_mode_dict)

In [21]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         242
Uber/Lyft                                   107
Rental car: Dropped off at rental agency     22
Rental car: parked rental car                11
Car service/black car/limo/executive car      4
Rode with other traveler(s) and parked        2
Bicycle: non-electric bikeshare               2
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Bicycle: electric bikeshare                   1
Name: count, dtype: int64

In [22]:
clean_df['other_airport_accessmode'].value_counts()

other_airport_accessmode
12.0    242
10.0    107
19.0     22
20.0     11
11.0      4
29.0      2
4.0       2
7.0       1
22.0      1
3.0       1
Name: count, dtype: int64

In [23]:
clean_df['main_transit_mode'].value_counts()

main_transit_mode
98.0    4361
16.0     214
15.0     156
Name: count, dtype: int64

### Pre-processing of some fields

In [24]:
clean_df['date_completed'] = pd.to_datetime(clean_df['date_completed'])
clean_df['is_pilot'] = np.where(clean_df['date_completed'].dt.date<=datetime.date(2024, 9, 30), 1, 0)
clean_df['record_type_synthetic'] = 0
clean_df.replace('-oth-', 98, inplace=True)
clean_df.replace('-', None, inplace = True )
clean_df['is_income_below_poverty'] = np.where(clean_df['is_income_below_poverty'] == 0, 2, clean_df['is_income_below_poverty'])
clean_df['stay_informed'] = np.where(clean_df['stay_informed'] == 0, 2, clean_df['stay_informed'])
#Maps
interview_location_map = {'Term1' : 1, 'Term2': 2, 'MTS_1_992': 3, 'SDA_1_FLYER': 4, 'ConracShuttle': 5, 'ParkingShuttle': 6, 'EmplParking': 7, '-oth-':98} 
inbound_outbound_map = {'IN':1, 'OUT':2}

#route_fields:
route_fields = ['to_airport_transit_route_1', 'to_airport_transit_route_2', 'to_airport_transit_route_3', 'to_airport_transit_route_4',
                'from_airport_transit_route_1', 'from_airport_transit_route_2', 'from_airport_transit_route_3', 'from_airport_transit_route_4']

#Replacement
clean_df['interview_location'] = clean_df['interview_location'].map(interview_location_map)
clean_df['inbound_or_outbound'] = clean_df['inbound_or_outbound'].map(inbound_outbound_map)
clean_df['main_mode'] = np.where(clean_df['main_transit_mode'].isin([15,16]), clean_df['main_transit_mode'], clean_df['main_mode'])

clean_df[route_fields] = clean_df[route_fields].replace(98, 'OTHER')
clean_df['nights_visited'] = clean_df['nights_visited'] - 1

clean_df['same_commute_mode'] = np.where(clean_df['same_commute_mode'] == 0, 2, clean_df['same_commute_mode'])
clean_df['resident_visitor_followup'] = np.where(clean_df['resident_visitor_followup'] == 0, 2, clean_df['resident_visitor_followup'])

#activity_type
clean_df['origin_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.OUTBOUND_FROM_AIRPORT, e.ActivityType.SAN_DIEGO_AIRPORT, clean_df['origin_activity_type'])
clean_df['destination_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.INBOUND_TO_AIRPORT, e.ActivityType.SAN_DIEGO_AIRPORT, clean_df['destination_activity_type'])

#For incomplete records:
clean_df['marketsegment'] = clean_df['marketsegment'].fillna(99)


  clean_df.replace('-oth-', 98, inplace=True)


In [25]:
clean_df.to_csv(clean_survey_file, index = False)

In [26]:
clean_df.head()

Unnamed: 0,respondentid,submit,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,inbound_or_outbound_label,marketsegment,marketsegment_label,...,sp_invitation_label,stay_informed,stay_informed_label,survey_language,survey_language_label,survey_language_other,is_completed,weight,is_pilot,record_type_synthetic
0,5385,YES,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,No,2.0,NO,ENGLISH,ENGLISH,,1,1,0,0
1,5386,YES,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,Yes,,,ENGLISH,ENGLISH,,1,1,0,0
2,5387,NO,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,No,2.0,NO,ENGLISH,ENGLISH,,1,1,0,0
3,5388,YES,2024-10-04,4.0,San Diego Flyer/Old Town Shuttle,,1.0,INBOUND,1.0,Air passenger,...,No,2.0,NO,ENGLISH,ENGLISH,,1,1,0,0
4,5389,YES,2024-10-04,2.0,Terminal 2,,1.0,INBOUND,1.0,Air passenger,...,Yes,,,ENGLISH,ENGLISH,,1,1,0,0


### Select Variables to verify for the survey

In [27]:
respondent_variables = [field_name for field_name, field_info in Respondent.__fields__.items()]


trip_variables = [field_name for field_name, field_info in Trip.__fields__.items()]
trip_variables.append('respondentid')

employee_variables = [field_name for field_name, field_info in Employee.__fields__.items()]
employee_variables.remove('trip')

air_passenger_variables = [field_name for field_name, field_info in AirPassenger.__fields__.items()]
air_passenger_variables.remove('trip')

variables_to_verify = list(set(air_passenger_variables + respondent_variables + trip_variables + employee_variables))
variables_to_verify.remove('trip')
variables_to_verify.remove('valid_record')
variables_to_verify.remove('validation_error')
variables_to_verify.remove('validation_severity')


working_df = clean_df.copy()
working_df = working_df[variables_to_verify].copy()
working_df = working_df.loc[working_df['marketsegment'].notna()].copy()
working_df.head()

Unnamed: 0,employee_parking,alt_commute_mode_bicycle_electric_bikeshare,resident_visitor,race_hp,general_modes_used_visitor_rental_car_dropped_off,from_airport_transit_route_2,general_modes_used_visitor_chartered_tour_bus,home_location_zip,from_airport_transit_route_1_other,record_type_synthetic,...,parking_cost_frequency_other,submit,sdia_accessmode_split_uber_lyft,race_other,sdia_accessmode_split_mts992,to_airport_transit_route_3_other,destination_city,reasons_no_transit_ride_too_long,sdia_accessmode_split_other,general_modes_used_visitor_bicycle_electric_bikeshare
0,,,8.0,No,No,,No,,,0,...,,YES,,,,,San Diego,,,No
1,,,1.0,No,,,,,,0,...,,YES,Yes,,No,,San Diego,,,
2,,,8.0,No,No,,No,,,0,...,,NO,,,,,,,,No
3,,,1.0,No,,,,,,0,...,,YES,,,,,San Diego,,,
4,,,1.0,No,,,,,,0,...,,YES,,,,,San Diego,,,


In [28]:
working_df.shape

(5104, 251)

### Serialize the data

In [29]:
trips_df = working_df[trip_variables].copy()
persons_df = working_df[list[set(employee_variables + respondent_variables + air_passenger_variables)]].copy()

In [30]:
# combined
respondent_list = add_list_objects(
        trips_df.to_dict(orient="records"),  #child list
        "respondentid", # child key
        persons_df.to_dict(orient="records"), # parent list
        "respondentid", # parent key
        "trip", # parent var
    )

In [31]:
len(respondent_list)

5104

In [32]:
employee_list = []
air_passenger_list = []
other_list = []
failed_records = []

for respondent in respondent_list:
    market_segment = respondent["marketsegment"]
    try:
        if market_segment == e.Type.EMPLOYEE:
            ev = Employee(** respondent)
            employee_list.append(ev)
        elif market_segment == e.Type.PASSENGER:
             av = AirPassenger(** respondent)
             air_passenger_list.append(av)
        else:
            rv = Respondent(** respondent)
            other_list.append(rv)
    except ValidationError as err:
            respondent['error_flag'] = 'failed'
            respondent['error_message'] = str(err)
            failed_records.append(respondent) 


failed_df = pd.DataFrame(failed_records)
failed_df.head()

In [33]:
failed_df.shape

(0, 0)

In [34]:
#failed_df['error_message'].unique()

In [35]:
#failed_df.to_csv('../data/processed/failed_records.csv', index = False)

In [36]:
len(failed_df)

0

### Make Data

In [37]:
employee_df = pd.DataFrame([Employee.model_dump() for Employee in employee_list])       

In [38]:
passenger_df = pd.DataFrame([AirPassenger.model_dump() for AirPassenger in air_passenger_list])

  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [39]:
other_df = pd.DataFrame([Respondent.model_dump() for Respondent in other_list])
# other_df = add_enum_label_columns(other_df, Respondent)

In [40]:
len(other_list)

102

In [41]:
trip_list = []
id_list = []
for record in employee_list + air_passenger_list + other_list:
    trip_list.append(record.trip)
    id_list.append(record.respondentid)

trip_df = pd.DataFrame([Trip.model_dump() for Trip in trip_list])
id_df = pd.DataFrame(id_list, columns=["respondentid"])

trip_df = pd.concat([id_df, trip_df], axis=1)
trip_df = add_enum_label_columns(trip_df,Trip)


  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [42]:
output_df = pd.concat([employee_df, passenger_df, other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])
output_df = pd.merge(output_df, trip_df, on="respondentid", how="left")

  output_df = pd.concat([employee_df, passenger_df, other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])


In [43]:
output_df.shape

(5104, 278)

In [44]:
output_df = add_synthetic_records(output_df)

In [45]:
output_df =  add_enum_label_columns(output_df, Respondent)
output_df =  add_enum_label_columns(output_df, AirPassenger)
output_df =  add_enum_label_columns(output_df, Trip)
output_df =  add_enum_label_columns(output_df, Employee)

  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum

In [46]:
output_df['destination_activity_type_label'].value_counts()

destination_activity_type_label
SAN_DIEGO_AIRPORT    4863
HOME                 1581
HOTEL                1518
OTHER_RESIDENCE       983
OTHER                 102
nan                    63
USUAL_WORKPLACE        52
OTHER_BUSINESS         50
CONVENTION_CENTER       9
Name: count, dtype: int64

In [47]:
reference_columns = clean_df.columns.tolist()

important_columns = ['is_completed', 'is_pilot', 'record_type_synthetic', 'weight']


# Ensure no duplicate columns by removing important_columns from reference_columns
reference_columns = [col for col in reference_columns if col not in important_columns]

# Identify remaining columns that are not in reference_columns or important_columns
remaining_columns = [col for col in output_df.columns if col not in (reference_columns + important_columns)]

# Create the new column order
new_column_order = ['is_completed', 'is_pilot', 'record_type_synthetic'] + reference_columns + sorted(remaining_columns) + ['weight']

# Reorder the DataFrame
output_df = output_df[new_column_order]

# Display the updated DataFrame
output_df.head()

Unnamed: 0,is_completed,is_pilot,record_type_synthetic,respondentid,submit,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,...,next_flight_destination,non_airport_activity_type,parking_cost_numeric,previous_flight_origin,taxi_fhv_fare_numeric,taxi_fhv_wait_numeric,valid_record,validation_error,validation_severity,weight
0,True,False,0,5473,True,2024-10-04,2.0,TERMINAL_2,,1.0,...,,2.0,100.0,,,,True,,,1.0
1,True,False,0,5476,True,2024-10-04,3.0,ONBOARD_992,,1.0,...,,2.0,,,,,False,Prefer Not to disclose cannot be combined with...,Low,1.0
2,True,False,0,5489,True,2024-10-04,2.0,TERMINAL_2,,1.0,...,,2.0,134.0,,,,True,,,1.0
3,True,False,0,5558,True,2024-10-04,2.0,TERMINAL_2,,1.0,...,,2.0,,,,,True,,,1.0
4,True,False,0,5593,True,2024-10-04,2.0,TERMINAL_2,,1.0,...,,2.0,,,,,True,,,1.0


In [48]:
output_df.index = output_df.index+1
output_df.to_csv(output_csv_filename, index_label = 'unique_id')