In [2]:
import pandas as pd 
import numpy as np
import os

In [3]:
# Project data directories, given relative to the notebook's working directory.
external_dir = "../data/external"
interim_dir = "../data/interim"
processed_dir = "../data/processed"


# Earlier pilot exports; the active load code reads only `input_file`.
input_file_1 = os.path.join(external_dir, "etc/od_20241002_sandag_airport_pilot.xlsx")
input_file_2 = os.path.join(external_dir, "etc/od_20241004_sandag_airport_pilot_2.xlsx")
input_file_3 = os.path.join(external_dir, "etc/od_20241010_sandag_airport_pilot_3.xlsx") 
input_file = os.path.join(external_dir, "etc/od_20241015_sandag_airport_pilot_4.xlsx") #pilot survey 4, latest
# CSV whose ETC_name / WSP_name columns drive the header renaming.
variable_map_file = os.path.join(processed_dir, "revised_names.csv")

In [4]:
# Only the latest pilot export is loaded. The earlier exports
# (input_file_1 .. input_file_3) used to be read with pd.read_excel and stacked
# with pd.concat(..., ignore_index=True); that path is currently disabled.
in_df = pd.read_excel(input_file)

# Build the ETC -> WSP column-name lookup from the variable map.
header_df = pd.read_csv(variable_map_file)[['ETC_name', 'WSP_name']]
header_dict = dict(zip(header_df['ETC_name'], header_df['WSP_name']))

# Apply the new names, then discard every column that was renamed to "delete".
clean_df = in_df.rename(columns=header_dict).drop(columns=["delete"])

In [5]:
clean_df.shape

(3597, 302)

In [6]:
clean_df.head()

Unnamed: 0,respondentid,date_completed,interview_location,interview_location_label,interview_location_other,inbound_or_outbound,inbound_or_outbound_label,marketsegment,marketsegment_label,is_qualified_age,...,household_income_label,is_income_below_poverty,number_workers,number_workers_label,sp_invitation,sp_invitation_label,stay_informed,survey_language,survey_language_label,survey_language_other
0,4273,9/30/2024,Term1,Terminal 1,,OUT,OUTBOUND,1,Air passenger,YES,...,"$75,000-$99,999",,2,TWO (2),2.0,No,NO,ENGLISH,ENGLISH,
1,4282,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$60,000-$74,999",,6,SIX (6),1.0,Yes,,SPANI,SPANISH,
2,4283,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$60,000-$74,999",,0,NONE (0),2.0,No,NO,ENGLISH,ENGLISH,
3,4286,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,"$150,000 or more",,2,TWO (2),1.0,Yes,,ENGLISH,ENGLISH,
4,4290,9/30/2024,Term1,Terminal 1,,IN,INBOUND,1,Air passenger,YES,...,Prefer not to say,No,0,NONE (0),2.0,No,NO,ENGLISH,ENGLISH,


In [7]:
# Keep only the first record for each respondentid and renumber the rows.
# reset_index is chained (no inplace=True) so the cell always rebinds
# clean_df to a freshly built frame instead of mutating the intermediate
# result of drop_duplicates.
clean_df = (
    clean_df
    .drop_duplicates(subset='respondentid', keep='first')
    .reset_index(drop=True)
)
clean_df.shape

(3597, 302)

In [8]:
# Columns in which at least one cell holds the literal '-oth-' marker.
cols_with_oth = list(
    filter(lambda name: '-oth-' in clean_df[name].values, clean_df.columns)
)
cols_with_oth

['interview_location',
 'flight_purpose',
 'shift_start_airport_building',
 'employer',
 'occupation',
 'origin_activity_type',
 'main_mode',
 'access_mode',
 'parking_location',
 'parking_cost_frequency',
 'car_available',
 'reverse_mode_predicted',
 'reverse_commute_mode',
 'same_commute_mode',
 'gender']

In [9]:
# Of the '-oth-' columns, those that also already contain the value 98.
cols_with_oth_and_98 = list(
    filter(lambda name: 98 in clean_df[name].values, cols_with_oth)
)
cols_with_oth_and_98

['flight_purpose',
 'shift_start_airport_building',
 'employer',
 'occupation',
 'origin_activity_type',
 'access_mode',
 'parking_location',
 'parking_cost_frequency',
 'car_available',
 'reverse_mode_predicted',
 'same_commute_mode',
 'gender']

In [10]:
def replace_oth_with_98(df):
    """Return a copy of ``df`` in which every ``'-oth-'`` cell is replaced by 98.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``; cells equal to the
        string ``'-oth-'`` hold the integer ``98``, all other cells are
        carried over unchanged.
    """
    # Work on a copy: assigning columns back into the caller's frame mutates
    # the argument and, when that frame was derived from another one, is what
    # makes pandas warn about setting values on a copy.
    cleaned = df.copy()
    for col in cleaned.columns:
        cleaned[col] = cleaned[col].replace('-oth-', 98)

    return cleaned

In [11]:
clean_df = replace_oth_with_98(clean_df)

  df[col] = df[col].replace('-oth-', 98)


### Mode summarization starts here

In [12]:
#Select columns relevant to mode
exclude_substrings = ('general', 'alt', 'split', 'same', 'sdia', '_other')
mode_columns = [col for col in clean_df.columns if 'mode' in col and all(sub not in col for sub in exclude_substrings)]
print(mode_columns)

['main_transit_mode', 'main_transit_mode_label', 'main_mode', 'main_mode_label', 'access_mode', 'access_mode_label', 'egress_mode', 'egress_mode_label', 'reverse_mode', 'reverse_mode_label', 'reverse_mode_predicted', 'reverse_mode_predicted_label', 'other_airport_accessmode', 'other_airport_accessmode_label', 'reverse_commute_mode', 'reverse_commute_mode_label']


In [13]:
# Mode columns holding codes: every selected column without "label" in its name.
mode_code_columns = list(filter(lambda name: 'label' not in name, mode_columns))
mode_code_columns

['main_transit_mode',
 'main_mode',
 'access_mode',
 'egress_mode',
 'reverse_mode',
 'reverse_mode_predicted',
 'other_airport_accessmode',
 'reverse_commute_mode']

In [14]:
# Mode columns holding labels: every selected column with "label" in its name.
mode_label_columns = list(filter(lambda name: 'label' in name, mode_columns))
mode_label_columns

['main_transit_mode_label',
 'main_mode_label',
 'access_mode_label',
 'egress_mode_label',
 'reverse_mode_label',
 'reverse_mode_predicted_label',
 'other_airport_accessmode_label',
 'reverse_commute_mode_label']

In [15]:
# Exploratory view: one value_counts Series per mode column, placed side by
# side on the union of their indexes. Codes and labels index different rows,
# so most cells are NaN (see the printed output).
value_counts_df = pd.concat([clean_df[col].value_counts().rename(col) for col in mode_columns], axis=1)

print(value_counts_df)

                                     main_transit_mode  \
3                                               3283.0   
SDA_1_FLYER                                      186.0   
MTS_1_992                                        128.0   
None of the above                                  NaN   
Airport flyer shuttle                              NaN   
...                                                ...   
PERSONAL NON ELECTRIC BICYCLE                      NaN   
OTHER SHARED RIDE VAN SERVICE                      NaN   
RODE WITH OTHER TRAVELER AND PARKED                NaN   
ELECTRIC BIKESHARE                                 NaN   
EMPLOYEE SHUTTLE                                   NaN   

                                     main_transit_mode_label  main_mode  \
3                                                        NaN        NaN   
SDA_1_FLYER                                              NaN        NaN   
MTS_1_992                                                NaN        NaN   
Non

In [16]:
clean_df['main_mode'].value_counts()

main_mode
12.0    1121
10.0    1074
19.0     354
13.0     195
21.0      89
20.0      82
9.0       77
14.0      63
27.0      45
11.0      40
22.0      24
23.0      22
30.0      18
18.0      17
17.0      14
98.0      13
1.0       10
24.0       8
28.0       6
29.0       6
25.0       3
16.0       1
26.0       1
Name: count, dtype: int64

In [17]:
clean_df['main_mode_label'].value_counts()

main_mode_label
Dropped off by car by family/friend                        1121
Uber/Lyft                                                  1074
Rental car: Dropped off at rental agency                    354
Drove alone and parked                                      195
Hotel shuttle van                                            89
Rental car: parked rental car                                82
Taxi                                                         77
Drove with others and parked                                 63
Rental car: Picked up at rental agency                       45
Car service/black car/limo/executive car                     40
Other shared van (please specify)                            24
Picked up by car by family/friend                            22
Other public transit                                         18
Employee shuttle                                             17
Chartered tour bus                                           14
Other                   

In [18]:
clean_df['main_transit_mode'].value_counts()

main_transit_mode
3              3283
SDA_1_FLYER     186
MTS_1_992       128
Name: count, dtype: int64

In [19]:
clean_df['main_transit_mode_label'].value_counts()

main_transit_mode_label
None of the above        3283
Airport flyer shuttle     186
MTS Route 992             128
Name: count, dtype: int64

In [20]:
clean_df['date_completed'] = pd.to_datetime(clean_df['date_completed'])

In [21]:
unique_dates = clean_df['date_completed'].unique()
unique_dates

<DatetimeArray>
['2024-09-30 00:00:00', '2024-10-01 00:00:00', '2024-10-02 00:00:00',
 '2024-10-03 00:00:00', '2024-10-04 00:00:00', '2024-10-05 00:00:00',
 '2024-10-06 00:00:00', '2024-10-07 00:00:00', '2024-10-08 00:00:00',
 '2024-10-09 00:00:00', '2024-10-10 00:00:00', '2024-10-11 00:00:00',
 '2024-10-12 00:00:00', '2024-10-13 00:00:00', '2024-10-14 00:00:00']
Length: 15, dtype: datetime64[ns]

In [31]:
import pandas as pd

def _code_label_records(df, col, label_col):
    """Build the summary records for one (code column, label column) pair."""
    code_counts = df[col].value_counts()
    label_counts = df[label_col].value_counts()
    code_total = code_counts.sum()
    label_total = label_counts.sum()

    # Learn which label belongs to which code from rows where both are present.
    # If a code co-occurs with several labels, the most frequent one wins.
    both_present = df[[col, label_col]].dropna()
    pair_sizes = (
        both_present.groupby([col, label_col], sort=False)
        .size()
        .sort_values(ascending=False, kind='mergesort')
    )
    code_to_label = {}
    for (code, label), _ in pair_sizes.items():
        code_to_label.setdefault(code, label)

    records = []
    for code, n_code in code_counts.items():
        if code in code_to_label:
            label = code_to_label[code]
            n_label = label_counts.loc[label]
            label_share = n_label / label_total
        else:
            # Code recorded without any label: keep the row, leave label blank.
            label, n_label, label_share = np.nan, np.nan, np.nan
        records.append({
            'column': col,
            'value': code,
            'label_value': label,
            'count': n_code,
            'count_share': n_code / code_total,
            'label_count': n_label,
            'label_share': label_share,
        })

    # Labels that never appear next to a code still get a row of their own.
    paired_labels = set(code_to_label.values())
    for label, n_label in label_counts.items():
        if label not in paired_labels:
            records.append({
                'column': col,
                'value': np.nan,
                'label_value': label,
                'count': np.nan,
                'count_share': np.nan,
                'label_count': n_label,
                'label_share': n_label / label_total,
            })

    return records


def get_value_counts_with_labels(df, mode_columns, label_columns):
    """Summarise code frequencies for several columns, each paired with its label.

    ``mode_columns`` and ``label_columns`` are walked in parallel, so the i-th
    label column must describe the i-th code column.

    Codes and labels are matched through the rows in which they occur
    together, not by the rank of their frequencies. Ties between counts, or
    codes that were recorded without a label, therefore cannot shift a label
    onto the wrong code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the two lists.
    mode_columns : list of str
        Names of the code columns.
    label_columns : list of str
        Names of the label columns, in the same order as ``mode_columns``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-null code of each code column, ordered by
        descending count within a column, with the columns:

        - ``column``: name of the code column
        - ``value``: the code
        - ``label_value``: label paired with the code (NaN if none)
        - ``count`` / ``count_share``: occurrences of the code and their
          share of all non-null codes in that column
        - ``label_count`` / ``label_share``: occurrences of the label in the
          label column and their share of all non-null labels

        Labels never seen next to a code are appended with NaN ``value``,
        ``count`` and ``count_share``. With empty column lists an empty frame
        with these columns is returned.
    """
    output_columns = ['column', 'value', 'label_value', 'count', 'count_share', 'label_count', 'label_share']

    records = []
    for col, label_col in zip(mode_columns, label_columns):
        records.extend(_code_label_records(df, col, label_col))

    # A single constructor call keeps the column order fixed and also covers
    # the case where no records were produced.
    return pd.DataFrame(records, columns=output_columns)


In [32]:
value_counts_df = get_value_counts_with_labels(clean_df, mode_code_columns, mode_label_columns)
print(value_counts_df)

                   column        value  \
0       main_transit_mode            3   
1       main_transit_mode  SDA_1_FLYER   
2       main_transit_mode    MTS_1_992   
3               main_mode         12.0   
4               main_mode         10.0   
..                    ...          ...   
104  reverse_commute_mode         15.0   
105  reverse_commute_mode         18.0   
106  reverse_commute_mode         11.0   
107  reverse_commute_mode         98.0   
108  reverse_commute_mode          9.0   

                                           label_value  count  count_share  \
0                                    None of the above   3283     0.912705   
1                                Airport flyer shuttle    186     0.051710   
2                                        MTS Route 992    128     0.035585   
3                  Dropped off by car by family/friend   1121     0.341456   
4                                            Uber/Lyft   1074     0.327140   
..                         

In [33]:
clean_df['date_completed']

0      2024-09-30
1      2024-09-30
2      2024-09-30
3      2024-09-30
4      2024-09-30
          ...    
3592   2024-10-14
3593   2024-10-14
3594   2024-10-14
3595   2024-10-14
3596   2024-10-14
Name: date_completed, Length: 3597, dtype: datetime64[ns]

In [34]:
# Write the overall mode summary into the processed-data directory, built from
# processed_dir so the output location follows the path configuration.
value_counts_df.to_csv(os.path.join(processed_dir, "Mode_summaries.csv"), index=False)

In [35]:
clean_df['access_mode'].value_counts()

access_mode
1.0     114
13.0     72
12.0     38
10.0     21
16.0     15
98.0      9
11.0      8
14.0      6
15.0      3
9.0       2
Name: count, dtype: int64

In [36]:
clean_df['access_mode_label'].value_counts()

access_mode_label
Walk                                        114
Drove alone and parked                       72
Dropped off by car by family/friend          38
Uber/Lyft                                    21
Other public transit                         15
Other                                         9
Car service/black car/limo/executive car      8
Drove with others and parked                  6
Rode with other traveler(s) and parked        3
Taxi                                          2
Name: count, dtype: int64

In [37]:
clean_df['reverse_mode_predicted'].value_counts()

reverse_mode_predicted
12.0    533
10.0    370
13.0     91
14.0     49
11.0     21
9.0      18
17.0     17
98.0     15
16.0     11
21.0     11
18.0      9
20.0      7
15.0      6
24.0      6
1.0       4
22.0      3
23.0      3
3.0       2
4.0       1
7.0       1
2.0       1
Name: count, dtype: int64

In [55]:
clean_df['reverse_mode_predicted_label'].value_counts()

reverse_mode_predicted_label
Dropped off by car by family/friend         533
Uber/Lyft                                   370
Drove alone and parked                       91
Drove with others and parked                 49
Car service/black car/limo/executive car     21
Taxi                                         18
Airport flyer shuttle                        17
Other                                        15
MTS Route 992                                11
Rental car: Dropped off at rental agency     11
Other public transit                          9
Employee shuttle                              7
Rode with other traveler(s) and parked        6
Other shared van (please specify)             6
Walk                                          4
Rental car: parked rental car                 3
Hotel shuttle van                             3
Bicycle: electric bikeshare                   2
Bicycle: non-electric bikeshare               1
Bicycle: personal non-electric bicycle        1
Wheelchair 

In [56]:
#main_mode code = 16 label and code don't exist
#access_mode, reverse_mode_predicted 

In [38]:
unique_dates = clean_df['date_completed'].unique()
unique_dates

<DatetimeArray>
['2024-09-30 00:00:00', '2024-10-01 00:00:00', '2024-10-02 00:00:00',
 '2024-10-03 00:00:00', '2024-10-04 00:00:00', '2024-10-05 00:00:00',
 '2024-10-06 00:00:00', '2024-10-07 00:00:00', '2024-10-08 00:00:00',
 '2024-10-09 00:00:00', '2024-10-10 00:00:00', '2024-10-11 00:00:00',
 '2024-10-12 00:00:00', '2024-10-13 00:00:00', '2024-10-14 00:00:00']
Length: 15, dtype: datetime64[ns]

In [39]:
# Summarise the mode columns separately for each survey date and stack the
# results. The per-date frames are collected first and concatenated once,
# rather than re-concatenating a growing frame on every iteration, and the
# overall value_counts_df computed earlier is no longer overwritten.
daily_summaries = [
    get_value_counts_with_labels(
        clean_df[clean_df['date_completed'] == survey_date],
        mode_code_columns,
        mode_label_columns,
    ).assign(date=survey_date)
    for survey_date in unique_dates
]
# pd.concat rejects an empty list, so fall back to an empty frame.
total_df = pd.concat(daily_summaries, ignore_index=True) if daily_summaries else pd.DataFrame()


In [40]:
total_df.head()

Unnamed: 0,column,value,label_value,count,count_share,label_count,label_share,date
0,main_transit_mode,3,None of the above,117,0.975,117.0,0.975,2024-09-30
1,main_transit_mode,SDA_1_FLYER,Airport flyer shuttle,3,0.025,3.0,0.025,2024-09-30
2,main_mode,12.0,Dropped off by car by family/friend,35,0.299145,35.0,0.301724,2024-09-30
3,main_mode,10.0,Uber/Lyft,30,0.25641,30.0,0.258621,2024-09-30
4,main_mode,19.0,Rental car: Dropped off at rental agency,19,0.162393,19.0,0.163793,2024-09-30


In [41]:
# Write the per-date mode summary into the processed-data directory, built
# from processed_dir so the output location follows the path configuration.
total_df.to_csv(os.path.join(processed_dir, "Mode_summaries_by_date.csv"), index=False)