In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Enterprise Station Dataset Preprocessing

In [2]:
# Relative path to the raw Enterprise station CSV export.
data_path = '../data/raw/enterprise_station.csv'

# Read the file with pandas' default parsing, so column dtypes are inferred.
raw_df = pd.read_csv(filepath_or_buffer=data_path)

## Data Summary

In [3]:
# Preview the first rows to sanity-check the parsed columns and values.
raw_df.head()

Unnamed: 0,index,tid,loc_name,loc_number,loc_type,address_1,country,city,state,phone,postal_code,group_branch_number,latitude,longitude,update_timestamp,RunDate,InsertUpdateTime,RunID,brand
0,0,103985,Lihue Kuhio Hwy.,LIHC61,CITY,3-3257 Kuhio Hwy,US,Lihue,HI,+1 808-241-5580,96766,36EL,21.990878,-159.366127,2022-03-25 00:00:00,2022-05-03 08:50:05,2022-05-03 08:53:49,46292,ENTERPRISE
1,1,103986,Lihue Airport,LIHT61,AIRPORT,3276 Hoolimalima Pl,US,Lihue,HI,+1 844-914-1553,96766,3642,21.9799,-159.3502,2021-05-03 00:00:00,2022-05-03 08:50:05,2022-05-03 08:53:49,46292,ENTERPRISE
2,2,103987,Schofield Barracks (Military Only),E13651,CITY,694 Mccormack Ave Bldg 694,US,Schofield Barrack,HI,+1 808-671-5399,96786,3651,21.498478,-158.008438,2022-04-06 00:00:00,2022-05-03 08:50:05,2022-05-03 08:54:09,46292,ENTERPRISE
3,3,103988,Molokai Airport,MKKT71,AIRPORT,Bldg #2 Airport Loop Mkk,US,Hoolehua,HI,+1 808-567-6381,96729,36A8,21.156484,-157.0984,2021-10-07 00:00:00,2022-05-03 08:50:05,2022-05-03 08:55:44,46292,ALAMO
4,4,103989,Kailua,E13610,CITY,134a Hamakua Dr,US,Kailua,HI,+1 808-261-4282,96734,3610,21.391604,-157.743429,2022-03-25 00:00:00,2022-05-03 08:50:05,2022-05-03 08:55:55,46292,ENTERPRISE


In [4]:
# List each column with its non-null count and inferred dtype.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                4177 non-null   int64  
 1   tid                  4177 non-null   int64  
 2   loc_name             4177 non-null   object 
 3   loc_number           4177 non-null   object 
 4   loc_type             4177 non-null   object 
 5   address_1            4177 non-null   object 
 6   country              4177 non-null   object 
 7   city                 4177 non-null   object 
 8   state                4177 non-null   object 
 9   phone                4177 non-null   object 
 10  postal_code          4177 non-null   int64  
 11  group_branch_number  4177 non-null   object 
 12  latitude             4177 non-null   float64
 13  longitude            4177 non-null   float64
 14  update_timestamp     4177 non-null   object 
 15  RunDate              4177 non-null   o

In [5]:
# include='all' adds the non-numeric columns (count/unique/top/freq) to the
# usual numeric summary, so every column appears in one table.
raw_df.describe(include='all')

Unnamed: 0,index,tid,loc_name,loc_number,loc_type,address_1,country,city,state,phone,postal_code,group_branch_number,latitude,longitude,update_timestamp,RunDate,InsertUpdateTime,RunID,brand
count,4177.0,4177.0,4177,4177,4177,4177,4177,4177,4177,4177,4177.0,4177,4177.0,4177.0,4177,4177,4177,4177.0,4177
unique,,,3923,4177,5,4086,1,2251,52,4035,,4177,,,442,1,1145,,3
top,,,Madison,LIHC61,CITY,1 Glen Rd,US,Houston,CA,+1 214-688-4396,,36EL,,,2022-04-19 00:00:00,2022-05-03 08:50:05,2022-05-03 14:12:16,,ENTERPRISE
freq,,,7,1,3691,3,4177,50,421,7,,1,,,144,4177,55,,4171
mean,2088.0,106073.0,,,,,,,,,49634.113239,,36.999051,-91.196025,,,,46292.0,
std,1205.940366,1205.940366,,,,,,,,,29707.040714,,5.448131,16.467218,,,,0.0,
min,0.0,103985.0,,,,,,,,,603.0,,18.00594,-159.366127,,,,46292.0,
25%,1044.0,105029.0,,,,,,,,,25414.0,,33.55422,-97.600942,,,,46292.0,
50%,2088.0,106073.0,,,,,,,,,45014.0,,37.8648,-86.111582,,,,46292.0,
75%,3132.0,107117.0,,,,,,,,,77450.0,,40.900747,-79.431661,,,,46292.0,


In [6]:
# Count null entries per column; isna() is the same check as isnull().
missing_values = raw_df.isna().sum()
print("Missing or null values in each column:\n", missing_values)

Missing or null values in each column:
 index                  0
tid                    0
loc_name               0
loc_number             0
loc_type               0
address_1              0
country                0
city                   0
state                  0
phone                  0
postal_code            0
group_branch_number    0
latitude               0
longitude              0
update_timestamp       0
RunDate                0
InsertUpdateTime       0
RunID                  0
brand                  0
dtype: int64


#### Dropping Missing Values
Since no column contains null values, there is nothing to drop and no missing-value handling is needed.

In [8]:
# No rows need dropping because no column contains nulls. Take an explicit copy
# rather than a second name for the same object, so that any later in-place
# change to cleaned_df cannot silently alter raw_df.
cleaned_df = raw_df.copy()
cleaned_df

Unnamed: 0,index,tid,loc_name,loc_number,loc_type,address_1,country,city,state,phone,postal_code,group_branch_number,latitude,longitude,update_timestamp,RunDate,InsertUpdateTime,RunID,brand
0,0,103985,Lihue Kuhio Hwy.,LIHC61,CITY,3-3257 Kuhio Hwy,US,Lihue,HI,+1 808-241-5580,96766,36EL,21.990878,-159.366127,2022-03-25 00:00:00,2022-05-03 08:50:05,2022-05-03 08:53:49,46292,ENTERPRISE
1,1,103986,Lihue Airport,LIHT61,AIRPORT,3276 Hoolimalima Pl,US,Lihue,HI,+1 844-914-1553,96766,3642,21.979900,-159.350200,2021-05-03 00:00:00,2022-05-03 08:50:05,2022-05-03 08:53:49,46292,ENTERPRISE
2,2,103987,Schofield Barracks (Military Only),E13651,CITY,694 Mccormack Ave Bldg 694,US,Schofield Barrack,HI,+1 808-671-5399,96786,3651,21.498478,-158.008438,2022-04-06 00:00:00,2022-05-03 08:50:05,2022-05-03 08:54:09,46292,ENTERPRISE
3,3,103988,Molokai Airport,MKKT71,AIRPORT,Bldg #2 Airport Loop Mkk,US,Hoolehua,HI,+1 808-567-6381,96729,36A8,21.156484,-157.098400,2021-10-07 00:00:00,2022-05-03 08:50:05,2022-05-03 08:55:44,46292,ALAMO
4,4,103989,Kailua,E13610,CITY,134a Hamakua Dr,US,Kailua,HI,+1 808-261-4282,96734,3610,21.391604,-157.743429,2022-03-25 00:00:00,2022-05-03 08:50:05,2022-05-03 08:55:55,46292,ENTERPRISE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4172,4172,108157,Key Largo,E14128,CITY,100149 Overseas Highway,US,Key Largo,FL,+1 305-451-3998,33037,4128,25.100639,-80.433584,2020-08-27 00:00:00,2022-05-03 08:50:05,2022-05-03 17:05:36,46292,ENTERPRISE
4173,4173,108158,Florida Keys Marathon Airport,E14186,AIRPORT,9400 Overseas Hwy.,US,Marathon,FL,+1 305-289-7630,33050,4186,24.725854,-81.047595,2021-10-11 00:00:00,2022-05-03 08:50:05,2022-05-03 17:08:51,46292,ENTERPRISE
4174,4174,108159,Boca Chica Naval Air Military Only,E141BC,CITY,Bldg A-4203 Midway Ave,US,Key West Naval Airstation,FL,+1 305-292-0220,33040,41BC,24.581717,-81.689769,2022-04-19 00:00:00,2022-05-03 08:50:05,2022-05-03 17:08:52,46292,ENTERPRISE
4175,4175,108160,Key West International Airport,EYWT61,AIRPORT,3491 S Roosevelt Blvd,US,Key West,FL,+1 844-958-1292,33040,41KE,24.555600,-81.758300,2021-07-13 00:00:00,2022-05-03 08:50:05,2022-05-03 17:08:52,46292,ENTERPRISE


## Data Type Conversions

In [9]:
# postal_code was inferred as int64 on load, which strips the leading zeros of
# US ZIP codes (the describe() summary shows a minimum of 603, i.e. "00603").
# Restore the zero-padded 5-character text form. assign() returns a new frame,
# so cleaned_df itself is left untouched.
processed_df = cleaned_df.assign(
    postal_code=cleaned_df['postal_code'].astype(str).str.zfill(5)
)
print("Columns in cleaned_df:", processed_df.columns)


Columns in cleaned_df: Index(['index', 'tid', 'loc_name', 'loc_number', 'loc_type', 'address_1',
       'country', 'city', 'state', 'phone', 'postal_code',
       'group_branch_number', 'latitude', 'longitude', 'update_timestamp',
       'RunDate', 'InsertUpdateTime', 'RunID', 'brand'],
      dtype='object')


## Saving the Cleaned Dataset

In [10]:
cleaned_file_path = '../data/cleaned/enterprise_station.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise this cell fails on a fresh checkout.
Path(cleaned_file_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the saved file.
processed_df.to_csv(cleaned_file_path, index=False)

print("\nData Preprocessing Complete. Cleaned data saved to:", cleaned_file_path)


Data Preprocessing Complete. Cleaned data saved to: ../data/cleaned/enterprise_station.csv
