In [1]:
import pandas as pd

### **TODO:** Check on Data Quality

In [2]:
# Data Source
# https://www.fhwa.dot.gov/policyinformation/analysisframework/01.cfm

non_bus = pd.read_csv('2008AutoNonbiz.csv',
                      header=None,
                      names=[
                          'origin_code',
                          'destination_code',
                          'n_trips'
                      ]
                     )

print(non_bus.shape)
non_bus.head()

(9897316, 3)


Unnamed: 0,origin_code,destination_code,n_trips
0,1001,1001,0.0
1,1001,1003,2622.0
2,1001,1005,1971.0
3,1001,1007,0.0
4,1001,1009,1219.0


In [3]:
# 66% of Origin Destination Pairs have n_trips values of 0
non_bus['n_trips'].value_counts()[0]/len(non_bus['n_trips'])

0.6601878731567224

In [4]:
# Shorten the DF by Removing all Pairs with Counts of 0 or 1
non_bus = non_bus[non_bus['n_trips'] > 50]
non_bus.shape

(2214298, 3)

In [5]:
# DF is now 33% of Original Length
non_bus.shape[0]/non_bus.shape[0]

1.0

In [6]:
non_bus['n_trips'].describe()

count    2.214298e+06
mean     9.355895e+02
std      1.385785e+04
min      5.100000e+01
25%      9.100000e+01
50%      1.760000e+02
75%      4.590000e+02
max      9.098075e+06
Name: n_trips, dtype: float64

In [7]:
# Non_Bus must be reduced further
# According to my crude calculations... to keep the for loop under 15 min
# we need to reduce the row count to 250,000

In [8]:
# Reduce Non_Bus to 250000 Rows
non_bus = non_bus.sort_values(by='n_trips', ascending=False).iloc[:250000]
non_bus.shape

(250000, 3)

In [9]:
# Data Source:
# https://www.census.gov/geographies/reference-files/2018/demo/popest/2018-fips.html
# TODO
# Rename Columns
codes = pd.read_csv('all-geocodes-v2018.csv',
                    skiprows=4
                  )
print(codes.shape)
codes.head()

(43847, 7)


Unnamed: 0,Summary Level,State Code (FIPS),County Code (FIPS),County Subdivision Code (FIPS),Place Code (FIPS),Consolidtated City Code (FIPS),Area Name (including legal/statistical area description)
0,10,0,0,0,0,0,United States
1,40,1,0,0,0,0,Alabama
2,50,1,1,0,0,0,Autauga County
3,50,1,3,0,0,0,Baldwin County
4,50,1,5,0,0,0,Barbour County


In [10]:
# Add Labeled Cols

origin_state_code = []
for code in non_bus['origin_code']:
    if len(str(code)) > 4:
        origin_state_code.append(str(code)[:2])
    elif len(str(code)) == 4:
        origin_state_code.append(str(code)[0])

destination_state_code = []
for code in non_bus['destination_code']:
    if len(str(code)) > 4:
        destination_state_code.append(str(code)[:2])
    elif len(str(code)) == 4:
        destination_state_code.append(str(code)[0])
        
origin_county_code = [str(code)[-3:] for code in non_bus['origin_code']]
destination_county_code = [str(code)[-3:] for code in non_bus['destination_code']]

assert len(origin_state_code) == len(destination_state_code) == len(origin_county_code) == len(destination_county_code)

In [12]:
# Insert new Columns

origin_state_county = list(zip(origin_state_code, origin_county_code))
destination_state_county = list(zip(destination_state_code, destination_county_code))
non_bus.insert(3, 'origin_state_county', origin_state_county)
non_bus.insert(4, 'destination_state_county', destination_state_county)
non_bus.head(2)

Unnamed: 0,origin_code,destination_code,n_trips,origin_state_county,destination_state_county
695469,6073,6037,9098075.0,"(6, 073)","(6, 037)"
638859,6037,6073,9098075.0,"(6, 037)","(6, 073)"


In [13]:
# Testing 1, 2, 3...
# Test Query String works with Autauga County
# Query Sting Using df.query() is an alternative to bracket filtering 
# that works more efficiently
query_string = f"{int(1)} == `State Code (FIPS)` & {int(1)} == `County Code (FIPS)`"

In [14]:
# Testing 1, 2, 3...
codes.query(query_string)['Area Name (including legal/statistical area description)'].iloc[0]

'Autauga County'

In [15]:
def convert_fips_city(state_county_tuple):
    """
    Takes a tuple of state fips code, county fips code
    And converts to Name String
    """
    state_code = state_county_tuple[0]
    county_code = state_county_tuple[1]
    # mask = (codes['State Code (FIPS)'] == int(state_code)) & (codes['County Code (FIPS)'] == int(county_code))
    query_string = f"{int(state_code)} == `State Code (FIPS)` & {int(county_code)} == `County Code (FIPS)`"
    codes.query(query_string)
    # return codes[mask]['Area Name (including legal/statistical area description)'].iloc[0]
    return codes.query(query_string)['Area Name (including legal/statistical area description)'].iloc[0]

### Which Method is More Efficent for Converting Codes to Name?

In [23]:
%%time
origin = non_bus['origin_state_county'].iloc[:10000].apply(convert_fips_city)

CPU times: user 33.8 s, sys: 60 ms, total: 33.9 s
Wall time: 33.9 s


In [24]:
print(len(origin))
origin[0:5]

10000


1    Autauga County
2    Autauga County
4    Autauga County
7    Autauga County
8    Autauga County
Name: origin_state_county, dtype: object

In [16]:
%%time
# Looks like the For Loop is Slightly More Time Efficient than .Apply()
origin = []
for i in range(non_bus.shape[0]):
    try:
        tup = tuple([origin_state_code[i], origin_county_code[i]])
        city_name = convert_fips_city(tup)
        origin.append(city_name)
    
    except:
        origin.append(f'E: {origin_state_code[i]}, {origin_county_code[i]}')

CPU times: user 13min 12s, sys: 826 ms, total: 13min 13s
Wall time: 13min 13s


In [17]:
print(len(origin))
origin[0:5]

250000


['San Diego County',
 'Los Angeles County',
 'Imperial County',
 'San Diego County',
 'Riverside County']

In [19]:
# Insert Origin Name Row
non_bus.insert(5, 'origin', origin)

ValueError: cannot insert origin, already exists

In [None]:
# TODO: Check on N Errors in Origin

In [20]:
%%time
# Looks like the For Loop is Slightly More Time Efficient than .Apply()
destination = []
for i in range(non_bus.shape[0]):
    try:
        tup = tuple([destination_state_code[i], destination_county_code[i]])
        city_name = convert_fips_city(tup)
        destination.append(city_name)
    
    except:
        destination.append(f'E: {destination_state_code[i]}, {destination_county_code[i]}')

CPU times: user 14min 52s, sys: 1.36 s, total: 14min 53s
Wall time: 35min 27s


In [21]:
non_bus.insert(6, 'destination', destination)

In [22]:
non_bus.head()

Unnamed: 0,origin_code,destination_code,n_trips,origin_state_county,destination_state_county,origin,destination
695469,6073,6037,9098075.0,"(6, 073)","(6, 037)",San Diego County,Los Angeles County
638859,6037,6073,9098075.0,"(6, 037)","(6, 073)",Los Angeles County,San Diego County
619983,6025,6073,4337119.0,"(6, 025)","(6, 073)",Imperial County,San Diego County
695463,6073,6025,4337119.0,"(6, 073)","(6, 025)",San Diego County,Imperial County
682885,6065,6037,4009696.0,"(6, 065)","(6, 037)",Riverside County,Los Angeles County


In [23]:
non_bus.to_csv('most_popular_trips.csv')

In [24]:
non_bus.tail()

Unnamed: 0,origin_code,destination_code,n_trips,origin_state_county,destination_state_county,origin,destination
3554518,22033,48321,1178.0,"(22, 033)","(48, 321)",East Baton Rouge Parish,Matagorda County
5931392,36113,23011,1178.0,"(36, 113)","(23, 011)",Warren County,Kennebec County
2454965,18167,21185,1178.0,"(18, 167)","(21, 185)",Vigo County,Oldham County
2865027,20045,40071,1178.0,"(20, 045)","(40, 071)",Douglas County,Kay County
1308080,13059,47123,1178.0,"(13, 059)","(47, 123)",Clarke County,Monroe County


In [28]:

non_bus['destination'].value_counts()['Sevier County']

175

In [29]:
non_bus['destination'].value_counts()['Jefferson County']

2963

In [None]:
mask = (codes['State Code (FIPS)'] == 1) & (codes['County Code (FIPS)'] == 1)

In [None]:
int('001')

In [None]:
codes[mask]

In [None]:
non_bus.sort_values(by='n_trips', ascending=False)

In [None]:
# Find Most Popular Interstate Voyage
# TODO
county_code = [str(code)[-3:] for code in non_bus['origin']]
state_code = [str(code)[:2] for code in non_bus['origin'] if len(str(code)) > 4]


#non_bus.sort_values(by='n_trips', ascending=False).head(30)

In [None]:
for code in non_bus

In [None]:
len(state_code) == len(non_bus)

In [None]:
# Fips Code
# Origin FIPS (Note: The last 3 digits of the FIPS codes represent the county and the first 1 
# or 2 digits represent the state (the leading 0 has been removed for state FIPS codes 
# between 01 and 09). As an example, FIPS Code 1001 represents Autauga County, Alabama.

In [None]:
mask = (codes['State Code (FIPS)'] == 6) & (codes['County Code (FIPS)'] == 37)
codes[mask]

In [None]:
mask = (codes['State Code (FIPS)'] == 6) & (codes['County Code (FIPS)'] == 25)
codes[mask]['Area Name (including legal/statistical area description)'].iloc[0]

In [None]:
def convert_fips_city(fips_code):
    mask = (codes['State Code (FIPS)'] == 6) & (codes['County Code (FIPS)'] == 25)
    return codes[mask]['Area Name (including legal/statistical area description)'].iloc[0]

In [None]:
# The most common trip is between San Diego and LA :)

In [None]:
non_bus.head()

In [None]:
non_bus['origin'].value_counts().nlargest(5)

In [None]:
# Top Origins = Top Destinations
all(non_bus['destination'].value_counts().nlargest(25)) == all(non_bus['origin'].value_counts().nlargest(25))

In [None]:
non_bus['destination'].value_counts().nlargest(5).apply(convert_fips_city)

In [None]:
non_bus['destination'].value_counts()

In [None]:
non_bus.head()

In [None]:
non_bus.sort_values(by='n_trips', ascending=False).head(2)

In [None]:
tx_code = codes[codes['Area Name (including legal/statistical area description)'] == 'Texas']['State Code (FIPS)'].iloc[0]

In [None]:
tx_code

In [None]:
# TX Mask
tx_mask = [non_bus[non_bus['destination']]
non_bus.sort_values(by='n_trips', ascending=False)