<h1>Overnight Bus Line Generation Model by Timour Almakaev</h1>

<h5>Required Libraries</h5>

In [54]:
import pandas as pd
import numpy as np
from itertools import permutations
import folium
from folium import plugins

<h5>data cleaning and formating</h5>

In [3]:
cities = pd.read_csv('/home/scrappycoco/Documents/uscities.csv')
cities = cities.loc[cities['population']>99999]
nycBorough = cities[cities['city'].isin(['Staten Island','Brooklyn','Manhattan','Bronx','Queens'])]
cities.drop(nycBorough.index,inplace = True)
cities.drop(['source','military','incorporated','timezone','ranking','zips','id'], axis=1, inplace=True)
cities_sort = cities.sort_values(by=['population','density'], ascending=False)
city_names = cities['city']+', '+cities['state_name']
cords = cities['lat'].astype('string')+','+cities['lng'].astype('string')
cords = cities.loc[:,['lat','lng']]
cityWcords = pd.DataFrame({'id':cities['county_fips'],'city':city_names,'cords':cords.values.tolist()})
print(cities)
print(cities.columns)
print(cityWcords)

                city       city_ascii state_id  state_name  county_fips  \
0           New York         New York       NY    New York        36061   
1        Los Angeles      Los Angeles       CA  California         6037   
2            Chicago          Chicago       IL    Illinois        17031   
3              Miami            Miami       FL     Florida        12086   
4             Dallas           Dallas       TX       Texas        48113   
..               ...              ...      ...         ...          ...   
459      Idaho Falls      Idaho Falls       ID       Idaho        16019   
460          Holland          Holland       MI    Michigan        26139   
461  Charlottesville  Charlottesville       VA    Virginia        51540   
462         Longview         Longview       TX       Texas        48183   
463            Tracy            Tracy       CA  California         6077   

         county_name      lat       lng  population  density  
0           New York  40.6943  -73.9

<h5>city permutations</h5>

In [25]:
perm = pd.DataFrame(list(permutations(cityWcords.values, 2)),columns=['start','end'])
print('Size:',perm.shape)

Size: (210222, 2)


<h5>distances between combinations of cities (this will take a while to run, its caluclating 200,000 distances)</h5>

In [26]:
from geopy.distance import geodesic
perm['physical_dist'] = perm.apply(lambda row: geodesic(row.start[2],row.end[2]).miles,axis=1)
perm.head()

Unnamed: 0,start,end,physical_dist
0,"[36061, New York, New York, [40.6943, -73.9249]]","[6037, Los Angeles, California, [34.1139, -118...",2462.101443
1,"[36061, New York, New York, [40.6943, -73.9249]]","[17031, Chicago, Illinois, [41.8373, -87.6862]]",720.07388
2,"[36061, New York, New York, [40.6943, -73.9249]]","[12086, Miami, Florida, [25.7839, -80.2102]]",1089.337402
3,"[36061, New York, New York, [40.6943, -73.9249]]","[48113, Dallas, Texas, [32.7936, -96.7662]]",1374.411407
4,"[36061, New York, New York, [40.6943, -73.9249]]","[42101, Philadelphia, Pennsylvania, [40.0077, ...",79.481123


<h5>How it would be done with google maps api</h5>

In [None]:
#import googlemaps
#from datetime import datetime
#gmaps = googlemaps.Client(key='enter api key here')
#dt = datetime.strptime("06/11/22 20:30", "%d/%m/%y %H:%M")
#perm['googleapi_raw'] = perm.apply(lambda row: gmaps.distance_matrix(tuple(row.start[1]), tuple(row.end[1]), mode='driving', departure_time=dt),axis=1)
#perm['time'] = perm.apply(lambda row: row.googleapi_raw['rows'][0]['elements'][0]['duration']['value']/3600, axis=1)

<h5>dropping duplicate combinations</h5>

In [24]:
perm.drop_duplicates(subset=['physical_dist'],inplace=True)
print('Size after dropping:',perm.shape)

Size after dropping: (105111, 4)


<h5>solving average travel time between cities</h5>
Since most travel for a overnight bus is during the night and travel is mostly done on highways,
I am assuming there is 0 traffic and the speed is 65 miles per hour on average.

Note: time is in hours

In [27]:
perm['time'] = perm['physical_dist']/65
perm.head()

Unnamed: 0,start,end,physical_dist,time
0,"[36061, New York, New York, [40.6943, -73.9249]]","[6037, Los Angeles, California, [34.1139, -118...",2462.101443,37.878484
1,"[36061, New York, New York, [40.6943, -73.9249]]","[17031, Chicago, Illinois, [41.8373, -87.6862]]",720.07388,11.07806
2,"[36061, New York, New York, [40.6943, -73.9249]]","[12086, Miami, Florida, [25.7839, -80.2102]]",1089.337402,16.759037
3,"[36061, New York, New York, [40.6943, -73.9249]]","[48113, Dallas, Texas, [32.7936, -96.7662]]",1374.411407,21.144791
4,"[36061, New York, New York, [40.6943, -73.9249]]","[42101, Philadelphia, Pennsylvania, [40.0077, ...",79.481123,1.222787


<h5>reorganizing</h5>

In [28]:
combos = perm
split1 = pd.DataFrame(combos['start'].to_list(), columns = ['city1id', 'city1name', 'city1location'])
split2 = pd.DataFrame(combos['end'].to_list(), columns = ['city2id', 'city2name', 'city2location'])
combos1 = split1.merge(split2,left_index=True,right_index=True)
combos1['time'] = perm['time']
combos1['phys_dist'] = perm['physical_dist']
#Taking out values that don't fit within 6-12 hour travel time
combos1 = combos1.loc[(combos1['time']<12) & (combos1['time']>6)]
combos1 = combos1.merge(cities[['population', 'density']], left_on = ['city1id'], right_on = cities['county_fips'], how = 'inner')
combos1.rename(columns = {'population':'city1pop','density':'city1density'}, inplace = True)
combos1 = combos1.merge(cities[['population', 'density']], left_on = ['city2id'], right_on = cities['county_fips'], how = 'inner')
combos1.rename(columns = {'population':'city2pop','density':'city2density'}, inplace = True)
combos1 = combos1.drop_duplicates(subset = ['city1id', 'city2id']).reset_index(drop = True)
combos1.head()

Unnamed: 0,city1id,city1name,city1location,city2id,city2name,city2location,time,phys_dist,city1pop,city1density,city2pop,city2density
0,36061,"New York, New York","[40.6943, -73.9249]",17031,"Chicago, Illinois","[41.8373, -87.6862]",11.07806,720.07388,18713220,10715,8604203,4574
1,42101,"Philadelphia, Pennsylvania","[40.0077, -75.1339]",17031,"Chicago, Illinois","[41.8373, -87.6862]",10.282587,668.368167,5649300,4554,8604203,4574
2,13121,"Atlanta, Georgia","[33.7627, -84.4224]",17031,"Chicago, Illinois","[41.8373, -87.6862]",8.995393,584.700527,5449398,1441,8604203,4574
3,11001,"Washington, District of Columbia","[38.9047, -77.0163]",17031,"Chicago, Illinois","[41.8373, -87.6862]",9.197382,597.829842,5379184,4457,8604203,4574
4,24510,"Baltimore, Maryland","[39.3051, -76.6144]",17031,"Chicago, Illinois","[41.8373, -87.6862]",9.348979,607.683637,2106068,2830,8604203,4574


<h5>The Model</h5>
Due to time constraints, I was not able to get much data that is usable so I sticked with what I got,
but this is easily scaleable to include more data sets to improve the quality of the selections.

Anything above the 40 percentile of populations
Anything above the 80 percentile of population density

I set the populations threshold to be anything above 40% of the data where the population of any city above 100k population.
The reasoning behind this is because even cities with lower populations have great public transit and use it a lot. An example would be Rostock, Germany with a great tram system, bus system, and even light-rail line for a city of 209k. 
My reasoning to the 80 percentile is density is required for good public transit and if the transit isn't good, people will just use their car and go to the airport.

In [31]:
population_thresh = combos1['city1pop'].quantile(0.4)
density_thresh = combos1['city1density'].quantile(0.8)
print('Population Threshold:',population_thresh)
print('Density Threshold:',density_thresh)

Population Threshold: 192315.0
Density Threshold: 1635.0


<h4>The final combinations</h4>

In [32]:
combos_final = combos1.loc[(combos1['city1pop']>=population_thresh1) & (combos1['city2pop']>=population_thresh2) & (combos1['city1density']>=density_thresh1) & (combos1['city2density']>=density_thresh2)]
combos_final = combos_final.drop_duplicates(subset = ['city1id', 'city2id']).reset_index(drop = True)
print(combos_final.head())

   city1id                         city1name        city1location  city2id  \
0    36061                New York, New York  [40.6943, -73.9249]    17031   
1    42101        Philadelphia, Pennsylvania  [40.0077, -75.1339]    17031   
2    11001  Washington, District of Columbia  [38.9047, -77.0163]    17031   
3    24510               Baltimore, Maryland  [39.3051, -76.6144]    17031   
4    42003          Pittsburgh, Pennsylvania  [40.4396, -79.9762]    17031   

           city2name        city2location       time   phys_dist  city1pop  \
0  Chicago, Illinois  [41.8373, -87.6862]  11.078060  720.073880  18713220   
1  Chicago, Illinois  [41.8373, -87.6862]  10.282587  668.368167   5649300   
2  Chicago, Illinois  [41.8373, -87.6862]   9.197382  597.829842   5379184   
3  Chicago, Illinois  [41.8373, -87.6862]   9.348979  607.683637   2106068   
4  Chicago, Illinois  [41.8373, -87.6862]   6.361087  413.470664   1703266   

   city1density  city2pop  city2density  
0         10715   86

<h5>Map Generation</h5>

In [65]:
bus_map = folium.Map(location=[40,-98],zoom_start=4)
for city in combos_final[['city1name','city1location']].drop_duplicates(subset='city1name').values.tolist():
    folium.Marker(city[1], tooltip=city[0]).add_to(bus_map)
vectors = combos_final[['city1location','city2location']]
print(vectors.head())
for line in vectors.values.tolist():
    plugins.AntPath([line[0],line[1]]).add_to(bus_map)
bus_map

         city1location        city2location
0  [40.6943, -73.9249]  [41.8373, -87.6862]
1  [40.0077, -75.1339]  [41.8373, -87.6862]
2  [38.9047, -77.0163]  [41.8373, -87.6862]
3  [39.3051, -76.6144]  [41.8373, -87.6862]
4  [40.4396, -79.9762]  [41.8373, -87.6862]
