# Amtrak Network Generation
#### Tanner Kogel tjk190000
##### Mech 6317.001: Semester Project

## Import needed libraries

In [51]:
import networkx as nx
import networkx.algorithms.community as nx_comm
from numpy import zeros, dot, array
import pickle
import matplotlib.pyplot as plt
import json
import string
import time
import itertools

## Create function to take in state name and return state acronym

In [52]:
def state_name_to_acronym(state_name, path='state_acronyms.txt'):
    """Look up the acronym of a state in a tab-separated lookup file.

    Every usable line of the file has the form ``state name<TAB>state acronym``;
    any further tab-separated fields on the line are ignored.

    Parameters
    ----------
    state_name : str
        State name to look up; compared exactly against the first field of each line.
    path : str, optional
        File to read the lookup table from (defaults to 'state_acronyms.txt').

    Returns
    -------
    str or None
        The acronym with surrounding whitespace (including the line's newline)
        removed, or None when no line matches `state_name`.
    """
    with open(path , 'r' , encoding='UTF-8') as fin:     # open file of state names \t state acronym
        for line in fin:                                 # loop over every line in the txt file
            state_info = line.rstrip('\r\n').split('\t') # split line by tabs, without the line ending
            if len(state_info) < 2:                      # blank or malformed line, nothing to return
                continue
            if state_name == state_info[0]:              # state names line up
                return state_info[1].strip()             # give state acronym without stray whitespace
    return None                                          # state name is not in the file

## Create function to extract the city of a train station in the form `city, state acronym`

In [53]:
def city_from_station(station):
    """Extract the ``city, state acronym`` part of a station label.

    Station labels look like ``city, state acronym - station name (CODE)`` or
    ``city, state acronym (CODE)``. The station name is separated from the city
    by `` - `` (hyphen surrounded by spaces), so a hyphen inside the city name
    itself (e.g. ``Winston-Salem, NC``) is kept.

    Parameters
    ----------
    station : str
        Full station label.

    Returns
    -------
    str
        The label up to the station name / station code, stripped of
        surrounding whitespace.
    """
    city_part = station.split(' - ', 1)[0]  # drop the " - station name" suffix, if any
    city_part = city_part.split('(', 1)[0]  # drop the "(CODE)" suffix, if any
    return city_part.strip()                # return in form city, state acronym

## Create function to convert a numeric string containing commas to an int or a float

In [54]:
def remove_commas(string,dataType):
    """Convert a numeric string that may contain thousands commas to a number.

    Parameters
    ----------
    string : str
        Text such as '1,234' or '1,234.5'.
    dataType : str
        'int' to get an int back, 'float' to get a float back.

    Returns
    -------
    int, float or None
        The converted number; None (after printing an error message) when
        `dataType` is neither 'int' nor 'float'.
    """
    cleaned = string.replace(",","") # the same text with every comma dropped
    if dataType == 'int':
        result = int(cleaned)
    elif dataType == 'float':
        result = float(cleaned)
    else:
        # unsupported target type: report it and fall through with no value
        print("Error: input desired dataType as 'int' or 'float'")
        result = None
    return result

## Initialize graphs to be generated

In [55]:
# Three independent, initially empty undirected graphs that the cells below fill in:
#   G_amtrak            - weighted network of train stations
#   G_induced_cities    - induced graph of cities with population above 100,000
#   G_noninduced_cities - non-induced graph of cities with population above 100,000
G_amtrak, G_induced_cities, G_noninduced_cities = nx.Graph(), nx.Graph(), nx.Graph()

## Read in city nodes with attributes from cities_from_wiki.txt

In [56]:
# cities_from_wiki.txt holds data gathered from Wikipedia on the 331 cities in the United States
# that were recorded as having a population over 100,000 in the 2020 Census.
# The data is organized into \t separated lines holding the information in the order:
# [0] pop. rank, [1] city name, [2] state name, [3] estimate of 2021 pop. based on the 2020 census,
# [4] pop. reported by the 2020 census, [5] percent population change, [6] 2020 land area, etc.

with open('cities_from_wiki.txt' , 'r' , encoding='UTF-8') as fin: # open file to read cities and attributes from
    for line in fin:                                               # loop over each line from the txt file
        if not line.strip():                                       # blank line holds no city
            continue
        city_attributes = line.split('\t')                         # split city attributes into groups

        # drop a trailing reference bracket such as "[c]" from the city name, whatever its length;
        # names without a bracket are left untouched
        city_name = city_attributes[1].split('[', 1)[0]

        city_area = city_attributes[6].split(' ')                  # leading token is the area in square miles
        city_area = remove_commas(city_area[0],'float')            # don't include units and remove commas

        state_name = city_attributes[2]
        state = state_name_to_acronym(state_name)                  # get state acronym
        if state is None:                                          # fail with a readable message instead of inside join()
            raise ValueError('no acronym found for state %r (city %r)' % (state_name, city_name))
        city = ', '.join((city_name, state))                       # final form of city node name is city, state acronym

        pop_int = remove_commas(city_attributes[4],'int')          # get population as an integer
        pop_change = city_attributes[5].replace("%","")            # remove percent sign from string
        pop_change = float(pop_change.replace("−","-"))            # Unicode minus sign -> ASCII hyphen so float() accepts it

        # add node with important attributes to both city networks
        node_attributes = dict(state=state_name, population=pop_int, population_change=pop_change, area=city_area)
        for graph in (G_induced_cities, G_noninduced_cities):
            graph.add_node(city, **node_attributes)

## Read in Amtrak Network from raw_amtrak.txt

In [57]:
# raw_amtrak.txt lists the train stations along each Amtrak route, one station per line;
# consecutive lines are treated as adjacent stops and an empty line is a break between routes

with open('raw_amtrak.txt' , 'r' , encoding='UTF-8') as fin: # open file to read train stations from
    amtrak_data = fin.read()                                 # get data read in as a string

amtrak_routes = amtrak_data.split('\n') # one entry per line: a station label, or '' between routes


def _last_matching_city(station, cities):
    """Return the last name in `cities` that occurs as a substring of `station`, or None."""
    matched = None
    for city in cities:
        if city in station:
            matched = city # keep scanning: when several cities match, the last one wins
    return matched


listed_cities = list(G_noninduced_cities.nodes())            # every city node, in insertion order
is_blank = [len(station) == 0 for station in amtrak_routes]  # True where the line is a gap between routes
# city node each station belongs to (None for gaps and for stations outside the listed cities);
# computed once so the loop below does not have to re-scan every city for every station
station_city = [None if blank else _last_matching_city(station, listed_cities)
                for station, blank in zip(amtrak_routes, is_blank)]
city_is_defined = [city is not None for city in station_city] # True where the station's city is listed

# input data to graphs
for idx in range(len(amtrak_routes) - 1): # the last line has no successor, so it starts no edge

    if is_blank[idx] or is_blank[idx+1]: # break between routes
        continue                         # move to next set of stations

    # input data to Amtrak graph
    s1 = amtrak_routes[idx]   # current station in list
    s2 = amtrak_routes[idx+1] # next station in list
    if G_amtrak.has_edge(s1,s2):          # stations are already connected
        G_amtrak[s1][s2]['weight'] += 1   # add additional weight to edge
    else:                                 # first time this pair is seen
        G_amtrak.add_edge(s1,s2,weight=1) # creating an edge adds nodes not in graph automatically

    # input data to non-induced city graph
    c1 = station_city[idx]
    if c1 is None: # current station is not in a listed city
        continue

    # walk forward along the current route to the next station that lies in a listed city;
    # stop at a route break or at the end of the data (the scan is bounded, so it cannot
    # index past the end of the list when no listed city follows)
    c2 = None
    for nxt in range(idx + 1, len(amtrak_routes)):
        if is_blank[nxt]:                 # route ended before another listed city was reached
            break
        if station_city[nxt] is not None: # next listed city on this route
            c2 = station_city[nxt]
            break
    if c2 is None: # no more listed cities on this route
        continue   # do not define any more edges

    if c1 == c2: # self-edge
        w = 2    # weight of 2
    else:        # non-self-edge
        w = 1    # weight of 1

    if G_noninduced_cities.has_edge(c1,c2):          # nodes already have an edge between them
        G_noninduced_cities[c1][c2]['weight'] += w   # add additional weight to edge
    else:                                            # no edge exists between these nodes
        G_noninduced_cities.add_edge(c1,c2,weight=w) # create edge of weight w

# output results
print('G_amtrak:')
print('\t',G_amtrak)
print('G_noninduced_cities:')
print('\t',G_noninduced_cities)

G_amtrak:
	 Graph with 530 nodes and 592 edges
G_noninduced_cities:
	 Graph with 331 nodes and 163 edges


## Map amtrak network to induced city network

In [58]:
# Collapse every station-to-station edge of the Amtrak network onto the cities at its two ends.
# Only edges whose two stations both lie in a listed city contribute to the induced graph.
induced_city_names = list(G_induced_cities.nodes()) # every city node, in insertion order

for station_1, station_2, amtrak_weight in G_amtrak.edges(data='weight'): # loop over every edge in the amtrak network

    # reset on every edge so a match from a previous edge can never leak into this one
    city_1 = None
    city_2 = None
    for city in induced_city_names: # loop over every city; the last matching city wins
        if city in station_1:       # station 1 exists in a defined city
            city_1 = city
        if city in station_2:       # station 2 exists in a defined city
            city_2 = city

    if city_1 is None or city_2 is None: # at least one end is outside the listed cities
        continue

    if city_1 == city_2:      # self-loop
        w = 2*amtrak_weight   # weight is doubled for a self-loop
    else:                     # not a self-loop
        w = amtrak_weight     # weight should be equivalent to amtrak network weight

    # both cities are nodes of the induced graph by construction, so only the edge needs checking
    if G_induced_cities.has_edge(city_1,city_2):          # nodes already have an edge between them
        G_induced_cities[city_1][city_2]['weight'] += w   # add additional weight to edge
    else:                                                 # no edge exists between these nodes
        G_induced_cities.add_edge(city_1,city_2,weight=w) # create edge of appropriate weight

# print results
print('G_induced_cities:')
print('\t',G_induced_cities)

G_induced_cities:
	 Graph with 331 nodes and 74 edges


In [59]:
# list every station followed, on the next line, by its neighbours and edge weights
for node in G_amtrak:
    print(f'{node}\n\t', G_amtrak[node])

Boston, MA - South Station (BOS)
	 {'Boston, MA - Back Bay Station (BBY)': {'weight': 3}, 'Rhinecliff, NY (RHI)': {'weight': 1}}
Boston, MA - Back Bay Station (BBY)
	 {'Boston, MA - South Station (BOS)': {'weight': 3}, 'Route 128, MA (RTE)': {'weight': 2}, 'Framingham, MA (FRA)': {'weight': 1}}
Route 128, MA (RTE)
	 {'Boston, MA - Back Bay Station (BBY)': {'weight': 2}, 'Providence, RI - Amtrak/MBTA Station (PVD)': {'weight': 2}}
Providence, RI - Amtrak/MBTA Station (PVD)
	 {'Route 128, MA (RTE)': {'weight': 2}, 'New Haven, CT - Union Station (NHV)': {'weight': 1}, 'Kingston, RI (KIN)': {'weight': 1}}
New Haven, CT - Union Station (NHV)
	 {'Providence, RI - Amtrak/MBTA Station (PVD)': {'weight': 1}, 'Stamford, CT (STM)': {'weight': 1}, 'New Haven, CT - State Street Station (STS)': {'weight': 3}, 'Bridgeport, CT (BRP)': {'weight': 2}, 'Wallingford, CT (WFD)': {'weight': 1}}
Stamford, CT (STM)
	 {'New Haven, CT - Union Station (NHV)': {'weight': 1}, 'New York, NY - Moynihan Train Hall (N

In [60]:
# list every city of the induced graph with its neighbours and edge weights
for node in G_induced_cities:
    print(f'{node}\t', G_induced_cities[node])

New York, NY	 {'Stamford, CT': {'weight': 2}, 'Newark, NJ': {'weight': 9}, 'Yonkers, NY': {'weight': 5}}
Los Angeles, CA	 {'Burbank, CA': {'weight': 1}, 'Glendale, CA': {'weight': 1}, 'Fullerton, CA': {'weight': 2}, 'Pomona, CA': {'weight': 2}}
Chicago, IL	 {'Naperville, IL': {'weight': 2}, 'South Bend, IN': {'weight': 2}, 'Joliet, IL': {'weight': 1}}
Houston, TX	 {'Beaumont, TX': {'weight': 1}, 'San Antonio, TX': {'weight': 1}}
Phoenix, AZ	 {}
Philadelphia, PA	 {'Philadelphia, PA': {'weight': 6}}
San Antonio, TX	 {'Houston, TX': {'weight': 1}}
San Diego, CA	 {'San Diego, CA': {'weight': 2}}
Dallas, TX	 {'Fort Worth, TX': {'weight': 1}}
San Jose, CA	 {'Oakland, CA': {'weight': 1}, 'Santa Clara, CA': {'weight': 1}, 'Salinas, CA': {'weight': 1}}
Austin, TX	 {}
Jacksonville, FL	 {}
Fort Worth, TX	 {'Dallas, TX': {'weight': 1}}
Columbus, OH	 {}
Indianapolis, IN	 {}
Charlotte, NC	 {}
San Francisco, CA	 {}
Seattle, WA	 {'Tacoma, WA': {'weight': 1}}
Denver, CO	 {}
Oklahoma City, OK	 {'Norman,

In [61]:
# list every city of the non-induced graph with its neighbours and edge weights
for node in G_noninduced_cities:
    print(f'{node}\t', G_noninduced_cities[node])

New York, NY	 {'Stamford, CT': {'weight': 3}, 'Newark, NJ': {'weight': 9}, 'Yonkers, NY': {'weight': 5}, 'Boston, MA': {'weight': 1}}
Los Angeles, CA	 {'Burbank, CA': {'weight': 1}, 'Glendale, CA': {'weight': 1}, 'Fullerton, CA': {'weight': 2}, 'Pomona, CA': {'weight': 2}}
Chicago, IL	 {'Naperville, IL': {'weight': 2}, 'South Bend, IN': {'weight': 2}, 'Indianapolis, IN': {'weight': 1}, 'Memphis, TN': {'weight': 1}, 'Milwaukee, WI': {'weight': 2}, 'Joliet, IL': {'weight': 3}, 'Lansing, MI': {'weight': 1}}
Houston, TX	 {'Beaumont, TX': {'weight': 1}, 'San Antonio, TX': {'weight': 1}}
Phoenix, AZ	 {}
Philadelphia, PA	 {'Newark, NJ': {'weight': 9}, 'Baltimore, MD': {'weight': 7}, 'Philadelphia, PA': {'weight': 6}, 'Pittsburgh, PA': {'weight': 1}}
San Antonio, TX	 {'Houston, TX': {'weight': 1}, 'El Paso, TX': {'weight': 2}, 'Austin, TX': {'weight': 1}}
San Diego, CA	 {'Oceanside, CA': {'weight': 1}, 'San Diego, CA': {'weight': 2}}
Dallas, TX	 {'Little Rock, AR': {'weight': 1}, 'Fort Worth, 

## Create GML Files for all created networks

In [62]:
# write each network to its own GML file, in the order the networks were introduced
for graph, gml_path in (
    (G_amtrak, "Amtrak_Network.gml"),
    (G_induced_cities, "Amtrak_Induced_Cities.gml"),
    (G_noninduced_cities, "Amtrak_Noninduced_Cities.gml"),
):
    nx.write_gml(graph, gml_path)