In [1]:
import json
from shapely.geometry import Point, Polygon, MultiPolygon
import pandas as pd
from shapely.ops import transform
from pyproj import Transformer
from fuzzywuzzy import process
import re
import numpy as np



In [2]:
# --- Load reference inputs ---------------------------------------------------
# State lookup keyed by FIPS code; each entry carries "name" and "abbreviation".
with open('input_data/state_info.json', 'r') as f:
    state_fips_to_name = json.load(f)

# County boundaries as a GeoJSON-style dict with a 'features' list.
with open('input_data/county_boundary.json', 'r') as file:
    county_boundary_data = json.load(file)

# Leave out every feature whose STATEFP is '09'; the GADM file loaded below
# supplies the Connecticut records instead.
filtered_county_boundary_data = []
for feature in county_boundary_data['features']:
    if feature['properties']['STATEFP'] == '09':
        continue
    filtered_county_boundary_data.append(feature)

with open('input_data/gadm41_USA_2.json', 'r') as file:
    ct_boundary_data = json.load(file)

def find_state_fips(state_name, state_fips_to_name):
    """Return the FIPS key whose entry's 'name' matches ``state_name``.

    The comparison ignores letter case and space characters.

    Args:
        state_name: State name to look up.
        state_fips_to_name: Mapping of FIPS code -> dict with a 'name' key.

    Returns:
        The first matching FIPS key in mapping order, or None if none matches.
    """
    def _squash(text):
        return text.lower().replace(" ", "")

    matching_codes = (
        code
        for code, details in state_fips_to_name.items()
        if _squash(details['name']) == _squash(state_name)
    )
    return next(matching_codes, None)


# Projection used only to measure county areas.
# The previous target, EPSG:3097, is "JGD2000 / UTM zone 51N" in the EPSG
# registry: a transverse-Mercator zone centred on 123°E for Japan. Projecting
# US counties into it gives meaningless areas. EPSG:5070 (NAD83 / Conus Albers)
# is an equal-area projection defined for the United States, so polygon areas
# in it are true areas in square metres.
AREA_CRS = "epsg:5070"
transformer = Transformer.from_crs("epsg:4326", AREA_CRS, always_xy=True)


def _geometry_to_boundary(geometry):
    """Build a shapely geometry from a GeoJSON-style geometry dict.

    Handles both 'Polygon' and 'MultiPolygon' so that every feature gets its
    own geometry instead of reusing one left over from an earlier feature.

    Args:
        geometry: dict with 'type' and 'coordinates' keys.

    Returns:
        A shapely Polygon or MultiPolygon.

    Raises:
        ValueError: if the geometry type is neither Polygon nor MultiPolygon.
    """
    geometry_type = geometry['type']
    coordinates = geometry['coordinates']
    if geometry_type == 'Polygon':
        # First ring is the exterior, any remaining rings are holes.
        return Polygon(coordinates[0], coordinates[1:])
    if geometry_type == 'MultiPolygon':
        return MultiPolygon([Polygon(rings[0], rings[1:]) for rings in coordinates])
    raise ValueError(f"Unsupported geometry type: {geometry_type!r}")


def _county_record(state_name, state_abbr, state_fips, county_name, geometry):
    """Return one county row: names, boundary, lon/lat centroid and area in km²."""
    boundary = _geometry_to_boundary(geometry)
    centroid = boundary.centroid  # centroid of the unprojected lon/lat geometry
    projected_boundary = transform(transformer.transform, boundary)
    return {
        'State Name': state_name,
        'State Abbr': state_abbr,
        'State Fips': state_fips,
        'County Name': county_name,
        'Boundary': boundary,
        'Latitude': centroid.y,
        'Longitude': centroid.x,
        'Area': projected_boundary.area / 1e6,  # m² -> km²
    }


county_data = []

# Counties from the main boundary file (features with STATEFP '09' were filtered out).
for feature in filtered_county_boundary_data:
    properties = feature['properties']
    state_fips = properties['STATEFP']
    try:
        state_name = state_fips_to_name[state_fips]["name"]
        state_abbr = state_fips_to_name[state_fips]["abbreviation"]
    except KeyError:
        # FIPS code absent from the state lookup: skip the feature.
        continue
    county_data.append(
        _county_record(state_name, state_abbr, state_fips,
                       properties['NAMELSAD'], feature['geometry'])
    )

# Connecticut units come from the GADM level-2 file.
for feature in ct_boundary_data['features']:
    properties = feature['properties']
    state_name = properties['NAME_1']
    if state_name.lower() != "connecticut":
        continue
    state_fips = find_state_fips(state_name, state_fips_to_name)
    state_abbr = state_fips_to_name[state_fips]["abbreviation"]
    county_data.append(
        _county_record(state_name, state_abbr, state_fips,
                       properties['NAME_2'], feature['geometry'])
    )

# Administrative-unit designations stripped from names before fuzzy matching.
suffixes_to_remove = ['county', 'census area', 'city and borough', 'borough', 'municipality', 'parish']


def remove_suffix(name, suffixes):
    """Delete every whole-word occurrence of any of ``suffixes`` from ``name``.

    Matching is case-sensitive and bounded by regex word boundaries, so a
    suffix embedded in a longer word is left alone. The result has leading
    and trailing whitespace stripped.

    Args:
        name: Text to clean.
        suffixes: Iterable of literal phrases to remove.

    Returns:
        The cleaned string.
    """
    alternation = '|'.join(re.escape(term) for term in suffixes)
    matcher = re.compile(r'\b(?:' + alternation + r')\b')
    return matcher.sub('', name).strip()



# Convert the list of county records to a DataFrame and derive the join key:
# lower-cased, trimmed county name with the designation words removed.
county_data = pd.DataFrame(county_data)
county_data['County Name_clean'] = (
    county_data['County Name']
    .str.lower()
    .str.strip()
    .apply(remove_suffix, suffixes=suffixes_to_remove)
)


def match_county(row):
    """Fuzzy-match one economy row to a cleaned county name in the same state.

    Candidates are the 'County Name_clean' values of ``county_data`` rows whose
    'State Abbr' equals ``row['state_abbr']``.

    Args:
        row: Mapping/Series with 'state_abbr' and 'GeoName_clean'.

    Returns:
        The best candidate returned by ``process.extractOne`` with
        ``score_cutoff=80``, or None when it returns nothing.
    """
    in_same_state = county_data['State Abbr'] == row['state_abbr']
    candidates = county_data.loc[in_same_state, 'County Name_clean'].tolist()
    best = process.extractOne(row['GeoName_clean'], candidates, score_cutoff=80)
    return best[0] if best else None
    
# Economy table; the columns read below are 'GeoName', 'state_abbr' and 'id'.
economy_df = pd.read_excel('input_data/usa_economy.xlsx')

# regex=False makes the parentheses literal text. Without it, pandas versions
# that default to regex=True treat "(Independent City)" as a capture group and
# leave the parentheses behind ("X (City)"). The next line already passes
# regex=False explicitly.
economy_df['GeoName'] = economy_df['GeoName'].str.replace('(Independent City)', 'City', regex=False)
economy_df['state_abbr'] = economy_df['state_abbr'].str.strip().str.replace('*', '', regex=False)

# Same normalisation as the county side: lower-case, trim, drop designations.
economy_df['GeoName_clean'] = (
    economy_df['GeoName']
    .str.lower()
    .str.strip()
    .apply(remove_suffix, suffixes=suffixes_to_remove)
)

# Fuzzy-match each economy row to a county of the same state (None if no match).
economy_df['Matched County Name'] = economy_df.apply(match_county, axis=1)

# Left join keeps every economy row; unmatched rows get NaN location and area.
adm2_data_df = pd.merge(
    economy_df,
    county_data[['State Abbr', 'County Name_clean', 'Latitude', 'Longitude', 'Area']],
    left_on=['state_abbr', 'Matched County Name'],
    right_on=['State Abbr', 'County Name_clean'],
    how='left'
)

adm2_data_df[['id', 'Latitude', 'Longitude', 'Area']].to_csv('data/usa_adm2_locations_area.csv', index=False)

# --- Transport inputs: airports (CSV), ferry terminals and railway stations (GeoJSON)
file_path = 'input_data/usa-airports.csv'
airport_data = pd.read_csv(file_path)

with open('input_data/usa_ferry_terminals.geojson', 'r', encoding='utf-8') as file:
    ferry_data = json.load(file)

with open('input_data/usa_railway_station.geojson', 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# Drop stations whose 'station' property is 'subway' or 'light_rail'.
kept_stations = []
for station in train_data['features']:
    if station['properties'].get('station') in ['subway', 'light_rail']:
        continue
    kept_stations.append(station)
train_data['features'] = kept_stations

# Keep only medium and large airports.
is_wanted_airport = airport_data['type'].isin(['medium_airport', 'large_airport'])
airport_data = airport_data.loc[is_wanted_airport]

def find_city_by_coordinates(lat, lon):
    """Locate the county whose boundary contains the point (lon, lat).

    Scans ``county_data`` row by row and stops at the first row whose
    'Boundary' geometry contains the point.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        Tuple ``(state name, state abbreviation, county name)`` of the first
        containing county, or ``(None, None, None)`` if no boundary contains it.
    """
    location = Point(lon, lat)
    containing_rows = (
        record
        for _, record in county_data.iterrows()
        if record['Boundary'].contains(location)
    )
    hit = next(containing_rows, None)
    if hit is None:
        return None, None, None
    return hit['State Name'], hit['State Abbr'], hit['County Name']


# One entry per county, keyed by
# (lower-cased state name, state abbreviation, lower-cased county name).
# Every flag starts at 0 and is switched to 1 when a facility falls inside the county.
city_status_dict = {
    (row['State Name'].lower(), row['State Abbr'], row['County Name'].lower()): {'Has Airport': 0, 'Has Ferry Terminal': 0, 'Has Train Station': 0}
    for _, row in county_data.iterrows()
}


def _flag_county(latitude, longitude, flag):
    """Set ``flag`` to 1 for the county containing the point, if there is one."""
    state_name, state_abbr, city_name = find_city_by_coordinates(latitude, longitude)
    if state_name and city_name:
        key = (state_name.lower(), state_abbr, city_name.lower())
        if key in city_status_dict:
            city_status_dict[key][flag] = 1


def _flag_geojson_points(features, flag):
    """Flag the county of each feature; coordinates are read as [longitude, latitude]."""
    for feature in features:
        coordinates = feature['geometry']['coordinates']
        _flag_county(coordinates[1], coordinates[0], flag)


for latitude, longitude in zip(airport_data['latitude_deg'], airport_data['longitude_deg']):
    _flag_county(latitude, longitude, 'Has Airport')

_flag_geojson_points(ferry_data['features'], 'Has Ferry Terminal')
_flag_geojson_points(train_data['features'], 'Has Train Station')

# The tuple keys become three index levels; reset_index turns them into
# columns, which are then given readable names.
county_status_df = pd.DataFrame.from_dict(city_status_dict, orient='index').reset_index()
county_status_df.columns = ['State Name','State Abbr', 'County Name', 'Has Airport', 'Has Ferry Terminal', 'Has Train Station']

# Keys were lower-cased above; restore capitalisation with str.title().
county_status_df['State Name'] = county_status_df['State Name'].str.title()
county_status_df['County Name'] = county_status_df['County Name'].str.title()

output_file_path = 'data/USA_transportation.csv'
county_status_df.to_csv(output_file_path, index=False)

print(f"Data has been exported to {output_file_path}.")

Data has been exported to data/USA_transportation.csv.


In [3]:
import json  
from geopy.distance import geodesic 
from tqdm import tqdm 
import pandas as pd 

# Centroids exported by the earlier cell, flattened to a list of row dicts.
adm2_data = pd.read_csv('data/usa_adm2_locations_area.csv')
adm2_data = adm2_data.to_dict(orient='records')

# distances[a][b]: geodesic distance in km. Each unordered pair is stored once,
# under whichever unit appears first in the file.
distances = {}

for i, place1 in tqdm(enumerate(adm2_data), total=len(adm2_data)):
    origin = (place1['Latitude'], place1['Longitude'])
    from_origin = {}
    distances[f"{place1['id']}"] = from_origin
    # Only later rows are visited: earlier pairs were already computed.
    for place2 in adm2_data[i+1:]:
        target = (place2['Latitude'], place2['Longitude'])
        from_origin[f"{place2['id']}"] = geodesic(origin, target).km

# One tab-separated line per pair: id1, id2, distance rounded to 2 decimals.
with open('data/usa_adm2_distances.txt', 'w') as f:
    f.writelines(
        f"{first_id}\t{second_id}\t{km:.2f}\n"
        for first_id, neighbours in distances.items()
        for second_id, km in neighbours.items()
    )

100%|██████████| 3114/3114 [06:35<00:00,  7.87it/s] 


In [4]:
import pandas as pd
from fuzzywuzzy import process
import re
import numpy as np

economy_df = pd.read_excel('input_data/usa_economy.xlsx')
county_status_df = pd.read_csv('data/USA_transportation.csv')

# regex=False makes the parentheses literal text. Without it, pandas versions
# that default to regex=True treat "(Independent City)" as a capture group and
# leave the parentheses behind. The other str.replace calls here already pass
# regex=False.
economy_df['GeoName'] = economy_df['GeoName'].str.replace('(Independent City)', 'City', regex=False)
# Remove thousands separators before converting population to float.
economy_df['us-popu'] = economy_df['us-popu'].replace(',', '', regex=True).astype(float)
economy_df['state_abbr'] = economy_df['state_abbr'].str.strip().str.replace('*', '', regex=False)

county_status_df['State Abbr'] = county_status_df['State Abbr'].str.strip().str.replace('*', '', regex=False)

# Lower-cased, trimmed names used as the basis for fuzzy matching.
economy_df['GeoName_clean'] = economy_df['GeoName'].str.lower().str.strip()
county_status_df['County Name_clean'] = county_status_df['County Name'].str.lower().str.strip()

# Administrative-unit designations stripped from names before matching.
suffixes_to_remove = ['county', 'census area', 'city and borough', 'borough', 'municipality', 'parish']

def remove_suffix(name, suffixes):
    """Delete every whole-word occurrence of any of ``suffixes`` from ``name``.

    Matching is case-sensitive and bounded by regex word boundaries; the
    result is stripped of leading and trailing whitespace.

    Args:
        name: Text to clean.
        suffixes: Iterable of literal phrases to remove.

    Returns:
        The cleaned string.
    """
    alternation = '|'.join(re.escape(term) for term in suffixes)
    matcher = re.compile(r'\b(?:' + alternation + r')\b')
    return matcher.sub('', name).strip()

# Strip the designation words from both name columns so they are comparable.
for frame, column in ((economy_df, 'GeoName_clean'), (county_status_df, 'County Name_clean')):
    frame[column] = frame[column].apply(remove_suffix, suffixes=suffixes_to_remove)


# Placeholder column; it is filled once match_county has been applied.
economy_df['Matched County Name'] = None

def match_county(row):
    """Fuzzy-match one economy row to a cleaned county name in the same state.

    Candidates are the 'County Name_clean' values of ``county_status_df`` rows
    whose 'State Abbr' equals ``row['state_abbr']``.

    Args:
        row: Mapping/Series with 'state_abbr' and 'GeoName_clean'.

    Returns:
        The best candidate returned by ``process.extractOne`` with
        ``score_cutoff=80``, or None when it returns nothing.
    """
    in_same_state = county_status_df['State Abbr'] == row['state_abbr']
    candidates = county_status_df.loc[in_same_state, 'County Name_clean'].tolist()
    best = process.extractOne(row['GeoName_clean'], candidates, score_cutoff=80)
    return best[0] if best else None

economy_df['Matched County Name'] = economy_df.apply(match_county, axis=1)

# Left join: every economy row is kept; rows without a match get NaN flags.
status_columns = ['State Name','State Abbr', 'County Name', 'County Name_clean', 'Has Airport', 'Has Ferry Terminal', 'Has Train Station']
merged_df = economy_df.merge(
    county_status_df[status_columns],
    how='left',
    left_on=['state_abbr', 'Matched County Name'],
    right_on=['State Abbr', 'County Name_clean'],
)

# NOTE(review): final_columns is built but never used in this cell; the export
# below writes every merged column.
final_columns = ['GeoName','State Name', 'state_abbr', 'County Name', 'Has Airport', 'Has Ferry Terminal', 'Has Train Station'] + economy_df.columns.tolist()

merged_df.to_excel('data/usa_transportation_economy.xlsx', index=False)

In [5]:
import pandas as pd

usa_transportation_economy_df = pd.read_excel('data/usa_transportation_economy.xlsx')
usa_attractions_df = pd.read_excel('input_data/usa_attractions.xlsx')

# Rating-count columns are named after the score ('5' ... '1'); values that
# cannot be parsed as numbers become NaN.
score_columns = ['5', '4', '3', '2', '1']
for col in score_columns:
    usa_attractions_df[col] = pd.to_numeric(usa_attractions_df[col], errors='coerce')

# Per (state, location): number of attraction rows, total reviews, and the
# summed count for each score.
grouped_attractions = (
    usa_attractions_df
    .groupby(['Matched_State', 'Matched_Location'])
    .agg(
        total_attractions=('Matched_Location', 'size'),
        total_reviews=('reviews', 'sum'),
        five_score=('5', 'sum'),
        four_score=('4', 'sum'),
        three_score=('3', 'sum'),
        two_score=('2', 'sum'),
        one_score=('1', 'sum'),
    )
    .reset_index()
)

# Left join keeps every unit; units without attractions get NaN aggregates.
merged_df = usa_transportation_economy_df.merge(
    grouped_attractions,
    how='left',
    left_on=['State Name', 'County Name'],
    right_on=['Matched_State', 'Matched_Location'],
)

# Remove the join keys and helper columns before exporting.
helper_columns = [
    'Matched_State', 'Matched_Location', 'GeoName_clean', 'Matched County Name',
    'State Name', 'State Abbr', 'County Name', 'County Name_clean',
]
merged_df = merged_df.drop(columns=helper_columns)

merged_df.to_excel('data/usa_transportation_economy_attractions.xlsx', index=False)

In [6]:
import pandas as pd
import numpy as np

# id -> area lookup from the location/area export.
adm2_data_locations_area = pd.read_csv('data/usa_adm2_locations_area.csv')
adm2_area = dict(zip(adm2_data_locations_area['id'], adm2_data_locations_area['Area']))

adm2_transportation_economy_attractions = pd.read_excel('data/usa_transportation_economy_attractions.xlsx')
tea = adm2_transportation_economy_attractions  # short alias for the same DataFrame object

# Units with no attraction data count as zero.
attraction_columns = ['total_attractions', 'total_reviews', 'five_score', 'four_score', 'three_score', 'two_score', 'one_score']
tea[attraction_columns] = tea[attraction_columns].fillna(0)

# Weighted sum of rating counts with weights 10 / 8 / 4 / 2 / 1.
tea['tourism_quality'] = (
    10 * tea['five_score'] +
    8 * tea['four_score'] +
    4 * tea['three_score'] +
    2 * tea['two_score'] +
    1 * tea['one_score']
)

# Remove thousands separators, then convert to float.
for money_or_people in ('us-popu', 'us-gdp'):
    tea[money_or_people] = tea[money_or_people].replace(',', '', regex=True).astype(float)

# id -> value lookups used by the scoring cells.
adm2_population = dict(zip(tea['id'], tea['us-popu']))
adm2_income = dict(zip(tea['id'], tea['us-gdp']))
adm2_tourism_quality = dict(zip(tea['id'], tea['tourism_quality']))

# Rebuild a symmetric distance lookup from the pair file (one line per
# unordered pair): adm2_distances[a][b] == adm2_distances[b][a].
adm2_distances = {}
with open('data/usa_adm2_distances.txt', 'r') as f:
    for line in f:
        first_field, second_field, km_field = line.strip().split('\t')
        place1 = int(first_field)
        place2 = int(second_field)
        dist = float(km_field)
        adm2_distances.setdefault(place1, {})
        adm2_distances.setdefault(place2, {})
        adm2_distances[place1][place2] = dist
        adm2_distances[place2][place1] = dist

In [7]:
def _write_gravity_scores(path, weights):
    """Write one gravity-style score per unit to a tab-separated file.

    For each unit ``i`` in ``adm2_distances`` that has a weight, the score is
    the sum over its neighbours ``j`` of ``(w_i * w_j) / d_ij``. Neighbours
    without a weight, or at a distance that is not positive, are skipped.

    Args:
        path: Output file; each line is ``id<TAB>score`` with 2 decimals.
        weights: Mapping of unit id -> weight (e.g. population).
    """
    with open(path, 'w') as f:
        for place1, dist_dict in adm2_distances.items():
            if place1 not in weights:
                continue
            weight1 = weights[place1]
            total_score = 0
            for place2, dist in dist_dict.items():
                if place2 in weights and dist > 0:
                    total_score += (weight1 * weights[place2]) / dist
            f.write(f"{place1}\t{total_score:.2f}\n")


# The same formula is used twice, once with population and once with tourism quality.
_write_gravity_scores('data/usa_population_attraction_scores.txt', adm2_population)
_write_gravity_scores('data/usa_tourism_attraction_scores.txt', adm2_tourism_quality)
            
def self_potential(GDP_i, area_i):
    """Return a unit's own contribution to its market potential.

    Computes ``GDP_i / ((2/3) * sqrt(area_i / pi))``, i.e. GDP divided by two
    thirds of the radius of a circle whose area equals ``area_i``.

    Args:
        GDP_i: GDP of the unit.
        area_i: Area of the unit.

    Returns:
        The self-potential term.
    """
    equivalent_radius = np.sqrt(area_i / np.pi)
    return GDP_i / ((2 / 3) * equivalent_radius)


# Harris market potential per unit: sum over the other units of GDP_j / d_ij,
# plus the unit's own self-potential term.
HMP = {}
for place1 in adm2_income:
    GDP_i = adm2_income[place1]
    area_i = adm2_area[place1]

    potential_sum = 0
    for place2, dist in adm2_distances.get(place1, {}).items():
        if place1 == place2:
            continue
        GDP_j = adm2_income.get(place2)
        # Explicit guards instead of a bare `except:`. A pair is skipped (and
        # reported) only when the neighbour has no GDP entry or the stored
        # distance is not positive, matching the `dist > 0` rule used for the
        # other scores. Any other error now surfaces instead of being swallowed.
        if GDP_j is None or dist <= 0:
            print(place1, place2, dist)
            continue
        potential_sum += GDP_j / dist

    self_term = self_potential(GDP_i, area_i)

    HMP[place1] = potential_sum + self_term

# Two unnamed columns (id, potential), written tab-separated without a header.
HMP_df = pd.DataFrame(list(HMP.items()))
HMP_df.to_csv('data/usa_harris_market_potential.txt', sep='\t', index=False, header=False)

24005 24510 0.0
24510 24005 0.0
29189 29510 0.0
29510 29189 0.0
51159 51760 0.0
51650 51949 0.0
51760 51159 0.0
51770 51944 0.0
51944 51770 0.0
51949 51650 0.0


In [8]:
# Read back the three score files; each has two tab-separated columns and no header.
population_attraction_scores = pd.read_csv('data/usa_population_attraction_scores.txt', sep='\t', header=None, names=['id', 'population_attraction_score'])
tourism_attraction_scores = pd.read_csv('data/usa_tourism_attraction_scores.txt', sep='\t', header=None, names=['id', 'tourism_attraction_scores'])
harris_market_potential = pd.read_csv('data/usa_harris_market_potential.txt', sep='\t', header=None, names=['id', 'harris_market_potential'])

# Left-join each table onto the economy/attractions frame by 'id', in this order.
merged_data = adm2_transportation_economy_attractions
for extra_table in (
    population_attraction_scores,
    tourism_attraction_scores,
    harris_market_potential,
    adm2_data_locations_area,
):
    merged_data = pd.merge(merged_data, extra_table, on='id', how='left')

merged_data.to_csv('data/final_scores.csv', index=False)