In [None]:
# pip install geopandas, geopy

In [2]:
import os
# Creating paths
in_folderpath = os.path.join("..", "in")

os.makedirs(in_folderpath, exist_ok=True)

In [5]:
import geopandas as gpd

# Loading the data into A geopandas dataframe
gdf = gpd.read_file(os.path.join(in_folderpath, "datacenters.geojson"))

# Filtering the dataframe to only include locations within the US
gdf_usa = gdf[gdf['country'] == 'USA']

In [6]:
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import json
from tqdm import tqdm


# Iterate through each row to check and update geometry
for i, row in tqdm(gdf_usa.iterrows()):

    if row['geometry'] is None or row['geometry'].is_empty:
        
        # Dynamically create a geolocator object with a unique user_agent for each row
        geolocator = Nominatim(user_agent=f"geocoding_address_{i}")
        
        # Creates an address for the geolocator
        address = f"{row['address']}, {row['postal']}, {row['city']}, {row['state']}, {row['country']}"
        location = geolocator.geocode(address)
        
        if location:
            # Primary location found, updating the geometry
            gdf_usa.at[i, 'geometry'] = Point(location.longitude, location.latitude)
            gdf_usa.at[i, 'latitude'] = location.latitude
            gdf_usa.at[i, 'longitude'] = location.longitude
        else:
            # Primary location not found, trying alternate address
            alt_address = f"{row['postal']}, {row['city']}, {row['state']}, {row['country']}"
            alt_location = geolocator.geocode(alt_address)

            if alt_location:
                # Alternate location found, updating the geometry
                gdf_usa.at[i, 'geometry'] = Point(alt_location.longitude, alt_location.latitude)
                gdf_usa.at[i, 'latitude'] = alt_location.latitude
                gdf_usa.at[i, 'longitude'] = alt_location.longitude
            else:
                # Neither primary nor alternate location found
                print("Geometry not found")


gdf_usa_filtered = gdf_usa[gdf_usa['geometry'].notna()]

# Convert GeoDataFrame to JSON and save
json_str = gdf_usa_filtered.to_json()

json_dict = json.loads(json_str)

with open(os.path.join(in_folderpath, "datacenters_usa.geojson"), 'w') as f:
    json.dump(json_dict, f, indent=2)

print("GeoJSON file has been formatted and saved.")

185it [00:05, 23.28it/s]

Geometry not found


1048it [00:39, 71.47it/s]

Geometry not found


1316it [00:48, 28.60it/s]

Geometry not found


1781it [01:19,  4.56it/s]

Geometry not found


2012it [01:32, 20.43it/s]

Geometry not found


2338it [01:57, 18.09it/s]

Geometry not found


2343it [01:59, 15.32it/s]

Geometry not found


2526it [02:04, 20.37it/s]


GeoJSON file has been formatted and saved.


In [71]:

import pandas as pd

state_temp = pd.read_csv(os.path.join(in_folderpath, "state_temp.csv"), skiprows=4)

state_temp.head()

Unnamed: 0,ID,Name,Value,Anomaly (1901-2000 base period),Rank,1901-2000 Mean
0,1,Alabama,65.4,2.3,127,63.1
1,2,Arizona,60.8,1.4,105,59.4
2,3,Arkansas,62.9,2.5,126,60.4
3,4,California,58.2,0.8,92,57.4
4,5,Colorado,45.9,1.3,101,44.6


In [72]:
state_temp.rename(columns={'Name' : 'State_Name', 'Value' : 'temperature'}, inplace=True)

state_temp = state_temp[['State_Name','temperature']]

state_temp.head()

Unnamed: 0,State_Name,temperature
0,Alabama,65.4
1,Arizona,60.8
2,Arkansas,62.9
3,California,58.2
4,Colorado,45.9


In [73]:

# Calculates the average of the annual high temperature and annual low temperature for Honolulu, Hawaii
avg_temp_hawaii = (84+71)/2

# Creates a data frame with the missing states and their average tempeatures 
missing_states = pd.DataFrame({
    'State_Name': ['Alaska', 'Washington, D.C', 'Hawaii'],
    'temperature': [28.4, 61.2, avg_temp_hawaii]
})

# Creates a boolean condition that checks if the State_Name column contains any of the missing states
states_found = state_temp['State_Name'].isin(['Alaska', 'Washington, D.C', 'Hawaii'])

# If none of the missing states are present in the column, the missing_states data frame and state_temp data frame are merged together  
if not states_found.any():

    state_temp = pd.concat([state_temp, missing_states])

# Sorts the data frame by state name, drops the original index and resets the new one from zero
state_temp = state_temp.sort_values(by='State_Name').reset_index(drop=True)


In [74]:
state_temp

Unnamed: 0,State_Name,temperature
0,Alabama,65.4
1,Alaska,28.4
2,Arizona,60.8
3,Arkansas,62.9
4,California,58.2
5,Colorado,45.9
6,Connecticut,52.2
7,Delaware,57.9
8,Florida,73.2
9,Georgia,65.6


## References 

### Nominatim

https://medium.com/@gopesh3652/geocoding-with-python-using-nominatim-a-beginners-guide-220b250ca48d 

https://geopy.readthedocs.io/en/stable/ 


### Temperatures

Alaska:
https://www.ncei.noaa.gov/access/monitoring/monthly-report/national/202313 

Hawaii:
https://www.usclimatedata.com/climate/honolulu/hawaii/united-states/ushi0026

Washington DC:
https://www.weather.gov/media/lwx/climate/dcatemps.pdf 
