In [None]:
# pip install geopandas, geopy, opencage, ipywidgets (for tqdm)

In [30]:
import os
# Creating paths
in_folderpath = os.path.join("..", "in")

os.makedirs(in_folderpath, exist_ok=True)

In [31]:
import geopandas as gpd

# Loading the data into A geopandas dataframe
gdf = gpd.read_file(os.path.join(in_folderpath, "datacenters.geojson"))

# Filtering the dataframe to only include locations within the US
gdf_usa = gdf[gdf['country'] == 'USA']

print(len(gdf_usa))

2526


## Correct way of using the geolocator

In [29]:

from opencage.geocoder import OpenCageGeocode
from shapely.geometry import Point
from tqdm import tqdm

api_key = '4176360a8df2453abab8cb3341a7be21'

geolocator = OpenCageGeocode(api_key)

# Iterate through each row to check and update geometry
for i, row in tqdm(gdf_usa.iterrows(), desc="Geocoding Addresses"):

    if row['geometry'] is None or row['geometry'].is_empty:
        
        # Creates an address for the geolocator
        address = f"{row['address']}, {row['postal']}, {row['city']}, {row['state']}, {row['country']}"
        
        # Returns a list of dictionaries with spatial information about the data center 
        location = geolocator.geocode(address)
        
        if location:

            # Retrieves the geometry from the list of dictionaries
            geometry = location[0]['geometry']

            # Adds the retrived latitude and longditude to each row in data frame
            gdf_usa.at[i, 'geometry'] = Point(geometry['lng'], geometry['lat'])
            gdf_usa.at[i, 'latitude'] = geometry['lat']
            gdf_usa.at[i, 'longitude'] = geometry['lng']
        else:
            print("Geometry not found")


gdf_usa_filtered = gdf_usa[gdf_usa['geometry'].notna()]

print(len(gdf_usa_filtered))

csv_path = os.path.join(in_folderpath, "datacenters_usa.csv")

gdf_usa_filtered.to_csv(csv_path, index=False)

print("GeoJSON file has been formatted and saved.")

Geocoding Addresses: 2526it [01:35, 26.42it/s]

2526
GeoJSON file has been formatted and saved.





In [33]:
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import time
from tqdm import tqdm

email_address = "trine.rye@hotmail.com"

geolocator = Nominatim(user_agent=email_address)

# Iterate through each row to check and update geometry
for i, row in tqdm(gdf_usa.iterrows(), desc='Processing missing datacenter locations:'):

    # if a row (i.e. a datacenter) does not have a geometry (i.e. a location)
    if row['geometry'] is None or row['geometry'].is_empty:        
        # Create an address for the geolocator, by trying to combine the 
        address = f"{row['address']}, {row['postal']}, {row['city']}, {row['state']}, {row['country']}"
        location = geolocator.geocode(address)
        print(f'Primary address: {address}')
        print(f'geocode location: {location}')

        #we wait 2 seconds, as to not overload the nominatim API
        time.sleep(2)
        if location:
            # Primary location found, updating the geometry
            gdf_usa.at[i, 'geometry'] = Point(location.longitude, location.latitude)
            gdf_usa.at[i, 'latitude'] = location.latitude
            gdf_usa.at[i, 'longitude'] = location.longitude
        else:
            # If primary address does not return a location, try an alternate address
            alt_address = f"{row['postal']}, {row['city']}, {row['state']}, {row['country']}"
            print(f'Primary address didnt not return a geocoding location. Trying alternate address')
            print(f'Alternate address: {alt_address}')
            alt_location = geolocator.geocode(alt_address)
            print(f'alternate geocode location: {location}')
            
            #we wait 2 seconds, as to not overload the nominatim API
            time.sleep(2)
            if alt_location:
                # Alternate location found, updating the geometry
                gdf_usa.at[i, 'geometry'] = Point(alt_location.longitude, alt_location.latitude)
                gdf_usa.at[i, 'latitude'] = alt_location.latitude
                gdf_usa.at[i, 'longitude'] = alt_location.longitude
            else:
                # Neither primary nor alternate location found
                print(f"No geocoded locations found for {alt_address}")

gdf_usa_filtered = gdf_usa[gdf_usa['geometry'].notna()]

print(len(gdf_usa_filtered))

csv_path = os.path.join(in_folderpath, "datacenters_usa_2.csv")

gdf_usa_filtered.to_csv(csv_path, index=False)

print("GeoJSON file has been formatted and saved.")



Processing missing datacenter locations:: 0it [00:00, ?it/s]

Primary address: SECRET, within 2 mile from Interstate, 58103, Fargo, North Dakota, USA
geocode location: None
Primary address didnt not return a geocoding location. Trying alternate address
Alternate address: 58103, Fargo, North Dakota, USA
alternate geocode location: None


Processing missing datacenter locations:: 179it [00:09, 19.80it/s]


GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=412+E+Madison+St+suite+1010%2C+33602%2C+Tampa%2C+Florida%2C+USA&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

## Finding missing temperatures

In [2]:

import pandas as pd

state_temp = pd.read_csv(os.path.join(in_folderpath, "state_temp.csv"), skiprows=4)

state_temp.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,ID,Name,Value,Anomaly (1901-2000 base period),Rank,1901-2000 Mean
0,1,Alabama,65.4,2.3,127,63.1
1,2,Arizona,60.8,1.4,105,59.4
2,3,Arkansas,62.9,2.5,126,60.4
3,4,California,58.2,0.8,92,57.4
4,5,Colorado,45.9,1.3,101,44.6


In [3]:
state_temp.rename(columns={'Name' : 'State_Name', 'Value' : 'temperature'}, inplace=True)

state_temp = state_temp[['State_Name','temperature']]

state_temp.head()

Unnamed: 0,State_Name,temperature
0,Alabama,65.4
1,Arizona,60.8
2,Arkansas,62.9
3,California,58.2
4,Colorado,45.9


In [4]:

# Calculates the average of the annual high temperature and annual low temperature for Honolulu, Hawaii
avg_temp_hawaii = (84+71)/2

# Creates a data frame with the missing states and their average tempeatures 
missing_states = pd.DataFrame({
    'State_Name': ['Alaska', 'Hawaii'],
    'temperature': [28.4, avg_temp_hawaii]
})

# Creates a boolean condition that checks if the State_Name column contains any of the missing states
states_found = state_temp['State_Name'].isin(['Alaska', 'Hawaii'])

# If none of the missing states are present in the column, the missing_states data frame and state_temp data frame are merged together  
if not states_found.any():

    state_temp = pd.concat([state_temp, missing_states])

# Sorts the data frame by state name, drops the original index and resets the new one from zero
state_temp = state_temp.sort_values(by='State_Name').reset_index(drop=True)


In [6]:
state_temp

Unnamed: 0,State_Name,temperature
0,Alabama,65.4
1,Alaska,28.4
2,Arizona,60.8
3,Arkansas,62.9
4,California,58.2
5,Colorado,45.9
6,Connecticut,52.2
7,Delaware,57.9
8,Florida,73.2
9,Georgia,65.6


In [13]:
temp_condition = state_temp['temperature'] > 65

filtered_state_temp = state_temp[temp_condition]

# print(filtered_state_temp)

print(len(filtered_state_temp))

7


### Husk at power_prices includere power prices for Distric of Columbia, hvilket ikke er en stat, men som tæller med i statistikken

In [34]:
# pip install polars openpyxl pyxlsb

#Importing Polars
import polars as pl

# Reads Excel file into a Polars DataFrame
power_prices = pl.read_excel(os.path.join("..", "in", "avgprice_annual.xlsx"))

# Removes the first row
power_prices = power_prices.slice(1, len(power_prices) - 1)

# Setting names to rename columns
new_column_names = {
    "Average Price (Cents/kilowatthour) by State by Provider, 1990-2020": "Year",
    "": "State",
    "_duplicated_0": "Industry Sector Category",
    "_duplicated_1": "Residential",
    "_duplicated_2": "Commercial",
    "_duplicated_3": "Industrial",
    "_duplicated_4": "Transportation",
    "_duplicated_5": "Other",
    "_duplicated_6": "Total",
    }

# Renaming columns
power_prices = power_prices.rename(new_column_names)

# Casting Year as floats
power_prices = power_prices.with_columns(pl.col("Year").cast(pl.Float64))

# Only look at data from 2020 in the year column
power_prices = power_prices.filter((pl.col("Year") == 2020) & (pl.col("State") != "US") & (pl.col("State") != "DC"))



# Only look at the Total Electric Industry in the Industry Sector Column
power_prices = power_prices.filter(pl.col("Industry Sector Category") == "Total Electric Industry")

# Casting Industrial as floats
power_prices = power_prices.with_columns(pl.col("Industrial").cast(pl.Float64))

# Calculate the mean of the "Industrial" column
mean_value = power_prices.select(pl.col("Industrial").mean().alias("mean"))

over_mean = power_prices.filter(pl.col("Industrial") > mean_value["mean"][0])

# Filter the data frames to only contain entries under the mean
under_mean = power_prices.filter(pl.col("Industrial") < mean_value["mean"][0])

print(mean_value)
print(over_mean.head(11))


shape: (1, 1)
┌──────────┐
│ mean     │
│ ---      │
│ f64      │
╞══════════╡
│ 7.847843 │
└──────────┘
shape: (11, 9)
┌────────┬───────┬─────────────────┬─────────────┬───┬────────────┬────────────────┬───────┬───────┐
│ Year   ┆ State ┆ Industry Sector ┆ Residential ┆ … ┆ Industrial ┆ Transportation ┆ Other ┆ Total │
│ ---    ┆ ---   ┆ Category        ┆ ---         ┆   ┆ ---        ┆ ---            ┆ ---   ┆ ---   │
│ f64    ┆ str   ┆ ---             ┆ str         ┆   ┆ f64        ┆ str            ┆ str   ┆ str   │
│        ┆       ┆ str             ┆             ┆   ┆            ┆                ┆       ┆       │
╞════════╪═══════╪═════════════════╪═════════════╪═══╪════════════╪════════════════╪═══════╪═══════╡
│ 2020.0 ┆ AK    ┆ Total Electric  ┆ 22.57       ┆ … ┆ 15.88      ┆ 0              ┆ NA    ┆ 19.82 │
│        ┆       ┆ Industry        ┆             ┆   ┆            ┆                ┆       ┆       │
│ 2020.0 ┆ CA    ┆ Total Electric  ┆ 20.45       ┆ … ┆ 14.27      ┆ 10.0

## References 

### Nominatim

https://medium.com/@gopesh3652/geocoding-with-python-using-nominatim-a-beginners-guide-220b250ca48d 

https://geopy.readthedocs.io/en/stable/ 


### Temperatures

Alaska:
https://www.ncei.noaa.gov/access/monitoring/monthly-report/national/202313 

Hawaii:
https://www.usclimatedata.com/climate/honolulu/hawaii/united-states/ushi0026

Washington DC:
https://www.weather.gov/media/lwx/climate/dcatemps.pdf 
