In [None]:
# pip install geopandas geopy opencage tqdm ipywidgets   (ipywidgets is only needed for tqdm's notebook progress bar)

In [2]:
import os

# Folder holding the input data, addressed relative to the notebook's working
# directory ("../in"). Later cells read their inputs from, and write the
# geocoded GeoJSON back into, this folder.
in_folderpath = os.path.join("..", "in")

# Create the folder up front; exist_ok=True makes this a no-op on re-runs.
os.makedirs(name=in_folderpath, exist_ok=True)

In [None]:
import geopandas as gpd

# Load the data-center locations into a GeoDataFrame.
gdf = gpd.read_file(os.path.join(in_folderpath, "datacenters.geojson"))

# Keep only the rows located in the US.
# The explicit .copy() makes gdf_usa an independent frame: the geocoding cells
# below write into it with .at[...], and doing that on a plain boolean-mask
# slice of gdf raises SettingWithCopyWarning and leaves it ambiguous which
# object actually receives the update.
gdf_usa = gdf[gdf['country'] == 'USA'].copy()

# Number of US data centers, as a baseline for the count after geocoding.
print(len(gdf_usa))

## Correct way of using the geolocator

In [None]:

from opencage.geocoder import OpenCageGeocode
from shapely.geometry import Point
import json
from tqdm import tqdm

# Read the key from the environment so a real key never has to be saved in the
# notebook. The fallback is the original placeholder, which is not a usable key
# and must be replaced (or OPENCAGE_API_KEY set) before running this cell.
api_key = os.environ.get('OPENCAGE_API_KEY', 'API KEY')

geolocator = OpenCageGeocode(api_key)

# Iterate through each row and geocode those without a usable geometry.
# iterrows() is a generator, so tqdm needs an explicit total to show a
# percentage / ETA instead of a bare counter.
for i, row in tqdm(gdf_usa.iterrows(), total=len(gdf_usa), desc="Geocoding Addresses"):

    if row['geometry'] is None or row['geometry'].is_empty:

        # Creates an address for the geolocator
        address = f"{row['address']}, {row['postal']}, {row['city']}, {row['state']}, {row['country']}"

        # Returns a list of dictionaries with spatial information about the data center
        location = geolocator.geocode(address)

        if location:

            # Retrieves the geometry from the first (best) result
            geometry = location[0]['geometry']

            # Adds the retrieved latitude and longitude to the row in the data frame.
            # Point takes (x, y), i.e. (longitude, latitude).
            gdf_usa.at[i, 'geometry'] = Point(geometry['lng'], geometry['lat'])
            gdf_usa.at[i, 'latitude'] = geometry['lat']
            gdf_usa.at[i, 'longitude'] = geometry['lng']
        else:
            print("Geometry not found")


# Keep only rows that ended up with a usable geometry. The loop above treats
# both None and *empty* geometries as missing, so the filter must exclude both;
# notna() alone lets empty geometries through.
has_geometry = gdf_usa['geometry'].notna() & ~gdf_usa['geometry'].is_empty
gdf_usa_filtered = gdf_usa[has_geometry]

print(len(gdf_usa_filtered))

# Convert GeoDataFrame to JSON and save.
# The round trip through json.loads/json.dump is only there to get an
# indented, human-readable file.
json_str = gdf_usa_filtered.to_json()

json_dict = json.loads(json_str)

with open(os.path.join(in_folderpath, "datacenters_usa.geojson"), 'w') as f:
    json.dump(json_dict, f, indent=2)

print("GeoJSON file has been formatted and saved.")

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from shapely.geometry import Point
import json
import time
from tqdm import tqdm

# Nominatim is given this string as the User-Agent that identifies the client.
email_address = "victor.rassers.minecraft@gmail.com"

geolocator = Nominatim(user_agent=email_address)

# Throttle *every* request, not just the first one per row: the fallback lookup
# previously went out immediately after the primary one. RateLimiter enforces
# the minimum delay between consecutive calls and, with its defaults, retries
# on errors and then returns None instead of raising, so a single timeout no
# longer aborts the whole loop.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.5)

# Iterate through each row and geocode those without a usable geometry.
# iterrows() is a generator, so tqdm needs an explicit total to show progress.
for i, row in tqdm(gdf_usa.iterrows(), total=len(gdf_usa)):

    if row['geometry'] is None or row['geometry'].is_empty:

        # Full street address first; postal code / city / state as a coarser fallback.
        address = f"{row['address']}, {row['postal']}, {row['city']}, {row['state']}, {row['country']}"
        alt_address = f"{row['postal']}, {row['city']}, {row['state']}, {row['country']}"

        # The fallback is only queried when the primary lookup returns nothing.
        location = geocode(address) or geocode(alt_address)

        if location:
            # Location found, updating the geometry.
            # Point takes (x, y), i.e. (longitude, latitude).
            gdf_usa.at[i, 'geometry'] = Point(location.longitude, location.latitude)
            gdf_usa.at[i, 'latitude'] = location.latitude
            gdf_usa.at[i, 'longitude'] = location.longitude
        else:
            # Neither primary nor alternate location found
            print("Geometry not found")


# Keep only rows that ended up with a usable geometry. The loop above treats
# both None and *empty* geometries as missing, so the filter must exclude both;
# notna() alone lets empty geometries through.
has_geometry = gdf_usa['geometry'].notna() & ~gdf_usa['geometry'].is_empty
gdf_usa_filtered = gdf_usa[has_geometry]

# Convert GeoDataFrame to JSON and save.
# The round trip through json.loads/json.dump is only there to get an
# indented, human-readable file.
json_str = gdf_usa_filtered.to_json()

json_dict = json.loads(json_str)

with open(os.path.join(in_folderpath, "datacenters_usa.geojson"), 'w') as f:
    json.dump(json_dict, f, indent=2)

print("GeoJSON file has been formatted and saved.")

## Finding missing temperatures

In [9]:

import pandas as pd

# Per-state temperature table. skiprows=4 skips the first four lines of the
# file so that parsing starts at the header row
# (presumably report metadata precedes it - TODO confirm against the CSV).
state_temp_path = os.path.join(in_folderpath, "state_temp.csv")
state_temp = pd.read_csv(state_temp_path, skiprows=4)

# Preview the first rows
state_temp.head()

Unnamed: 0,ID,Name,Value,Anomaly (1901-2000 base period),Rank,1901-2000 Mean
0,1,Alabama,65.4,2.3,127,63.1
1,2,Arizona,60.8,1.4,105,59.4
2,3,Arkansas,62.9,2.5,126,60.4
3,4,California,58.2,0.8,92,57.4
4,5,Colorado,45.9,1.3,101,44.6


In [10]:
# Give the two columns we need descriptive names and drop everything else.
# Written as a single chain that returns a new frame rather than mutating
# state_temp in place.
state_temp = (
    state_temp
    .rename(columns={'Name': 'State_Name', 'Value': 'temperature'})
    [['State_Name', 'temperature']]
)

# Preview the trimmed table
state_temp.head()

Unnamed: 0,State_Name,temperature
0,Alabama,65.4
1,Arizona,60.8
2,Arkansas,62.9
3,California,58.2
4,Colorado,45.9


In [11]:

# Calculates the average of the annual high temperature and annual low temperature for Honolulu, Hawaii
avg_temp_hawaii = (84 + 71) / 2

# Creates a data frame with the states absent from the CSV and their average temperatures
missing_states = pd.DataFrame({
    'State_Name': ['Alaska', 'District of Columbia', 'Hawaii'],
    'temperature': [28.4, 61.2, avg_temp_hawaii]
})

# Boolean mask over state_temp: True where a row is one of the states listed above
states_found = state_temp['State_Name'].isin(missing_states['State_Name'])

# Select only the states that are not in state_temp yet. Checking each state
# individually (instead of "add all three only if none is present") means a
# table that already holds e.g. Alaska still receives Hawaii and DC, while
# re-running the cell never creates duplicates.
already_present = missing_states['State_Name'].isin(state_temp.loc[states_found, 'State_Name'])
states_to_add = missing_states[~already_present]

if not states_to_add.empty:

    state_temp = pd.concat([state_temp, states_to_add])

# Sorts the data frame by state name, drops the original index and resets the new one from zero
state_temp = state_temp.sort_values(by='State_Name').reset_index(drop=True)


In [12]:
# Display the table as the cell output to check its contents.
state_temp

Unnamed: 0,State_Name,temperature
0,Alabama,65.4
1,Alaska,28.4
2,Arizona,60.8
3,Arkansas,62.9
4,California,58.2
5,Colorado,45.9
6,Connecticut,52.2
7,Delaware,57.9
8,District of Columbia,61.2
9,Florida,73.2


In [None]:
# pip install polars openpyxl pyxlsb

# Importing Polars
import polars as pl

# Reads the Excel file into a Polars DataFrame, using the same input folder
# variable as the other cells instead of rebuilding the path by hand.
power_prices = pl.read_excel(os.path.join(in_folderpath, "avgprice_annual.xlsx"))

# Removes the first row
# (presumably it holds the sheet's real column headers - TODO confirm)
power_prices = power_prices.slice(1, len(power_prices) - 1)

# Target column names, in sheet order. Only the first column gets a header from
# the sheet's title cell; the remaining headers are blank and receive
# auto-generated names, which appear to differ between Excel engines / Polars
# versions. Renaming by position avoids depending on those generated names.
column_names = [
    "Year",
    "State",
    "Industry Sector Category",
    "Residential",
    "Commercial",
    "Industrial",
    "Transportation",
    "Other",
    "Total",
]
new_column_names = dict(zip(power_prices.columns, column_names))

# Renaming columns
power_prices = power_prices.rename(new_column_names)

power_prices = (
    power_prices
    # Casting Year as floats so it can be compared numerically
    .with_columns(pl.col("Year").cast(pl.Float64))
    # Only look at data from 2020 in the year column
    .filter(pl.col("Year") == 2020)
    # Only look at the Total Electric Industry in the Industry Sector column
    .filter(pl.col("Industry Sector Category") == "Total Electric Industry")
    # Casting Industrial as floats
    .with_columns(pl.col("Industrial").cast(pl.Float64))
)

# Calculate the mean of the "Industrial" column (kept as a 1x1 frame for printing)
mean_value = power_prices.select(pl.col("Industrial").mean().alias("mean"))

# Filter the data frame to only contain entries under the mean; the mean is
# evaluated as an expression over the same column, so no scalar has to be
# pulled out of mean_value by index.
under_mean = power_prices.filter(pl.col("Industrial") < pl.col("Industrial").mean())


print(power_prices)
print(mean_value)
print(under_mean)

## References 

### Nominatim

https://medium.com/@gopesh3652/geocoding-with-python-using-nominatim-a-beginners-guide-220b250ca48d 

https://geopy.readthedocs.io/en/stable/ 


### Temperatures

Alaska:
https://www.ncei.noaa.gov/access/monitoring/monthly-report/national/202313 

Hawaii:
https://www.usclimatedata.com/climate/honolulu/hawaii/united-states/ushi0026

Washington DC:
https://www.weather.gov/media/lwx/climate/dcatemps.pdf 
