In [1]:
import os
import time
from urllib.parse import quote_plus

import geopandas as gpd
import pandas as pd
import psycopg2
import requests
from shapely.geometry import Point
from sqlalchemy import create_engine, text

In [2]:
#Location of the CSV copy of the station table.
#Set the GDD_CSV_PATH environment variable to use a different location on another
#machine; when it is not set, the original path below is used unchanged.
gdd_csv_path = os.environ.get('GDD_CSV_PATH', r'C:\Users\tjjoh\OneDrive\Desktop\GIS 5572\Lab2\gdd.csv')

In [3]:
#Request settings for the station-list endpoint
url = r'https://www.ncei.noaa.gov/cdo-web/api/v2/stations'
params = {
    'locationid': 'FIPS:27', #Minnesota is 27
    'datasetid': 'GHCND', #Global Historical Climatology Network Daily
    'limit': 1000 #Maximum results per request
}

#API token: prefer the NOAA_CDO_TOKEN environment variable so the secret does not
#have to live in the notebook.
#SECURITY: the fallback literal is kept only so existing runs keep working; it has
#been committed with this notebook, so it should be rotated and then removed here.
headers = {'token': os.environ.get('NOAA_CDO_TOKEN', 'JgSvDOViKdKyBepyxxCKfJDaMaQghdfB')}

#Filled by the pagination loop with one record per station
stations = []

In [4]:
#Get a list of all stations in Minnesota.
#Each request returns at most params['limit'] records, so successive pages are
#requested (advancing 'offset') until a page comes back short or without results.
#NOTE(review): confirm whether the API treats 'offset' as 0- or 1-based; if it is
#1-based, consecutive pages requested this way overlap by one record.
offset = 0 #Pagination will be necessary to retrieve all the data

while True:
    params['offset'] = offset #set the offset parameter
    try:
        #timeout keeps a stalled connection from hanging the cell indefinitely
        response = requests.get(url, params = params, headers = headers, timeout = 30) #make API request
        response.raise_for_status() #surface HTTP error statuses instead of stopping silently
        data = response.json() #transform to json
    except (requests.RequestException, ValueError) as error:
        #RequestException: connection failure, timeout or HTTP error status
        #ValueError: the response body was not valid JSON
        #(the error itself is reported because 'response' may not exist at this point)
        print(f'Error retriving data: {error}')
        break

    if 'results' not in data: #nothing (more) to read
        break

    page = data['results']
    stations.extend(page) #keep the full station records

    if len(page) < params['limit']: #check if all data has been retrieved
        break

    offset = offset + params['limit'] #update offset

In [5]:
#Request settings for the data endpoint: monthly normals for the 2010 date range
url = 'https://www.ncei.noaa.gov/cdo-web/api/v2/data'
params = dict(
    datasetid = 'NORMAL_MLY',
    startdate = '2010-01-01',
    enddate = '2010-12-01',
    units = 'standard',
    limit = 1000,
)

In [6]:
#Download the monthly normals for each station and keep the stations that report
#all twelve MLY-CLDD-BASE50 values.
#Each kept row is: station id, latitude, longitude, name, then the twelve values in
#the order the API returned them (assumed to be January..December - TODO confirm).
first_station = 2189 #the first 2188 give errors
max_attempts = 5 #maximum number of attempts
expected_length = 16 #4 station fields + 12 monthly values
gdd_rows = [] #rows are collected here and turned into a dataframe once, after the loop

for i in range(first_station, len(stations)):
    station = stations[i]
    params['stationid'] = station['id'] #add station id to url parameters

    for attempt in range(1, max_attempts + 1): #try each station up to max_attempts times
        retry = False #only transient failures (no response, 503) are retried

        try:
            #timeout keeps a stalled connection from hanging the loop
            response = requests.get(url, params = params, headers = headers, timeout = 30) #retrieve data
        except requests.RequestException as error: #no response at all (connection problem, timeout, ...)
            retry = True
            failure = type(error).__name__
        else:
            if response.status_code == 503: #service unavailable, worth another attempt
                retry = True
                failure = '503'
            elif response.status_code != 200: #check if status is not OK
                print(f'{i} Error: {response.status_code}')
            else:
                try:
                    results = response.json().get('results', []) #a response without 'results' has no data for this station
                    dd = [station['id'], station['latitude'], station['longitude'], station['name']] #extract station id, location, and name
                    #find desired dataset. BMSB become active at around 10C (50F)
                    dd.extend(record['value'] for record in results if record['datatype'] == 'MLY-CLDD-BASE50')
                except (AttributeError, KeyError, TypeError, ValueError): #malformed body or unexpected record layout
                    print(f'{i} Error: unknown')
                else:
                    if len(dd) == expected_length: #check to see if dd is the proper length
                        gdd_rows.append(dd) #keep the row if the proper data exists
                        print(f'{i} Success')
                    else:
                        print(f'{i} Error: gdd_length = {len(dd)}')

        if retry:
            if attempt < max_attempts: #if attempts remain, try api request again
                print(f'{i} Error: {failure}, Retrying...')
            else: #if no attempts remain, skip point
                print(f'{i} Error: {failure}, Failed to retrieve data.')

        time.sleep(0.5) #pause between requests

        if not retry:
            break

gdd = pd.DataFrame(gdd_rows) #one row per station with complete data

2189 Success
2190 Success
2191 Success
2192 Success
2193 Error: gdd_length = 4
2194 Error: gdd_length = 4
2195 Error: gdd_length = 4
2196 Error: gdd_length = 4
2197 Error: gdd_length = 4
2198 Error: gdd_length = 4
2199 Error: gdd_length = 4
2200 Success
2201 Success
2202 Error: gdd_length = 4
2203 Success
2204 Success
2205 Error: gdd_length = 4
2206 Success
2207 Success
2208 Error: gdd_length = 4
2209 Success
2210 Error: gdd_length = 4
2211 Error: gdd_length = 4
2212 Error: gdd_length = 4
2213 Error: gdd_length = 4
2214 Success
2215 Error: gdd_length = 4
2216 Success
2217 Success
2218 Success
2219 Error: gdd_length = 4
2220 Error: gdd_length = 4
2221 Error: gdd_length = 4
2222 Error: gdd_length = 4
2223 Error: gdd_length = 4
2224 Success
2225 Error: gdd_length = 4
2226 Error: gdd_length = 4
2227 Error: gdd_length = 4
2228 Error: gdd_length = 4
2229 Error: gdd_length = 4
2230 Error: gdd_length = 4
2231 Success
2232 Success
2233 Error: gdd_length = 4
2234 Error: gdd_length = 4
2235 Succe

In [7]:
#Label the sixteen positional columns: four station attributes, then one column per month
gdd.columns = (
    ['ID', 'Latitude', 'Longitude', 'Name']
    + ['January', 'February', 'March', 'April', 'May', 'June']
    + ['July', 'August', 'September', 'October', 'November', 'December']
)

In [8]:
# Build one shapely point per station (x = longitude, y = latitude)
geometry = [Point(lon, lat) for lon, lat in zip(gdd['Longitude'], gdd['Latitude'])]

# Wrap the table in a GeoDataFrame, declaring WGS84 (EPSG:4326) as its CRS
gdd_gdf = gpd.GeoDataFrame(gdd, geometry=geometry, crs='EPSG:4326')

# Show the resulting GeoDataFrame as the cell output
gdd_gdf

Unnamed: 0,ID,Latitude,Longitude,Name,January,February,March,April,May,June,July,August,September,October,November,December,geometry
0,GHCND:USC00210018,47.299100,-96.516100,"ADA, MN US",0.0,-7777.0,1.0,51.0,227.0,460.0,614.0,570.0,269.0,49.0,2.0,-7777.0,POINT (-96.51610 47.29910)
1,GHCND:USC00210050,48.300500,-95.981600,"AGASSIZ REFUGE, MN US",0.0,-7777.0,1.0,47.0,238.0,460.0,615.0,555.0,266.0,50.0,1.0,0.0,POINT (-95.98160 48.30050)
2,GHCND:USC00210059,46.525700,-93.667400,"AITKIN 2 E, MN US",0.0,0.0,-7777.0,30.0,159.0,385.0,544.0,471.0,219.0,47.0,3.0,0.0,POINT (-93.66740 46.52570)
3,GHCND:USC00210075,43.606400,-93.301900,"ALBERT LEA 3 SE, MN US",0.0,-7777.0,6.0,70.0,272.0,539.0,680.0,604.0,341.0,90.0,9.0,-7777.0,POINT (-93.30190 43.60640)
4,GHCND:USC00210157,43.883500,-94.166500,"AMBOY, MN US",0.0,-7777.0,9.0,90.0,328.0,594.0,714.0,629.0,381.0,125.0,12.0,-7777.0,POINT (-94.16650 43.88350)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,GHCND:USW00094960,45.062220,-93.351070,"MINNEAPOLIS CRYSTAL AIRPORT, MN US",0.0,-7777.0,7.0,75.0,280.0,544.0,704.0,660.0,349.0,97.0,7.0,-7777.0,POINT (-93.35107 45.06222)
189,GHCND:USW00094961,48.726060,-94.612160,"BAUDETTE INTERNATIONAL AIRPORT, MN US",0.0,-7777.0,-7777.0,23.0,146.0,357.0,507.0,443.0,181.0,28.0,1.0,0.0,POINT (-94.61216 48.72606)
190,GHCND:USW00094963,44.832140,-93.470510,"MINNEAPOLIS FLYING CLOUD AIRPORT, MN US",0.0,-7777.0,7.0,81.0,302.0,568.0,727.0,649.0,365.0,103.0,10.0,-7777.0,POINT (-93.47051 44.83214)
191,GHCND:USW00094967,46.899670,-95.066820,"PARK RAPIDS MUNICIPAL AIRPORT, MN US",0.0,-7777.0,1.0,32.0,173.0,381.0,545.0,487.0,218.0,37.0,1.0,0.0,POINT (-95.06682 46.89967)


In [9]:
# Write the GeoDataFrame to gdd_csv_path, leaving the index out of the file
gdd_gdf.to_csv(path_or_buf=gdd_csv_path, index=False)

In [10]:
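#Optional: uncomment the lines below to rebuild gdd_gdf from the saved CSV instead of repeating the API download
#gdd = pd.read_csv(gdd_csv_path)
#gdd['geometry'] = [Point(xy) for xy in zip(gdd['Longitude'], gdd['Latitude'])]
#gdd_gdf = gpd.GeoDataFrame(gdd, geometry = 'geometry', crs = 'EPSG:4326')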

In [11]:
# Latitude/longitude limits used to sanity-check the station coordinates
min_latitude, max_latitude = 43, 49.5
min_longitude, max_longitude = -97.5, -89

station_lat = gdd_gdf['Latitude']
station_lon = gdd_gdf['Longitude']

# A point is invalid when it falls strictly outside any of the four limits
outside_box = (
    station_lat.lt(min_latitude)
    | station_lat.gt(max_latitude)
    | station_lon.lt(min_longitude)
    | station_lon.gt(max_longitude)
)
invalid_geom = gdd_gdf[outside_box]

if not invalid_geom.empty:
    print('Invalid points:')
    print(invalid_geom)
    # Keep only the rows that lie inside the box (limits included)
    inside_box = (
        station_lat.ge(min_latitude)
        & station_lat.le(max_latitude)
        & station_lon.ge(min_longitude)
        & station_lon.le(max_longitude)
    )
    gdd_gdf = gdd_gdf[inside_box]
else:
    print('All geometry is valid')

All geometry is valid


In [12]:
warm_months = ['May', 'June', 'July', 'August', 'September']
cold_months = ['December', 'January', 'February', 'March']

# A station is rejected when any cold-month value is at least as large as any
# warm-month value. That condition is the same as "largest cold-month value >=
# smallest warm-month value", which is computed column-wise here instead of running
# a Python loop over every row. Unlike a row-wise apply, this also yields a proper
# (empty) boolean mask when gdd_gdf has no rows.
# NOTE(review): this runs before the -7777 values are replaced, so those values take
# part in the comparison as large negative numbers - confirm that is intended.
invalid_gdd = gdd_gdf[cold_months].max(axis = 1) >= gdd_gdf[warm_months].min(axis = 1)

gdd_gdf = gdd_gdf[~invalid_gdd]

In [13]:
# Swap every -7777 entry for 0 (presumably a special flag value rather than a real measurement - TODO confirm)
gdd_gdf = gdd_gdf.replace(to_replace=-7777, value=0)

In [15]:
#connect to the new database to enable PostGIS
#The credentials are read from the PGUSER / PGPASSWORD environment variables so they
#are never stored in the notebook. quote_plus escapes characters such as '@', ':' or
#'/' that would otherwise break the connection URL.
db_user = os.environ.get('PGUSER')
db_password = os.environ.get('PGPASSWORD')
if not db_user or not db_password:
    raise RuntimeError('Set the PGUSER and PGPASSWORD environment variables before running this cell.')

connection_string = f'postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}@34.133.43.30:5432/lab2'
engine = create_engine(connection_string)

# Push the GeoDataFrame to PostGIS (an existing table with the same name is replaced)
table_name = "mn_gdd"
gdd_gdf.to_postgis(table_name, engine, if_exists="replace", index=False)

# Only reached when to_postgis did not raise
print(f"GeoDataFrame successfully pushed to the PostGIS table '{table_name}'.")

GeoDataFrame successfully pushed to the PostGIS table 'mn_gdd'.
