# Clean Weather Station Locations

In [1]:
# package imports go here
from pathlib import Path
from urllib.request import urlopen

import numpy as np
import pandas as pd

### Read in Weather Station Locations from a file

In [2]:
# Load the NOAA GHCN-Daily ground station list (ID, lat/long, elevation, names).
# pandas IO tools reference for reading this kind of file:
# -- https://pandas.pydata.org/pandas-docs/version/0.10.1/io.html
#
# NOAA ground site data:
# https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt

noaa_daily_site = 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/'
noaa_ground_station_loc_file = f'{noaa_daily_site}ghcnd-stations.txt'

# Note: the file is fixed-width rather than delimiter-separated, so it is
# parsed with read_fwf and one explicit (start, end) character span per field.
col_specs = [
    (0, 11),   # ID
    (12, 20),  # Lat
    (21, 30),  # Lon
    (31, 37),  # Elev
    (38, 40),  # State
    (41, 71),  # Name
    (72, 75),  # GSN
    (76, 79),  # HCN/CRN
    (80, 85),  # WMO ID
]

# header=None keeps the first line as data; index_col=0 makes the station ID
# the index of the resulting frame.
with urlopen(noaa_ground_station_loc_file) as url:
    ground_stations = pd.read_fwf(url, header=None, index_col=0, colspecs=col_specs)

ground_stations


Unnamed: 0_level_0,1,2,3,4,5,6,7,8
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,
ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,,
AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0
AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL,,,41194.0
AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL,,,41217.0
...,...,...,...,...,...,...,...,...
ZI000067969,-21.0500,29.3670,861.0,,WEST NICHOLSON,,,67969.0
ZI000067975,-20.0670,30.8670,1095.0,,MASVINGO,,,67975.0
ZI000067977,-21.0170,31.5830,430.0,,BUFFALO RANGE,,,67977.0
ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,,67983.0


In [3]:
# Examine ground station dataFrame
# Move the station ID out of the index and back into an ordinary column.
# The frame comes from read_fwf(index_col=0), so its index is not a RangeIndex
# the first time through.  Resetting only in that case keeps this cell safe to
# re-run: a second unconditional reset would insert an extra 'index' column
# and the nine-name column assignment that follows would then fail.
if not isinstance(ground_stations.index, pd.RangeIndex):
    ground_stations = ground_stations.reset_index()
ground_stations

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,,
2,AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0
3,AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL,,,41194.0
4,AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL,,,41217.0
...,...,...,...,...,...,...,...,...,...
125983,ZI000067969,-21.0500,29.3670,861.0,,WEST NICHOLSON,,,67969.0
125984,ZI000067975,-20.0670,30.8670,1095.0,,MASVINGO,,,67975.0
125985,ZI000067977,-21.0170,31.5830,430.0,,BUFFALO RANGE,,,67977.0
125986,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,,67983.0


In [4]:
# Give the nine positional columns their station-file field names, then list
# the dtype pandas inferred for each one.
station_column_names = [
    'ID',
    'Lat',
    'Lon',
    'Elevation',
    'State',
    'Name',
    'GSN',
    'HCN/CRN',
    'WMO ID',
]
ground_stations.columns = station_column_names

ground_stations.dtypes


ID            object
Lat          float64
Lon          float64
Elevation    float64
State         object
Name          object
GSN           object
HCN/CRN       object
WMO ID       float64
dtype: object

In [5]:
# Replace missing entries with neutral placeholders (0 for the numeric WMO ID,
# an empty string for the text columns), then report how many NaN values are left.
nan_placeholders = {
    'WMO ID': 0,
    'GSN': '',
    'HCN/CRN': '',
    'State': '',
}
ground_stations = ground_stations.fillna(value=nan_placeholders)

remaining_nan_count = ground_stations.isnull().sum().sum()
print(f"Number of remaining NaN values after cleaning is: {remaining_nan_count}")


Number of remaining NaN values after cleaning is: 0


In [6]:
# WMO ID no longer contains NaN, so it can be stored as an integer column.
# astype raises if any value cannot be converted.
ground_stations['WMO ID'] = ground_stations['WMO ID'].astype(int)
ground_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125988 entries, 0 to 125987
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   ID         125988 non-null  object 
 1   Lat        125988 non-null  float64
 2   Lon        125988 non-null  float64
 3   Elevation  125988 non-null  float64
 4   State      125988 non-null  object 
 5   Name       125988 non-null  object 
 6   GSN        125988 non-null  object 
 7   HCN/CRN    125988 non-null  object 
 8   WMO ID     125988 non-null  int64  
dtypes: float64(3), int64(1), object(5)
memory usage: 8.7+ MB


In [7]:
# Keep only the stations whose ID starts with "US" or "CA".
is_us_or_ca = ground_stations['ID'].str.contains("^US|^CA")
ground_stations_US_CA = ground_stations.loc[is_us_or_ca]
ground_stations_US_CA

Unnamed: 0,ID,Lat,Lon,Elevation,State,Name,GSN,HCN/CRN,WMO ID
23763,CA001010066,48.8667,-123.2833,4.0,BC,ACTIVE PASS,,,0
23764,CA001010235,48.4000,-123.4833,17.0,BC,ALBERT HEAD,,,0
23765,CA001010595,48.5833,-123.5167,85.0,BC,BAMBERTON OCEAN CEMENT,,,0
23766,CA001010720,48.5000,-124.0000,351.0,BC,BEAR CREEK,,,0
23767,CA001010774,48.5000,-123.3500,61.0,BC,BEAVER LAKE,,,0
...,...,...,...,...,...,...,...,...,...
125176,USW00096405,60.4731,-145.3542,25.3,AK,CORDOVA 14 ESE,,CRN,0
125177,USW00096406,64.5014,-154.1297,78.9,AK,RUBY 44 ESE,,CRN,0
125178,USW00096407,66.5620,-159.0036,6.7,AK,SELAWIK 28 E,,CRN,0
125179,USW00096408,63.4519,-150.8747,678.2,AK,DENALI 27 N,,CRN,0


In [8]:
# Drop the columns that are not needed in the station location table.
unused_columns = ['GSN', 'HCN/CRN', 'WMO ID']
ground_stations_US_CA = ground_stations_US_CA.drop(columns=unused_columns)
ground_stations_US_CA

Unnamed: 0,ID,Lat,Lon,Elevation,State,Name
23763,CA001010066,48.8667,-123.2833,4.0,BC,ACTIVE PASS
23764,CA001010235,48.4000,-123.4833,17.0,BC,ALBERT HEAD
23765,CA001010595,48.5833,-123.5167,85.0,BC,BAMBERTON OCEAN CEMENT
23766,CA001010720,48.5000,-124.0000,351.0,BC,BEAR CREEK
23767,CA001010774,48.5000,-123.3500,61.0,BC,BEAVER LAKE
...,...,...,...,...,...,...
125176,USW00096405,60.4731,-145.3542,25.3,AK,CORDOVA 14 ESE
125177,USW00096406,64.5014,-154.1297,78.9,AK,RUBY 44 ESE
125178,USW00096407,66.5620,-159.0036,6.7,AK,SELAWIK 28 E
125179,USW00096408,63.4519,-150.8747,678.2,AK,DENALI 27 N


In [9]:
# Missing elevation data is stored as a large negative sentinel.  The
# GHCN-Daily readme documents the value as -999.9 (metres), so an exact
# replace(-999, ...) never matches it and the sentinels would survive into the
# output.  Anything at or below -999 is therefore treated as missing.
# The sentinels skew the maps, so they are set to the median elevation.
# This will not be correct, but it will impact the result the least.
# The median is taken over the valid readings only; including the sentinels
# would bias it low.
#
# NOTE: this cell overwrites 'Elevation' in place (metres -> feet).  Re-run the
# cell that builds ground_stations_US_CA before re-running this one, otherwise
# the conversion is applied twice.
MISSING_ELEVATION_CEILING = -999  # metres; values <= this are "missing" markers
FEET_PER_METRE = 3.28084

elevation_m = ground_stations_US_CA['Elevation']
elevation_m = elevation_m.mask(elevation_m <= MISSING_ELEVATION_CEILING)  # sentinel -> NaN

# Name kept for compatibility; the value is the median (in metres) of the
# stations that actually report an elevation.  median() skips NaN by default.
average_elevation = elevation_m.median()

ground_stations_US_CA['Elevation'] = elevation_m.fillna(average_elevation) * FEET_PER_METRE
ground_stations_US_CA

Unnamed: 0,ID,Lat,Lon,Elevation,State,Name
23763,CA001010066,48.8667,-123.2833,13.123360,BC,ACTIVE PASS
23764,CA001010235,48.4000,-123.4833,55.774280,BC,ALBERT HEAD
23765,CA001010595,48.5833,-123.5167,278.871400,BC,BAMBERTON OCEAN CEMENT
23766,CA001010720,48.5000,-124.0000,1151.574840,BC,BEAR CREEK
23767,CA001010774,48.5000,-123.3500,200.131240,BC,BEAVER LAKE
...,...,...,...,...,...,...
125176,USW00096405,60.4731,-145.3542,83.005252,AK,CORDOVA 14 ESE
125177,USW00096406,64.5014,-154.1297,258.858276,AK,RUBY 44 ESE
125178,USW00096407,66.5620,-159.0036,21.981628,AK,SELAWIK 28 E
125179,USW00096408,63.4519,-150.8747,2225.065688,AK,DENALI 27 N


In [10]:
# Write US & CA ground stations to a gzip-compressed parquet file.
# The output folder is created first so that a fresh checkout can run the
# notebook top to bottom without to_parquet failing on a missing directory.
Path('result_files').mkdir(parents=True, exist_ok=True)
#ground_stations_US_CA.to_csv('result_files/US_CA_Station_locations.csv', index=False)
ground_stations_US_CA.to_parquet('result_files/stp2_US_CA_Station_locations.parquet.gzip', compression='gzip', engine="fastparquet")

In [11]:
# Read the saved stations back from disk and display them.
# The result of read_parquet must be assigned: without the assignment the
# loaded frame is thrown away and the cell just re-displays the in-memory
# frame, so nothing about the written file is actually checked.
#ground_stations_US_CA = pd.read_csv('result_files/US_CA_Station_locations.csv')
ground_stations_US_CA = pd.read_parquet('result_files/stp2_US_CA_Station_locations.parquet.gzip', engine="fastparquet")
ground_stations_US_CA

Unnamed: 0,ID,Lat,Lon,Elevation,State,Name
23763,CA001010066,48.8667,-123.2833,13.123360,BC,ACTIVE PASS
23764,CA001010235,48.4000,-123.4833,55.774280,BC,ALBERT HEAD
23765,CA001010595,48.5833,-123.5167,278.871400,BC,BAMBERTON OCEAN CEMENT
23766,CA001010720,48.5000,-124.0000,1151.574840,BC,BEAR CREEK
23767,CA001010774,48.5000,-123.3500,200.131240,BC,BEAVER LAKE
...,...,...,...,...,...,...
125176,USW00096405,60.4731,-145.3542,83.005252,AK,CORDOVA 14 ESE
125177,USW00096406,64.5014,-154.1297,258.858276,AK,RUBY 44 ESE
125178,USW00096407,66.5620,-159.0036,21.981628,AK,SELAWIK 28 E
125179,USW00096408,63.4519,-150.8747,2225.065688,AK,DENALI 27 N
