# Clean NOAA - Precipitation Data

## Purpose:
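* Clean NOAA station-level daily precipitation data (parse dates, check for missing values)
* Aggregate station observations to the county level (keyed by FIPS code) by averaging precipitation per county and date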

## Dependencies

In [1]:
import pandas as pd
import numpy as np

In [2]:
## Read the raw NOAA export into a DataFrame
raw_path = "data/raw/2370299.csv"
noaa0 = pd.read_csv(raw_path)

## Preview the first rows
noaa0.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DAPR,MDPR,PRCP,SNOW,SNWD,FIPS,COUNTY
0,US1VAYR0007,"WILLIAMSBURG 4.4 N, VA US",37.333237,-76.712631,24.1,3/18/2020,5.0,0.0,,,,51830,Williamsburg County
1,US1VAYR0007,"WILLIAMSBURG 4.4 N, VA US",37.333237,-76.712631,24.1,3/19/2020,,,0.4,,,51830,Williamsburg County
2,US1VAYR0007,"WILLIAMSBURG 4.4 N, VA US",37.333237,-76.712631,24.1,3/20/2020,,,0.0,0.0,,51830,Williamsburg County
3,US1VAYR0007,"WILLIAMSBURG 4.4 N, VA US",37.333237,-76.712631,24.1,3/21/2020,,,0.18,,,51830,Williamsburg County
4,US1VAYR0007,"WILLIAMSBURG 4.4 N, VA US",37.333237,-76.712631,24.1,3/22/2020,,,0.0,0.0,,51830,Williamsburg County


In [3]:
## Check missing data of precipitation
## (vectorized count of missing PRCP values; avoids iterating the Series in Python)
noaa0['PRCP'].isna().sum()

578

In [4]:
## Convert DATE to datetime object
noaa0['DATE'] = pd.to_datetime(noaa0['DATE'])

## Check min/max
## Series.min()/.max() are vectorized and skip NaT, whereas the builtin
## min()/max() iterate in Python and can give order-dependent results with NaT.
date_col = noaa0['DATE']
print(date_col.min(), date_col.max())

2020-03-17 00:00:00 2020-11-23 00:00:00


In [5]:
## Aggregate to county instead of weather station level:
## one row per (COUNTY, FIPS, DATE) holding the mean of PRCP over the rows in that group.
## aggfunc is passed as the string "mean" rather than np.mean, because recent
## pandas versions emit a FutureWarning when a NumPy callable is supplied here.
## NOTE(review): with pivot_table's default dropna=True, groups whose PRCP values
## are all missing appear to be left out of the result -- confirm this is intended.
noaa1 = pd.pivot_table(
    noaa0,
    values="PRCP",
    index=["COUNTY", "FIPS", "DATE"],
    aggfunc="mean",
)

noaa1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PRCP
COUNTY,FIPS,DATE,Unnamed: 3_level_1
Amelia County,51007,2020-03-17,0.0
Amelia County,51007,2020-03-18,0.0
Amelia County,51007,2020-03-19,0.095
Amelia County,51007,2020-03-20,0.0
Amelia County,51007,2020-03-21,0.145


In [6]:
## Write the county-level precipitation table to the build folder
## (the COUNTY/FIPS/DATE index is written out as leading columns)
build_path = "data/build/build3_precipitation.csv"
noaa1.to_csv(build_path)