In [52]:
import pandas as pd
import glob
import re

The GHCN files are in a fixed-width format. Each row represents a month of data for a variable recorded at a station. The first four columns contain the station id, year, month, and element being measured. The remaining columns are the daily value (VALUE1) and 3 daily flags (MFLAG1, QFLAG1, SFLAG1). Each day has these four columns with the number being the day of the month.

The tedious part of this is generating the list of column widths so the data will be separated correctly, and generating the list of column names. The pattern for both is specified in the data readme.

Code to generate widths argument:

In [24]:
# initial identifier columns
widths = [11, 4, 2, 4]

# generate column widths for 31 days
for i in range(1, 32):
    exend_widths = [5, 1, 1, 1]
    widths.extend(exend_widths)

Code to generate names argument:

In [23]:
# initial identifier columns
names = ['id', 'year', 'month', 'element']

# generate names for daily columns
for i in range(1, 32):
    extend_names = [f'value{i}', f'mflag{i}', f'qflag{i}', f'sflag{i}']
    names.extend(extend_names)

In [25]:
test = pd.read_fwf('../data/USC00401553_cedar_hill.dly', widths = widths, header = None, names = names)

In [26]:
test.head()

Unnamed: 0,id,year,month,element,value1,mflag1,qflag1,sflag1,value2,mflag2,...,qflag29,sflag29,value30,mflag30,qflag30,sflag30,value31,mflag31,qflag31,sflag31
0,USC00401553,1897,3,TMAX,-9999,,,,-9999,,...,,,-9999,,,,-9999,,,
1,USC00401553,1897,3,TMIN,-9999,,,,-9999,,...,,,-9999,,,,-9999,,,
2,USC00401553,1897,3,PRCP,0,P,,6.0,0,P,...,,6.0,198,,,6.0,203,,,6.0
3,USC00401553,1897,3,SNOW,-9999,,,,-9999,,...,,,-9999,,,,-9999,,,
4,USC00401553,1897,4,TMAX,-9999,,,,-9999,,...,,,-9999,,,,-9999,,,


Process all files in folder into a dataframe:

In [34]:
# list of the files
files = glob.glob('../data/*.dly')

# empty list to store dataframes
df_list = []

# loop over the files to create dataframes
for file in files:
    df_list.append(pd.read_fwf(file, widths = widths, header = None, names = names))

# concatenate to single dataframe
weather = pd.concat(df_list, ignore_index = True)

In [35]:
weather.tail()

Unnamed: 0,id,year,month,element,value1,mflag1,qflag1,sflag1,value2,mflag2,...,qflag29,sflag29,value30,mflag30,qflag30,sflag30,value31,mflag31,qflag31,sflag31
79280,USC00405349,1954,5,SNOW,0,,,0.0,0,,...,,0.0,0,,,0.0,0,,,0.0
79281,USC00405349,1954,5,SNWD,0,,,0.0,0,,...,,0.0,0,,,0.0,0,,,0.0
79282,USC00405349,1954,6,PRCP,28,,,0.0,0,,...,,0.0,0,,,0.0,-9999,,,
79283,USC00405349,1954,6,SNOW,0,,,0.0,0,,...,,0.0,0,,,0.0,-9999,,,
79284,USC00405349,1954,6,SNWD,0,,,0.0,0,,...,,0.0,0,,,0.0,-9999,,,


Melt and pivot dataframe so that there is a row per station per day:

In [102]:
# columns for melting
id_vars = ['id', 'year', 'month', 'element']
value_vars = weather.columns[4:-1]
var_name = 'day'
value_name = 'day_value'

weather_melt = pd.melt(weather, 
                       id_vars = id_vars,
                       value_vars = value_vars,
                       var_name = var_name,
                       value_name = value_name)

In [77]:
weather_melt.head()

Unnamed: 0,id,year,month,element,day,day_value
0,USC00408414,1941,2,PRCP,value1,-9999
1,USC00408414,1941,2,SNOW,value1,-9999
2,USC00408414,1941,2,SNWD,value1,-9999
3,USC00408414,1941,2,WT18,value1,-9999
4,USC00408414,1941,3,PRCP,value1,0


Drop 'flag' rows:

In [103]:
weather_melt = weather_melt[~weather_melt['day'].str.contains('flag')]

Create date column from year, month, and extracted number from day column:

In [104]:
# day_num column from day
weather_melt['day_num'] = weather_melt['day'].str.extract('(\d+)')[0].str.zfill(2)

In [105]:
# same for month_num
weather_melt['month_num'] = weather_melt['month'].astype(str).str.zfill(2)

In [106]:
# concatenate into date column
weather_melt['date'] =  (weather_melt['year'].astype(str) + '-' + 
                         weather_melt['month_num'].astype(str) + '-' + 
                         weather_melt['day_num'].astype(str))

In [90]:
weather_melt.head()

Unnamed: 0,id,year,month,element,day,day_value,day_num,month_num,date
0,USC00408414,1941,2,PRCP,value1,-9999,1,2,1941-02-01
1,USC00408414,1941,2,SNOW,value1,-9999,1,2,1941-02-01
2,USC00408414,1941,2,SNWD,value1,-9999,1,2,1941-02-01
3,USC00408414,1941,2,WT18,value1,-9999,1,2,1941-02-01
4,USC00408414,1941,3,PRCP,value1,0,1,3,1941-03-01


Drop unneeded columns:

In [107]:
weather_melt = weather_melt.drop(['day', 'day_num', 'month_num'], axis = 1)

In [108]:
weather_melt.head()

Unnamed: 0,id,year,month,element,day_value,date
0,USC00408414,1941,2,PRCP,-9999,1941-02-01
1,USC00408414,1941,2,SNOW,-9999,1941-02-01
2,USC00408414,1941,2,SNWD,-9999,1941-02-01
3,USC00408414,1941,2,WT18,-9999,1941-02-01
4,USC00408414,1941,3,PRCP,0,1941-03-01


Drop rows with missing value (-9999 per readme):

In [109]:
weather_melt = weather_melt[~weather_melt['day_value'].astype(str).str.contains('-9999')]

In [112]:
weather_melt.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2110276 entries, 4 to 9593481
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   id         2110276 non-null  object
 1   year       2110276 non-null  int64 
 2   month      2110276 non-null  int64 
 3   element    2110276 non-null  object
 4   day_value  2110276 non-null  object
 5   date       2110276 non-null  object
dtypes: int64(2), object(4)
memory usage: 112.7+ MB
