# Dataset

bouldercreek_09_2013.txt

- stream discharge data
- summarized at 15-minute intervals (in cubic feet per second)
- measured from 1-30 September 2013

In [None]:
import pandas as pd

In [None]:
# First attempt: read the file with read_csv's default settings.
# We get a ParserError because the file is not in csv format.
data = pd.read_csv(filepath_or_buffer='../data/bouldercreek_09_2013.txt')

# Read file correctly

- open the txt file in a text editor
- observe issues with the file format:
    - first 25 rows are descriptive text
    - uses tab delimiter (not comma)

In [None]:
# The first 25 rows contain descriptive text, so tell the parser to skip them.
data = pd.read_csv(
    '../data/bouldercreek_09_2013.txt',
    skiprows=25,
)

In [None]:
# preview the first five rows of the result
data.head(n=5)

In [None]:
# we change the delimiter to tab: \t
# (the path uses the same '../data/' location as the earlier read_csv calls,
# so this cell works from the same working directory as they do)
data = pd.read_csv('../data/bouldercreek_09_2013.txt', skiprows=25, delimiter='\t')

In [None]:
# preview the first five rows again, now parsed with the tab delimiter
data.head(n=5)

In [None]:
# cleaning: remove the first row
# .copy() makes the result an independent DataFrame, so the column
# assignments further down modify it directly instead of a slice of the
# original frame (which can raise pandas' SettingWithCopyWarning)
data = data.loc[1:, :].copy()

# Explore the data

- *04_00060* is the code for the discharge rate

In [None]:
# list the distinct site numbers: we are only observing one site
data.loc[:, 'site_no'].unique()

In [None]:
# summarize the discharge column;
# the output shows that the discharge rate column is not numerical
data.loc[:, '04_00060'].describe()

In [None]:
# convert the discharge column to floating-point numbers
data['04_00060'] = data['04_00060'].astype('float64')

In [None]:
# summarize the discharge column again after the conversion to float
data.loc[:, '04_00060'].describe()

In [None]:
# inspect the datetime column: it is not in datetime format
data.loc[:, 'datetime']

In [None]:
# parse the datetime column into pandas datetime values
data['datetime'] = pd.to_datetime(arg=data['datetime'])

# Create plot

In [None]:
import plotnine as p9

In [None]:
# base plot shared by the cells below: datetime on x, discharge (04_00060) on y
base_plot = p9.ggplot(
    data=data,
    mapping=p9.aes('datetime', '04_00060'),
)

In [None]:
# draw the discharge series as a line
(
    base_plot
    + p9.geom_line()
)

Tidy up the x-axis: show one tick per week, labelled as day/month.

In [None]:
# same line plot, with one x-axis break per week labelled as day/month
# (scale_x_datetime also has 'limits' parameter to specify the boundaries)
(
    base_plot
    + p9.geom_line()
    + p9.scale_x_datetime(date_breaks='1 week', date_labels='%d/%m')
)