In [2]:
import pandas as pd
# Load the tab-separated file. Lines 0-33 and line 35 (0-based) are skipped,
# which leaves line 34 as the first line read and therefore the header row.
df = pd.read_csv(
    'datasets/dirtydevil.txt',
    sep='\t',
    skiprows=lambda line_no: line_no == 35 or line_no < 34,
    low_memory=False,
)


1. Convert a date column from a string to a proper date.


In [31]:
# Small sample frame whose 'date' column starts out as month/day/year strings.
df2 = pd.DataFrame({
    'date': ['6/2/2017', '5/23/2017', '5/20/2017'],
    'data': [1, 2, 3]})
# Spell out the format: a string such as '6/2/2017' is ambiguous (June 2 vs.
# 6 February), so the conversion should not depend on pandas' inference.
df2['date'] = pd.to_datetime(df2['date'], format='%m/%d/%Y')
print(df2)

        date  data
0 2017-06-02     1
1 2017-05-23     2
2 2017-05-20     3


2. Group the data by month names and look at the mean values.


In [34]:
# Pool the rows by calendar month so that every month name appears exactly
# once, even if the data spans several years, and months without any rows do
# not show up as NaN. The month number is the leading group key so the result
# is ordered by calendar month rather than alphabetically; it is dropped again
# once the groups have been aggregated.
dg = (
    df2.groupby([df2['date'].dt.month.rename('month_number'),
                 df2['date'].dt.strftime('%B').rename('month_name')])[['data']]
    .mean()
    .droplevel('month_number')
    .rename_axis('date')
)
print(dg)

      data
date      
May    2.5
June   1.0


3. Group the data by each month of every year and look at the mean values.

In [5]:
df3 = pd.DataFrame({
    'date': ['6/2/2017', '5/23/2017', '5/20/2017', '6/2/2018', '5/23/2018', '5/20/2018'],
    'data': [1, 2, 3, 4, 5, 6]})
df3['date'] = pd.to_datetime(df3['date'])
dg = df3.groupby(pd.Grouper(key='date', freq='1M')).mean()

print(dg)

            data
date            
2017-05-31   2.5
2017-06-30   1.0
2017-07-31   NaN
2017-08-31   NaN
2017-09-30   NaN
2017-10-31   NaN
2017-11-30   NaN
2017-12-31   NaN
2018-01-31   NaN
2018-02-28   NaN
2018-03-31   NaN
2018-04-30   NaN
2018-05-31   5.5
2018-06-30   4.0


4. Set the date column as the index and slice out a portion of the rows by date.

In [25]:
def tweak_river(df_):
    """Return a copy of ``df_`` indexed by its parsed ``datetime`` column.

    The ``datetime`` column is converted with ``pd.to_datetime``, the columns
    ``144166_00060`` and ``144167_00065`` are renamed to ``cfs`` and
    ``gage_height``, and ``datetime`` becomes the index. The input frame is
    not modified.
    """
    new_names = {'144166_00060': 'cfs',
                 '144167_00065': 'gage_height'}
    parsed_times = pd.to_datetime(df_.datetime)
    with_times = df_.assign(datetime=parsed_times)
    renamed = with_times.rename(columns=new_names)
    return renamed.set_index('datetime')

# Build the datetime-indexed frame, then take the `cfs` column for every row
# from the start of 2018 onward (partial-string slicing on the datetime index).
dd = tweak_river(df)
dd.loc['2018':, 'cfs']

datetime
2018-01-01 00:00:00    92.80
2018-01-01 00:15:00    88.30
2018-01-01 00:30:00    90.50
2018-01-01 00:45:00    90.50
2018-01-01 01:00:00    94.00
                       ...  
2020-09-28 08:30:00     9.53
2020-09-28 08:45:00     9.20
2020-09-28 09:00:00     9.20
2020-09-28 09:15:00     9.20
2020-09-28 09:30:00     9.20
Name: cfs, Length: 95886, dtype: float64