In [2]:
import numpy as np
import pandas as pd

# Show floats in DataFrame/Series output with thousands separators and 2 decimals.
pd.options.display.float_format = '{:,.2f}'.format

def image(name, **kwargs):
    """Return an IPython ``Image`` for a file under the ``assets/`` directory.

    Parameters
    ----------
    name : str
        File name relative to ``assets/``.
    **kwargs
        Forwarded unchanged to ``IPython.display.Image``. ``retina`` defaults
        to ``True`` when the caller does not supply it.

    Returns
    -------
    IPython.display.Image
        The display object built from ``assets/<name>``.
    """
    # Local import: Image is only needed by this helper.
    from IPython.display import Image

    # Use setdefault rather than a hard-coded keyword: with `retina=True`
    # written next to `**kwargs`, a caller passing retina=... explicitly got
    # "TypeError: got multiple values for keyword argument 'retina'".
    kwargs.setdefault('retina', True)
    return Image(filename=f'assets/{name}', **kwargs)

In [6]:
# Names for every column of the fixed-width file, in file order.
cols = [
    'year', 'month', 'day', 'hour',
    'air_temp', 'dew_point', 'pressure',
    'wind_dir', 'wind_speed', 'sky_code',
    'precip1', 'precip6',
]

# Per-column dtypes. Capital-I 'Int32' is pandas' nullable integer dtype, so
# values matched by `na_values` below can be stored as <NA>.
dtypes = dict(
    year='int16',
    month='int8',
    day='int8',
    hour='int8',
    air_temp='Int32',
    dew_point='Int32',
    pressure='float64',
    wind_dir='Int32',
    wind_speed='float64',
    sky_code='Int32',
    precip1='Int32',
    precip6='Int32',
)

# Subset of columns actually kept in the resulting frame.
usecols = [
    'year', 'month', 'day', 'hour',
    'air_temp', 'dew_point', 'wind_speed',
]

dat = (
    pd.read_fwf(
        'data/726505-04845-2009.txt',
        names=cols,
        header=None,
        usecols=usecols,
        dtype=dtypes,
        na_values='-9999',
    )
    # Keep only months 1 through 6.
    .query("month <= 6")
    # Temperatures are stored as Celsius * 10; rescale to Celsius.
    # wind_speed is NOT rescaled and stays in the file's raw units
    # (NOTE(review): presumably meters/sec * 10 -- confirm before using it).
    .assign(
        air_temp=lambda frame: frame['air_temp'].div(10.),
        dew_point=lambda frame: frame['dew_point'].div(10.),
    )
)

In [11]:
# Sanity check: list the distinct years present in the loaded frame.
dat['year'].unique()

array([2009], dtype=int16)

In [12]:
# Inspect the column dtypes of the loaded frame (after the /10 rescaling of
# air_temp and dew_point done at load time).
dat.dtypes

year            int16
month            int8
day              int8
hour             int8
air_temp      Float64
dew_point     Float64
wind_speed    float64
dtype: object

In [13]:
# Preview the first five rows of the loaded frame.
dat.head(n=5)

Unnamed: 0,year,month,day,hour,air_temp,dew_point,wind_speed
0,2009,1,1,1,-13.3,-16.7,15.0
1,2009,1,1,2,-13.3,-16.1,26.0
2,2009,1,1,3,-12.2,-15.6,0.0
3,2009,1,1,4,-11.7,-15.0,0.0
4,2009,1,1,5,-11.1,-15.0,15.0


In [15]:
# Split the frame into one group per calendar month; `grp` is reused by the
# cells that follow.
grp = dat.groupby(by='month')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021C7FA77440>

In [18]:
# Iterating over the GroupBy unpacks each item into a (group key, sub-frame)
# pair. Only the first pair is wanted, so the loop stops after one pass;
# `month` and `month_df` remain bound at notebook scope afterwards.
for month, month_df in grp:
    print(f'Month: {month}')
    break

Month: 1


In [19]:
# Preview the first five rows of the sub-frame captured from the first group.
month_df.head(n=5)

Unnamed: 0,year,month,day,hour,air_temp,dew_point,wind_speed
0,2009,1,1,1,-13.3,-16.7,15.0
1,2009,1,1,2,-13.3,-16.1,26.0
2,2009,1,1,3,-12.2,-15.6,0.0
3,2009,1,1,4,-11.7,-15.0,0.0
4,2009,1,1,5,-11.1,-15.0,15.0


In [22]:
# Group names: `grp.groups` maps each group key (here, the month number) to the
# row labels belonging to that group, so its keys are the group names.
grp.groups.keys()

dict_keys([1, 2, 3, 4, 5, 6])

In [30]:
# Row labels of the first group: take the first value of the
# group -> labels mapping and show only its first five entries.
list(grp.groups.values())[0][:5]

Index([0, 1, 2, 3, 4], dtype='int64')

In [37]:
# Extract a specific group by its key (month 1) and show its first three rows.
grp.get_group(1).head(3)

Unnamed: 0,year,month,day,hour,air_temp,dew_point,wind_speed
0,2009,1,1,1,-13.3,-16.7,15.0
1,2009,1,1,2,-13.3,-16.1,26.0
2,2009,1,1,3,-12.2,-15.6,0.0


In [38]:
# Per-month mean of every non-grouping column. This also averages the
# date-part columns (year, day, hour), which are unlikely to be meaningful
# as means; select specific columns first when only measurements are wanted.
grp.mean()

Unnamed: 0_level_0,year,day,hour,air_temp,dew_point,wind_speed
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2009.0,16.02,11.52,-10.01,-14.11,42.85
2,2009.0,14.5,11.5,-2.95,-7.25,47.96
3,2009.0,16.0,11.5,2.1,-3.42,44.69
4,2009.0,15.5,11.5,7.03,0.27,49.48
5,2009.0,16.0,11.5,13.98,6.24,42.6
6,2009.0,15.49,11.51,18.06,12.25,32.73


In [43]:
# Aggregate a single column: per-month mean air temperature, returned as a
# Series indexed by month.
grp.air_temp.mean()

month
1   -10.01
2    -2.95
3     2.10
4     7.03
5    13.98
6    18.06
Name: air_temp, dtype: Float64