In [1]:
from seaborn import load_dataset
import numpy as np
import pandas as pd
import sqlite3
import sqlalchemy
from IPython.display import display
pd.set_option('max_rows', 9)
pd.set_option('max_columns', 11)

## 型変換

In [2]:
ozone = pd.read_csv('data/ozone.csv', index_col=0)
print('ozone')
display(ozone)
print('出典 : http://archive.ics.uci.edu/ml/datasets/Ozone+Level+Detection')

ozone


Unnamed: 0_level_0,WSR0,WSR1,WSR2,WSR3,WSR4,...,TT,SLP,SLP_,Precp,OzoneDay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1/1/1998,0.8,1.8,2.4,2.1,2.0,...,17.90,10330.0,-55.0,0.00,0.0
1/2/1998,2.8,3.2,3.3,2.7,3.3,...,29.00,10275.0,-55.0,0.00,0.0
1/3/1998,2.9,2.8,2.6,2.1,2.2,...,41.30,10235.0,-40.0,0.00,0.0
1/4/1998,4.7,3.8,3.7,3.8,2.9,...,51.70,10195.0,-40.0,2.08,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12/28/2004,1.0,1.4,1.1,1.7,1.5,...,19.10,10310.0,15.0,0.00,0.0
12/29/2004,0.8,0.8,1.2,0.9,0.4,...,35.20,10275.0,-35.0,0.00,0.0
12/30/2004,1.3,0.9,1.5,1.2,1.6,...,34.20,10245.0,-30.0,0.05,0.0
12/31/2004,1.5,1.3,1.8,1.4,1.2,...,39.35,10220.0,-25.0,0.00,0.0


出典 : http://archive.ics.uci.edu/ml/datasets/Ozone+Level+Detection


一般的な型変換には`pandas.Series.astype`を使用する。

In [3]:
pd.Series.astype??

In [4]:
ozone['OzoneDay'].astype(int)

Date
1/1/1998      0
1/2/1998      0
1/3/1998      0
1/4/1998      0
             ..
12/28/2004    0
12/29/2004    0
12/30/2004    0
12/31/2004    0
Name: OzoneDay, Length: 2536, dtype: int64

### datetime型への変換
---
よく利用する文字列型から datetime 型への変換と datetime 型からの要素抽出を扱う。  
datetime 型への変換には`pandas.to_datetime`を使用する。

In [5]:
pd.to_datetime??

In [6]:
dti = pd.to_datetime(ozone.index)
dti

DatetimeIndex(['1998-01-01', '1998-01-02', '1998-01-03', '1998-01-04',
               '1998-01-05', '1998-01-06', '1998-01-07', '1998-01-08',
               '1998-01-09', '1998-01-10',
               ...
               '2004-12-22', '2004-12-23', '2004-12-24', '2004-12-25',
               '2004-12-26', '2004-12-27', '2004-12-28', '2004-12-29',
               '2004-12-30', '2004-12-31'],
              dtype='datetime64[ns]', name='Date', length=2536, freq=None)

### タイムゾーンの変更
---
ローカル時間への変更は`tz_localize`を使用する。

In [7]:
pd.Timestamp.tz_localize??

In [8]:
dtl = dti.tz_localize('Asia/Tokyo')
dtl

DatetimeIndex(['1998-01-01 00:00:00+09:00', '1998-01-02 00:00:00+09:00',
               '1998-01-03 00:00:00+09:00', '1998-01-04 00:00:00+09:00',
               '1998-01-05 00:00:00+09:00', '1998-01-06 00:00:00+09:00',
               '1998-01-07 00:00:00+09:00', '1998-01-08 00:00:00+09:00',
               '1998-01-09 00:00:00+09:00', '1998-01-10 00:00:00+09:00',
               ...
               '2004-12-22 00:00:00+09:00', '2004-12-23 00:00:00+09:00',
               '2004-12-24 00:00:00+09:00', '2004-12-25 00:00:00+09:00',
               '2004-12-26 00:00:00+09:00', '2004-12-27 00:00:00+09:00',
               '2004-12-28 00:00:00+09:00', '2004-12-29 00:00:00+09:00',
               '2004-12-30 00:00:00+09:00', '2004-12-31 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Tokyo]', name='Date', length=2536, freq=None)

設定されたタイムゾーンを別のタイムゾーンに変更するには`tz_convert`を使用する。

In [9]:
dtl.tz_convert('Etc/GMT')

DatetimeIndex(['1997-12-31 15:00:00+00:00', '1998-01-01 15:00:00+00:00',
               '1998-01-02 15:00:00+00:00', '1998-01-03 15:00:00+00:00',
               '1998-01-04 15:00:00+00:00', '1998-01-05 15:00:00+00:00',
               '1998-01-06 15:00:00+00:00', '1998-01-07 15:00:00+00:00',
               '1998-01-08 15:00:00+00:00', '1998-01-09 15:00:00+00:00',
               ...
               '2004-12-21 15:00:00+00:00', '2004-12-22 15:00:00+00:00',
               '2004-12-23 15:00:00+00:00', '2004-12-24 15:00:00+00:00',
               '2004-12-25 15:00:00+00:00', '2004-12-26 15:00:00+00:00',
               '2004-12-27 15:00:00+00:00', '2004-12-28 15:00:00+00:00',
               '2004-12-29 15:00:00+00:00', '2004-12-30 15:00:00+00:00'],
              dtype='datetime64[ns, Etc/GMT]', name='Date', length=2536, freq=None)

### datetime型からの要素抽出
---
属性を指定して要素を抽出できる。  
`pandas.Series`オブジェクトからは`dt`属性を経由して使用する。例えば、`df['col'].dt.year`のように指定する。

<table class="border text-center">
    <tr>
        <td>年</td>
        <td>year</td>
    </tr>
    <tr>
        <td>月</td>
        <td>month</td>
    </tr>
    <tr>
        <td>日</td>
        <td>day</td>
    </tr>
    <tr>
        <td>時</td>
        <td>hour</td>
    </tr>
    <tr>
        <td>分</td>
        <td>minute</td>
    </tr>
    <tr>
        <td>秒</td>
        <td>second</td>
    </tr>
    <tr>
        <td>四半期</td>
        <td>quarter</td>
    </tr>
    <tr>
        <td>曜日名</td>
        <td>weekday_name</td>
    </tr>
    <tr>
        <td>曜日番号</td>
        <td>weekday</td>
    </tr>
</table>

pandas の datetime 型のメソッド。

In [10]:
print([p for p in dir(pd.Timestamp) if not p.startswith('_')])

['asm8', 'astimezone', 'ceil', 'combine', 'ctime', 'date', 'day', 'day_name', 'dayofweek', 'dayofyear', 'days_in_month', 'daysinmonth', 'dst', 'floor', 'fold', 'freq', 'freqstr', 'fromordinal', 'fromtimestamp', 'hour', 'is_leap_year', 'is_month_end', 'is_month_start', 'is_quarter_end', 'is_quarter_start', 'is_year_end', 'is_year_start', 'isocalendar', 'isoformat', 'isoweekday', 'max', 'microsecond', 'min', 'minute', 'month', 'month_name', 'nanosecond', 'normalize', 'now', 'quarter', 'replace', 'resolution', 'round', 'second', 'strftime', 'strptime', 'time', 'timestamp', 'timetuple', 'timetz', 'to_datetime64', 'to_julian_date', 'to_period', 'to_pydatetime', 'today', 'toordinal', 'tz', 'tz_convert', 'tz_localize', 'tzinfo', 'tzname', 'utcfromtimestamp', 'utcnow', 'utcoffset', 'utctimetuple', 'value', 'week', 'weekday', 'weekday_name', 'weekofyear', 'year']


In [11]:
dti.year

Int64Index([1998, 1998, 1998, 1998, 1998, 1998, 1998, 1998, 1998, 1998,
            ...
            2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004],
           dtype='int64', name='Date', length=2536)

## 置換

In [12]:
usarrests = pd.read_csv('data/USArrests.csv')
usarrests.columns = ['state'] + usarrests.columns.tolist()[1:]
state_code = {v: i + 1 for i, v in usarrests['state'].iteritems()}
print('usarrests')
display(usarrests)
print('state_code')
print(state_code)

usarrests


Unnamed: 0,state,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
...,...,...,...,...,...
46,Washington,4.0,145,73,26.2
47,West Virginia,5.7,81,39,9.3
48,Wisconsin,2.6,53,66,10.8
49,Wyoming,6.8,161,60,15.6


state_code
{'Alabama': 1, 'Alaska': 2, 'Arizona': 3, 'Arkansas': 4, 'California': 5, 'Colorado': 6, 'Connecticut': 7, 'Delaware': 8, 'Florida': 9, 'Georgia': 10, 'Hawaii': 11, 'Idaho': 12, 'Illinois': 13, 'Indiana': 14, 'Iowa': 15, 'Kansas': 16, 'Kentucky': 17, 'Louisiana': 18, 'Maine': 19, 'Maryland': 20, 'Massachusetts': 21, 'Michigan': 22, 'Minnesota': 23, 'Mississippi': 24, 'Missouri': 25, 'Montana': 26, 'Nebraska': 27, 'Nevada': 28, 'New Hampshire': 29, 'New Jersey': 30, 'New Mexico': 31, 'New York': 32, 'North Carolina': 33, 'North Dakota': 34, 'Ohio': 35, 'Oklahoma': 36, 'Oregon': 37, 'Pennsylvania': 38, 'Rhode Island': 39, 'South Carolina': 40, 'South Dakota': 41, 'Tennessee': 42, 'Texas': 43, 'Utah': 44, 'Vermont': 45, 'Virginia': 46, 'Washington': 47, 'West Virginia': 48, 'Wisconsin': 49, 'Wyoming': 50}


### 辞書を使用した置換
---
`pandas.Series.replace`または`pandas.Series.map`を使用する。  
辞書にない値の場合、`replace`では元の値のまま、`map`では欠損値になる。

In [13]:
usarrests['state'].replace(state_code)

0      1
1      2
2      3
3      4
      ..
46    47
47    48
48    49
49    50
Name: state, Length: 50, dtype: int64

In [14]:
usarrests['state'].map(state_code)

0      1
1      2
2      3
3      4
      ..
46    47
47    48
48    49
49    50
Name: state, Length: 50, dtype: int64

### 自己結合
---
`FROM`に続くテーブル名に別の名前をつけることができるので、これを利用して自信のカラムの値をキーにして結合できる。

In [15]:
engine = sqlalchemy.create_engine('sqlite:///data/chinook.db')
pd.read_sql('employees', engine)

Unnamed: 0,EmployeeId,LastName,FirstName,Title,ReportsTo,...,Country,PostalCode,Phone,Fax,Email
0,1,Adams,Andrew,General Manager,,...,Canada,T5K 2N1,+1 (780) 428-9482,+1 (780) 428-3457,andrew@chinookcorp.com
1,2,Edwards,Nancy,Sales Manager,1.0,...,Canada,T2P 2T3,+1 (403) 262-3443,+1 (403) 262-3322,nancy@chinookcorp.com
2,3,Peacock,Jane,Sales Support Agent,2.0,...,Canada,T2P 5M5,+1 (403) 262-3443,+1 (403) 262-6712,jane@chinookcorp.com
3,4,Park,Margaret,Sales Support Agent,2.0,...,Canada,T2P 5G3,+1 (403) 263-4423,+1 (403) 263-4289,margaret@chinookcorp.com
4,5,Johnson,Steve,Sales Support Agent,2.0,...,Canada,T3B 1Y7,1 (780) 836-9987,1 (780) 836-9543,steve@chinookcorp.com
5,6,Mitchell,Michael,IT Manager,1.0,...,Canada,T3B 0C5,+1 (403) 246-9887,+1 (403) 246-9899,michael@chinookcorp.com
6,7,King,Robert,IT Staff,6.0,...,Canada,T1K 5N8,+1 (403) 456-9986,+1 (403) 456-8485,robert@chinookcorp.com
7,8,Callahan,Laura,IT Staff,6.0,...,Canada,T1H 1Y8,+1 (403) 467-3351,+1 (403) 467-8772,laura@chinookcorp.com


In [16]:
print('出典 : http://www.sqlitetutorial.net/sqlite-sample-database/')

出典 : http://www.sqlitetutorial.net/sqlite-sample-database/


In [17]:
pd.read_sql(
    '''SELECT m.firstname || ' ' || m.lastname AS 'Name',
              e.firstname || ' ' || e.lastname AS 'Direct report' 
       FROM employees e
       INNER JOIN employees m ON m.employeeid = e.reportsto
       ORDER BY name
    ''', engine)

Unnamed: 0,Name,Direct report
0,Andrew Adams,Nancy Edwards
1,Andrew Adams,Michael Mitchell
2,Michael Mitchell,Robert King
3,Michael Mitchell,Laura Callahan
4,Nancy Edwards,Jane Peacock
5,Nancy Edwards,Margaret Park
6,Nancy Edwards,Steve Johnson


## 集計

### pandas

In [18]:
iris = load_dataset(
    'iris',
    usecols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
print('iris')
display(iris)

iris


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
...,...,...,...,...
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


#### 基本統計量

In [19]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


#### 合計
---
`pandas.DataFrame.sum`を使用。

In [20]:
iris.sum()

sepal_length    876.5
sepal_width     458.6
petal_length    563.7
petal_width     179.9
dtype: float64

#### レコード数
---
欠損を考慮しない場合はインデックスの長さを取得する。

In [21]:
iris.index.size

150

欠損を考慮する場合は`pandas.DataFrame.count`を使用する。

In [22]:
pd.DataFrame.count??

In [23]:
iris.count()

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
dtype: int64

#### 平均
---
`pandas.DataFrame.mean`を使用する。

In [24]:
pd.DataFrame.mean??

In [25]:
iris.mean()

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

#### 標準偏差
---
`pandas.DataFrame.std`を使用する。既定値は`ddof=1`なので、標本標準偏差。

In [26]:
pd.DataFrame.std??

In [27]:
iris.std()

sepal_length    0.828066
sepal_width     0.435866
petal_length    1.765298
petal_width     0.762238
dtype: float64

#### 最大値
---
`pandas.DataFrame.max`を使用する。

In [28]:
pd.DataFrame.max??

In [29]:
iris.max()

sepal_length    7.9
sepal_width     4.4
petal_length    6.9
petal_width     2.5
dtype: float64

#### 最小値
---
`pandas.DataFrame.min`を使用する。

In [30]:
pd.DataFrame.min??

In [31]:
iris.min()

sepal_length    4.3
sepal_width     2.0
petal_length    1.0
petal_width     0.1
dtype: float64

#### 累積和
---
`pandas.DataFrame.cumsum`を使用する。

In [32]:
pd.DataFrame.cumsum??

In [33]:
usarrests = pd.read_csv('data/USArrests.csv', index_col=0)
print('usarrests')
display(usarrests)

usarrests


Unnamed: 0,Murder,Assault,UrbanPop,Rape
Alabama,13.2,236,58,21.2
Alaska,10.0,263,48,44.5
Arizona,8.1,294,80,31.0
Arkansas,8.8,190,50,19.5
...,...,...,...,...
Washington,4.0,145,73,26.2
West Virginia,5.7,81,39,9.3
Wisconsin,2.6,53,66,10.8
Wyoming,6.8,161,60,15.6


In [34]:
usarrests.cumsum()

Unnamed: 0,Murder,Assault,UrbanPop,Rape
Alabama,13.2,236.0,58.0,21.2
Alaska,23.2,499.0,106.0,65.7
Arizona,31.3,793.0,186.0,96.7
Arkansas,40.1,983.0,236.0,116.2
...,...,...,...,...
Washington,374.3,8243.0,3112.0,1025.9
West Virginia,380.0,8324.0,3151.0,1035.2
Wisconsin,382.6,8377.0,3217.0,1046.0
Wyoming,389.4,8538.0,3277.0,1061.6


値順に並べ替えてから累積和・累積割合を算出したりする。

In [35]:
usarrests['Murder'].sort_values().cumsum() / usarrests['Murder'].sum()

North Dakota     0.002054
New Hampshire    0.007447
Maine            0.012840
Iowa             0.018490
                   ...   
Florida          0.874422
Louisiana        0.913970
Mississippi      0.955316
Georgia          1.000000
Name: Murder, Length: 50, dtype: float64

### SQL

In [36]:
engine = sqlalchemy.create_engine('sqlite:///data/iris.sqlite3')
pd.read_sql('iris', engine)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
...,...,...,...,...,...
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


#### 合計
---
`SUM`関数を使用する。

In [37]:
pd.read_sql(
    '''SELECT SUM(sepal_length),
              SUM(sepal_width),
              SUM(petal_length),
              SUM(petal_width)
       FROM iris''', engine)

Unnamed: 0,SUM(sepal_length),SUM(sepal_width),SUM(petal_length),SUM(petal_width)
0,876.5,458.6,563.7,179.9


このままでは見にくいので、通常は列名を付け直す。

In [38]:
pd.read_sql(
    '''SELECT SUM(sepal_length) AS sep_len_sum,
              SUM(sepal_width) AS sep_wid_sum,
              SUM(petal_length) AS pet_len_sum,
              SUM(petal_width) AS pet_wid_sum
    FROM iris''', engine)

Unnamed: 0,sep_len_sum,sep_wid_sum,pet_len_sum,pet_wid_sum
0,876.5,458.6,563.7,179.9


#### レコード数
---
`COUNT`関数を使用する。欠損は除外される。

In [39]:
pd.read_sql('SELECT COUNT(*) AS count FROM iris', engine)

Unnamed: 0,count
0,150


#### 平均
---
`AVG`関数を使用する。

In [40]:
pd.read_sql(
    '''SELECT AVG(sepal_length) AS sep_len_avg,
              AVG(sepal_width) AS sep_wid_avg,
              AVG(petal_length) AS pet_len_avg,
              AVG(petal_width) AS pet_wid_avg
       FROM iris''',
    engine)

Unnamed: 0,sep_len_avg,sep_wid_avg,pet_len_avg,pet_wid_avg
0,5.843333,3.057333,3.758,1.199333


#### 標準偏差
---
データベースごとに異なる標準偏差を求める関数が実装されていることもあるが、 SQLite には標準で実装されていない。  
分散は以下のようにして求められるので、 Python から得られた結果の平方根を算出すれば標準偏差を求められるが、 [extension-functions.c](https://www.sqlite.org/contrib) をインストールすれば関数が利用可能になるので、通常はそちらを使用する。

In [41]:
query_varp = '''
SELECT AVG(dev.sepal_length * dev.sepal_length) AS sep_len_var,
       AVG(dev.sepal_width * dev.sepal_width) AS sep_wid_var,
       AVG(dev.petal_length * dev.petal_length) AS pet_len_var,
       AVG(dev.petal_width * dev.petal_width) AS pet_wid_var
FROM (SELECT (iris.sepal_length - avg.sepal_length) AS sepal_length,
             (iris.sepal_width - avg.sepal_width) AS sepal_width,
             (iris.petal_length - avg.petal_length) AS petal_length,
             (iris.petal_width - avg.petal_width) AS petal_width
      FROM iris,
           (SELECT AVG(sepal_length) AS sepal_length,
                   AVG(sepal_width) AS sepal_width,
                   AVG(petal_length) AS petal_length,
                   AVG(petal_width) AS petal_width
            FROM iris
           ) AS avg
     ) AS dev
'''
pd.read_sql(query_varp, engine)

Unnamed: 0,sep_len_var,sep_wid_var,pet_len_var,pet_wid_var
0,0.681122,0.188713,3.095503,0.577133


標本分散の場合は以下の通り。

In [42]:
query_var = '''
SELECT SUM(dev.sepal_length * dev.sepal_length) / dof.sepal_length AS sep_len_var,
       SUM(dev.sepal_width * dev.sepal_width) / dof.sepal_width AS sep_wid_var,
       SUM(dev.petal_length * dev.petal_length) / dof.petal_length AS pet_len_var,
       SUM(dev.petal_width * dev.petal_width) / dof.petal_width AS pet_wid_var
FROM (SELECT (iris.sepal_length - avg.sepal_length) AS sepal_length,
             (iris.sepal_width - avg.sepal_width) AS sepal_width,
             (iris.petal_length - avg.petal_length) AS petal_length,
             (iris.petal_width - avg.petal_width) AS petal_width
      FROM iris, 
           (SELECT AVG(sepal_length) AS sepal_length,
                   AVG(sepal_width) AS sepal_width,
                   AVG(petal_length) AS petal_length,
                   AVG(petal_width) AS petal_width
                   FROM iris
            ) AS avg
     ) AS dev,
     (SELECT COUNT(sepal_length) - 1 AS sepal_length,
             COUNT(sepal_width) - 1 AS sepal_width,
             COUNT(petal_length) - 1 AS petal_length,
             COUNT(petal_width) - 1 AS petal_width
      FROM iris
     ) AS dof
'''
pd.read_sql(query_var, engine)

Unnamed: 0,sep_len_var,sep_wid_var,pet_len_var,pet_wid_var
0,0.685694,0.189979,3.116278,0.581006


SQLite でのカスタム関数の定義方法は以下の通り。 (集約関数の場合)

In [43]:
class StandardDeviation(object):
    def __init__(self):
        self.values = []

    def step(self, value):
        self.values.append(value)

    def finalize(self):
        return np.std(self.values)


def sqlite_engine_creator():
    con = sqlite3.connect('data/iris.sqlite3')
    con.create_aggregate("STDEV", 1, StandardDeviation)
    return con


engine_with_stdev = sqlalchemy.create_engine(
    'sqlite://', creator=sqlite_engine_creator)
pd.read_sql('SELECT STDEV(sepal_length) AS sep_len_std FROM iris',
            engine_with_stdev)

Unnamed: 0,sep_len_std
0,0.825301


#### 最大値
---
`MAX`関数を使用する。

In [44]:
pd.read_sql(
    '''SELECT MAX(sepal_length) AS sep_len_max,
              MAX(sepal_width) AS sep_wid_max,
              MAX(petal_length) AS pet_len_max,
              MAX(petal_width) AS pet_wid_max
       FROM iris''', engine)

Unnamed: 0,sep_len_max,sep_wid_max,pet_len_max,pet_wid_max
0,7.9,4.4,6.9,2.5


#### 最小値
---
`MIN`関数を使用する。

In [45]:
pd.read_sql(
    '''SELECT MIN(sepal_length) AS sep_len_max,
              MIN(sepal_width) AS sep_wid_max,
              MIN(petal_length) AS pet_len_max,
              MIN(petal_width) AS pet_wid_max
       FROM iris''', engine)

Unnamed: 0,sep_len_max,sep_wid_max,pet_len_max,pet_wid_max
0,4.3,2.0,1.0,0.1


#### 累積和
---
ウィンドウ関数を使用する。  
SQLite でウィンドウ関数を使用するには**`SQLite>=3.25`が必要**。

`OVER`句で`ORDER BY`を使用して並び替えると先頭から該当行までの集計になる。

In [46]:
sqlite3.sqlite_version

'3.28.0'

In [47]:
engine = sqlalchemy.create_engine('sqlite:///data/usarrests.sqlite3')
pd.read_sql('usarrests', engine)

Unnamed: 0,state,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
...,...,...,...,...,...
46,Washington,4.0,145,73,26.2
47,West Virginia,5.7,81,39,9.3
48,Wisconsin,2.6,53,66,10.8
49,Wyoming,6.8,161,60,15.6


In [48]:
pd.read_sql(
    'SELECT SUM(Murder) OVER (ORDER BY Murder) AS cumsum FROM usarrests',
    engine)

Unnamed: 0,cumsum
0,0.8
1,5.0
2,5.0
3,9.4
...,...
46,355.9
47,355.9
48,372.0
49,389.4


## スケーリング
---
分析にあたって、変数のスケール (大きさ) を揃えることが必要な場合がある。実際に使用するケースは中級編で扱う。

### pandas

#### 正規化 (normalization)
---
変数内の最大値が $1$ 、最小値が $0$ となるようにスケールを変更する。  
変数内に外れ値があるとうまく機能しない。

In [49]:
(iris - iris.min()) / (iris.max() - iris.min())

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
...,...,...,...,...
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667
149,0.444444,0.416667,0.694915,0.708333


#### 標準化 (standardization)
---
変数の平均が $0$ 、標準偏差が $1$ となるようにスケールを変更する。

In [50]:
(iris - iris.mean()) / iris.std(ddof=0)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
...,...,...,...,...
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832
149,0.068662,-0.131979,0.762758,0.790671


### SQL

In [51]:
engine = sqlalchemy.create_engine('sqlite:///data/iris.sqlite3')
pd.read_sql('iris', engine)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
...,...,...,...,...,...
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


#### 正規化

In [52]:
query_norm = '''
SELECT ((iris.sepal_length - min.sepal_length) / (max.sepal_length - min.sepal_length)) AS sep_len_norm,
       ((iris.sepal_width - min.sepal_width) / (max.sepal_width - min.sepal_width)) AS sep_wid_norm,
       ((iris.petal_length - min.petal_length) / (max.petal_length - min.petal_length)) AS pet_len_norm,
       ((iris.petal_width - min.petal_width) / (max.petal_width - min.petal_width)) AS pet_wid_norm
FROM iris,
     (SELECT MIN(sepal_length) AS sepal_length,
             MIN(sepal_width) AS sepal_width,
             MIN(petal_length) AS petal_length,
             MIN(petal_width) AS petal_width
      FROM iris
     ) AS min,
     (SELECT MAX(sepal_length) AS sepal_length,
             MAX(sepal_width) AS sepal_width,
             MAX(petal_length) AS petal_length,
             MAX(petal_width) AS petal_width
      FROM iris
     ) AS max
'''
pd.read_sql(query_norm, engine)

Unnamed: 0,sep_len_norm,sep_wid_norm,pet_len_norm,pet_wid_norm
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
...,...,...,...,...
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667
149,0.444444,0.416667,0.694915,0.708333


#### 標準化

In [53]:
query_dev = '''
SELECT (iris.sepal_length - avg.sepal_length) AS sep_len_dev,
       (iris.sepal_width - avg.sepal_width) AS sep_wid_dev,
       (iris.petal_length - avg.petal_length) AS pet_len_dev,
       (iris.petal_width - avg.petal_width) AS pet_wid_dev
FROM iris,
     (SELECT AVG(sepal_length) AS sepal_length,
             AVG(sepal_width) AS sepal_width,
             AVG(petal_length) AS petal_length,
             AVG(petal_width) AS petal_width
      FROM iris
     ) AS avg
'''
iris_dev = pd.read_sql(query_dev, engine)
iris_stdev = np.sqrt(pd.read_sql(query_varp, engine))
iris_dev.columns = iris_stdev.columns = ('sep_len_std', 'sep_wid_std',
                                         'pet_len_std', 'pet_wid_std')
iris_std = iris_dev / iris_stdev.squeeze()
# iris_std = iris_dev / iris_stdev.iloc[0]
iris_std

Unnamed: 0,sep_len_std,sep_wid_std,pet_len_std,pet_wid_std
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
...,...,...,...,...
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832
149,0.068662,-0.131979,0.762758,0.790671


カスタム関数を利用する場合は以下の通り。

In [54]:
pd.read_sql(
    '''SELECT ((iris.sepal_length - avg.sepal_length) / std.sepal_length) AS sep_len_std,
              ((iris.sepal_width - avg.sepal_width) / std.sepal_width) AS sep_wid_std,
              ((iris.petal_length - avg.petal_length) / std.petal_length) AS pet_len_std,
              ((iris.petal_width - avg.petal_width) / std.petal_width) AS pet_wid_std
       FROM iris,
            (SELECT AVG(sepal_length) AS sepal_length,
                    AVG(sepal_width) AS sepal_width,
                    AVG(petal_length) AS petal_length,
                    AVG(petal_width) AS petal_width
             FROM iris
            ) AS avg,
            (SELECT STDEV(sepal_length) AS sepal_length,
                    STDEV(sepal_width) AS sepal_width,
                    STDEV(petal_length) AS petal_length,
                    STDEV(petal_width) AS petal_width
             FROM iris
            ) AS std
    ''', engine_with_stdev)

Unnamed: 0,sep_len_std,sep_wid_std,pet_len_std,pet_wid_std
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
...,...,...,...,...
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832
149,0.068662,-0.131979,0.762758,0.790671


## カテゴリ別集計

In [55]:
mpg = load_dataset('mpg')
print('mpg')
display(mpg)

mpg


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
...,...,...,...,...,...,...,...,...,...
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


### pandas
---
`pandas.DataFrame.groupby`と集約関数 (`sum`や`mean`のように複数の値を 1 つにまとめる関数) を使用する。

In [56]:
pd.DataFrame.groupby??

In [57]:
print([p for p in dir(pd.core.groupby.groupby.DataFrameGroupBy) if not p.startswith('_')])

['agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']


In [58]:
mpg.groupby('origin').sum()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
europe,1952.4,291,7640.0,5478.0,169631,1175.1,5307
japan,2405.6,324,8114.0,6307.0,175477,1277.6,6118
usa,5000.8,1556,61229.5,29167.0,837121,3743.4,18827


In [59]:
mpg.groupby('origin').mean()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
europe,27.891429,4.157143,109.142857,80.558824,2423.3,16.787143,75.814286
japan,30.450633,4.101266,102.708861,79.835443,2221.227848,16.172152,77.443038
usa,20.083534,6.248996,245.901606,119.04898,3361.931727,15.033735,75.610442


同時に複数の関数を使用したり、 pandas にない関数を使用する場合には、`agg`を利用する。

In [60]:
mpg.groupby('origin').agg(['mean', 'median'])

Unnamed: 0_level_0,mpg,mpg,cylinders,cylinders,displacement,...,weight,acceleration,acceleration,model_year,model_year
Unnamed: 0_level_1,mean,median,mean,median,mean,...,median,mean,median,mean,median
origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
europe,27.891429,26.5,4.157143,4,109.142857,...,2240,16.787143,15.7,75.814286,76
japan,30.450633,31.6,4.101266,4,102.708861,...,2155,16.172152,16.4,77.443038,78
usa,20.083534,18.5,6.248996,6,245.901606,...,3365,15.033735,15.0,75.610442,76


In [61]:
mpg.groupby('origin').agg({'horsepower': ['mean', 'median']})

Unnamed: 0_level_0,horsepower,horsepower
Unnamed: 0_level_1,mean,median
origin,Unnamed: 1_level_2,Unnamed: 2_level_2
europe,80.558824,76.5
japan,79.835443,75.0
usa,119.04898,105.0


In [62]:
def percentile95(series):
    return np.percentile(series.loc[~series.isna()], 95)


mpg.groupby('origin').agg({'horsepower': percentile95})

Unnamed: 0_level_0,horsepower
origin,Unnamed: 1_level_1
europe,115.0
japan,110.6
usa,197.0


### SQL
---
`GROUP BY`句を使用する。

In [63]:
engine = sqlalchemy.create_engine('sqlite:///data/mpg.sqlite3')
pd.read_sql('mpg', engine)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
...,...,...,...,...,...,...,...,...,...
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


In [64]:
pd.read_sql('SELECT origin, COUNT(*) AS count FROM mpg GROUP BY origin',
            engine)

Unnamed: 0,origin,count
0,europe,70
1,japan,79
2,usa,249


In [65]:
pd.read_sql('SELECT origin, AVG(horsepower) AS avg FROM mpg GROUP BY origin',
            engine)

Unnamed: 0,origin,avg
0,europe,80.558824
1,japan,79.835443
2,usa,119.04898


集約関数を用いた条件抽出には`HAVING`句を使用する。

In [66]:
pd.read_sql(
    '''SELECT origin, COUNT(*) AS count
       FROM mpg
       GROUP BY origin
       HAVING count > 75
    ''', engine)

Unnamed: 0,origin,count
0,japan,79
1,usa,249


In [67]:
pd.read_sql(
    '''SELECT origin, AVG(horsepower) AS avg
       FROM mpg
       GROUP BY origin
       HAVING COUNT(*) > 75
    ''', engine)

Unnamed: 0,origin,avg
0,japan,79.835443
1,usa,119.04898


## 集約

### pandas

In [68]:
ozone = pd.read_csv('data/ozone.csv', index_col=0)
print('ozone')
display(ozone)

ozone


Unnamed: 0_level_0,WSR0,WSR1,WSR2,WSR3,WSR4,...,TT,SLP,SLP_,Precp,OzoneDay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1/1/1998,0.8,1.8,2.4,2.1,2.0,...,17.90,10330.0,-55.0,0.00,0.0
1/2/1998,2.8,3.2,3.3,2.7,3.3,...,29.00,10275.0,-55.0,0.00,0.0
1/3/1998,2.9,2.8,2.6,2.1,2.2,...,41.30,10235.0,-40.0,0.00,0.0
1/4/1998,4.7,3.8,3.7,3.8,2.9,...,51.70,10195.0,-40.0,2.08,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12/28/2004,1.0,1.4,1.1,1.7,1.5,...,19.10,10310.0,15.0,0.00,0.0
12/29/2004,0.8,0.8,1.2,0.9,0.4,...,35.20,10275.0,-35.0,0.00,0.0
12/30/2004,1.3,0.9,1.5,1.2,1.6,...,34.20,10245.0,-30.0,0.05,0.0
12/31/2004,1.5,1.3,1.8,1.4,1.2,...,39.35,10220.0,-25.0,0.00,0.0


#### アンサンブル平均
---
同じ条件 (同一時点など) で複数の値を集計した平均。

In [69]:
cols = [
    col for col in ozone.columns if col.startswith('WSR') and '_' not in col
]
ozone[cols].mean(axis=1)

Date
1/1/1998      3.100000
1/2/1998      3.375000
1/3/1998      3.475000
1/4/1998      3.195833
                ...   
12/28/2004    2.641667
12/29/2004    1.916667
12/30/2004    2.137500
12/31/2004    2.533333
Length: 2536, dtype: float64

#### 移動平均
---
一定期間前までの値の平均。値の変化を滑らかにするために使用する。
`pandas.DataFrame.rolling`を使用する。

In [70]:
pd.DataFrame.rolling??

In [71]:
ozone.rolling(3).mean()

Unnamed: 0_level_0,WSR0,WSR1,WSR2,WSR3,WSR4,...,TT,SLP,SLP_,Precp,OzoneDay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1/1/1998,,,,,,...,,,,,
1/2/1998,,,,,,...,,,,,
1/3/1998,2.166667,2.600000,2.766667,2.300000,2.500000,...,29.400000,10280.000000,-50.000000,0.000000e+00,0.0
1/4/1998,3.466667,3.266667,3.200000,2.866667,2.800000,...,40.666667,10235.000000,-45.000000,6.933333e-01,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12/28/2004,0.733333,0.733333,0.633333,0.800000,0.666667,...,22.133333,10278.333333,30.000000,4.473274e-15,0.0
12/29/2004,0.700000,0.866667,0.933333,1.033333,0.700000,...,25.366667,10293.333333,15.000000,4.473274e-15,0.0
12/30/2004,1.033333,1.033333,1.266667,1.266667,1.166667,...,29.500000,10276.666667,-16.666667,1.666667e-02,0.0
12/31/2004,1.200000,1.000000,1.500000,1.166667,1.066667,...,36.250000,10246.666667,-30.000000,1.666667e-02,0.0


時間軸でアンサンブル平均のように値をまとめる場合は、移動平均から等間隔で抽出したり、年と月などから`groupby`で集約したりする。

In [72]:
ozone.rolling(3).mean()[::3]

Unnamed: 0_level_0,WSR0,WSR1,WSR2,WSR3,WSR4,...,TT,SLP,SLP_,Precp,OzoneDay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1/1/1998,,,,,,...,,,,,
1/4/1998,3.466667,3.266667,3.200000,2.866667,2.800000,...,40.666667,10235.000000,-45.000000,6.933333e-01,0.0
1/7/1998,3.133333,2.933333,2.900000,3.000000,2.833333,...,,,,2.200000e+00,0.0
1/10/1998,1.366667,1.666667,1.700000,1.966667,2.000000,...,35.133333,10121.666667,46.666667,0.000000e+00,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12/22/2004,0.800000,0.900000,1.066667,1.066667,1.100000,...,37.883333,10170.000000,-41.666667,3.633333e-01,0.0
12/25/2004,4.133333,4.100000,4.066667,3.533333,3.733333,...,25.916667,10206.666667,40.000000,1.666667e-02,0.0
12/28/2004,0.733333,0.733333,0.633333,0.800000,0.666667,...,22.133333,10278.333333,30.000000,4.473274e-15,0.0
12/31/2004,1.200000,1.000000,1.500000,1.166667,1.066667,...,36.250000,10246.666667,-30.000000,1.666667e-02,0.0


In [73]:
ozone_with_date = ozone.reset_index()
ozone_with_date['Date'] = pd.to_datetime(ozone_with_date['Date'])
ozone_with_date['year'] = ozone_with_date['Date'].dt.year
ozone_with_date['month'] = ozone_with_date['Date'].dt.month
ozone_with_date.groupby(['year', 'month']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,WSR0,WSR1,WSR2,WSR3,WSR4,...,TT,SLP,SLP_,Precp,OzoneDay
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1998,1,1.961290,2.019355,1.983871,1.977419,2.016129,...,38.471667,10163.333333,-5.689655,0.356452,0.000000
1998,2,1.764286,1.789286,1.896429,1.725000,1.742857,...,37.128571,10118.928571,-1.785714,0.530357,0.000000
1998,3,2.609677,2.574194,2.625806,2.500000,2.403226,...,35.398214,10155.535714,-5.000000,0.190645,0.000000
1998,4,1.673333,1.520000,1.760000,1.800000,1.696667,...,34.373214,10137.142857,3.076923,0.102333,0.033333
...,...,...,...,...,...,...,...,...,...,...,...,...
2004,9,0.976667,1.070000,1.040000,0.986667,1.076667,...,40.734483,10133.000000,-0.833333,0.085667,0.066667
2004,10,1.361290,1.335484,1.332258,1.287097,1.270968,...,40.369355,10136.612903,0.000000,0.168387,0.000000
2004,11,1.690000,1.703333,1.973333,1.836667,1.866667,...,34.376667,10176.000000,-0.500000,0.993667,0.000000
2004,12,1.451613,1.332258,1.445161,1.409677,1.516129,...,29.798387,10213.225806,3.064516,0.159677,0.000000


### SQL

In [74]:
engine = sqlalchemy.create_engine('sqlite:///data/ozone.sqlite3')
pd.read_sql('ozone', engine, index_col='Date')

Unnamed: 0_level_0,WSR0,WSR1,WSR2,WSR3,WSR4,...,TT,SLP,SLP_,Precp,OzoneDay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1/1/1998,0.8,1.8,2.4,2.1,2.0,...,17.90,10330.0,-55.0,0.00,0.0
1/2/1998,2.8,3.2,3.3,2.7,3.3,...,29.00,10275.0,-55.0,0.00,0.0
1/3/1998,2.9,2.8,2.6,2.1,2.2,...,41.30,10235.0,-40.0,0.00,0.0
1/4/1998,4.7,3.8,3.7,3.8,2.9,...,51.70,10195.0,-40.0,2.08,0.0
...,...,...,...,...,...,...,...,...,...,...,...
12/28/2004,1.0,1.4,1.1,1.7,1.5,...,19.10,10310.0,15.0,0.00,0.0
12/29/2004,0.8,0.8,1.2,0.9,0.4,...,35.20,10275.0,-35.0,0.00,0.0
12/30/2004,1.3,0.9,1.5,1.2,1.6,...,34.20,10245.0,-30.0,0.05,0.0
12/31/2004,1.5,1.3,1.8,1.4,1.2,...,39.35,10220.0,-25.0,0.00,0.0


#### アンサンブル平均
---
欠損値がない場合は、以下のように列名を列挙して列数で割ればよい。

In [75]:
cols = [
    col for col in ozone.columns if col.startswith('WSR') and '_' not in col
]
query_ensemble = f'''
SELECT Date, (({'+'.join(cols)}) / {len(cols)}) AS ensemble_mean
FROM ozone
'''
pd.read_sql(query_ensemble, engine, index_col='Date')

Unnamed: 0_level_0,ensemble_mean
Date,Unnamed: 1_level_1
1/1/1998,3.100000
1/2/1998,3.375000
1/3/1998,3.475000
1/4/1998,3.195833
...,...
12/28/2004,2.641667
12/29/2004,1.916667
12/30/2004,2.137500
12/31/2004,2.533333


欠損値がある場合には、以下のように欠損でない値の数を数える必要がある。

In [76]:
wsr = pd.read_sql(
    'SELECT Date, ({sum}) AS sum, ({count}) AS count FROM ozone'.format(
        sum='+'.join(cols),
        count='+'.join(
            [f'(CASE {col} WHEN NULL THEN 0 ELSE 1 END)' for col in cols])),
    engine,
    index_col='Date')
wsr.apply(
    lambda r: np.nan if r['count'] == 0 else r['sum'] / r['count'], axis=1)

Date
1/1/1998      3.100000
1/2/1998      3.375000
1/3/1998      3.475000
1/4/1998      3.195833
                ...   
12/28/2004    2.641667
12/29/2004    1.916667
12/30/2004    2.137500
12/31/2004    2.533333
Length: 2536, dtype: float64

#### 移動平均
---
ウィンドウ関数を使用する。

`OVER`句の中で`BETWEEN`や`ROWS`で該当行からの範囲を指定して平均を算出する。

<table>
    <tr>
        <td>$n$ PRECEDING</td>
        <td>$n$ 行前</td>
    </tr>
    <tr>
        <td>$n$ FOLLOWING</td>
        <td>$n$ 行後</td>
    </tr>
</table>

ただし SQL の集約関数の場合、**`NULL`値は無視される**ので注意が必要。

3 件での移動平均は以下の通り。

In [77]:
pd.read_sql(
    '''SELECT Date,
              AVG(WSR0) OVER (ROWS 2 PRECEDING) AS moving_average0,
              AVG(WSR1) OVER (ROWS 2 PRECEDING) AS moving_average1
       FROM ozone
    ''', engine)

Unnamed: 0,Date,moving_average0,moving_average1
0,1/1/1998,0.800000,1.800000
1,1/2/1998,1.800000,2.500000
2,1/3/1998,2.166667,2.600000
3,1/4/1998,3.466667,3.266667
...,...,...,...
2532,12/28/2004,0.733333,0.733333
2533,12/29/2004,0.700000,0.866667
2534,12/30/2004,1.033333,1.033333
2535,12/31/2004,1.200000,1.000000


前後 1 件ずつの範囲での移動平均は以下の通り。

In [78]:
pd.read_sql(
    '''SELECT Date,
              AVG(WSR0) OVER (ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS moving_average0,
              AVG(WSR1) OVER (ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS moving_average1
       FROM ozone
    ''', engine)

Unnamed: 0,Date,moving_average0,moving_average1
0,1/1/1998,1.800000,2.500000
1,1/2/1998,2.166667,2.600000
2,1/3/1998,3.466667,3.266667
3,1/4/1998,3.400000,2.900000
...,...,...,...
2532,12/28/2004,0.700000,0.866667
2533,12/29/2004,1.033333,1.033333
2534,12/30/2004,1.200000,1.000000
2535,12/31/2004,1.400000,1.100000
