In [1]:
import numpy as np
import pandas as pd

# 事前準備

## データ

- CSV: 201704health.csv
- HTML: 201704weather.html

## ライブラリインストール

- HTML解析用 (lxml): `$ pip install lxml`
- Excel操作用 (openpyxl): `$ pip install openpyxl`

# CSVファイルの読み込み

- 201704health.csv

## データ

- 日付
- 歩数
- 摂取カロリー



In [2]:
# Load the April 2017 health log; per the notes above it holds the date, step count and calorie intake.
# NOTE(review): this path points at the parent directory, whereas the weather HTML further down
# is read from the working directory — confirm where the data files actually live.
df = pd.read_csv("../201704health.csv")

In [3]:
# Peek at the first rows to check that the CSV was parsed into the expected columns.
df.head()

Unnamed: 0,日付,歩数,摂取カロリー
0,2017-04-01,5439,2500
1,2017-04-02,2510,2300
2,2017-04-03,10238,1950
3,2017-04-04,8209,1850
4,2017-04-05,9434,1930


In [4]:
# Peek at the last rows to check how far into the month the data runs.
df.tail()

Unnamed: 0,日付,歩数,摂取カロリー
25,2017-04-26,7492,1850
26,2017-04-27,7203,1930
27,2017-04-28,7302,1850
28,2017-04-29,6033,2300
29,2017-04-30,4093,1950


In [5]:
# Pull out the step-count column on its own (all rows, one column label -> Series).
df.loc[:, "歩数"]

0      5439
1      2510
2     10238
3      8209
4      9434
5      7593
6      9320
7      4873
8     12045
9      7493
10     7289
11     6481
12    10287
13     8043
14     7435
15     7529
16     8031
17     8475
18     8132
19    15328
20    12849
21     4029
22     3890
23     8093
24     7823
25     7492
26     7203
27     7302
28     6033
29     4093
Name: 歩数, dtype: int64

In [6]:
# Column-wise mean of the numeric columns only.
# The frame also contains the string date column; recent pandas versions raise a
# TypeError when asked to average it, whereas older versions silently skipped it.
# Passing numeric_only=True gives the same result on both.
df.mean(axis=0, numeric_only=True)

歩数        7766.366667
摂取カロリー    2026.666667
dtype: float64

In [7]:
# Preview the date strings parsed into datetime64[ns] values (nothing is stored yet).
# pd.to_datetime is used instead of astype(np.datetime64) because casting to the
# unit-less np.datetime64 dtype is rejected by pandas 2.x.
pd.to_datetime(df['日付'])

0    2017-04-01
1    2017-04-02
2    2017-04-03
3    2017-04-04
4    2017-04-05
5    2017-04-06
6    2017-04-07
7    2017-04-08
8    2017-04-09
9    2017-04-10
10   2017-04-11
11   2017-04-12
12   2017-04-13
13   2017-04-14
14   2017-04-15
15   2017-04-16
16   2017-04-17
17   2017-04-18
18   2017-04-19
19   2017-04-20
20   2017-04-21
21   2017-04-22
22   2017-04-23
23   2017-04-24
24   2017-04-25
25   2017-04-26
26   2017-04-27
27   2017-04-28
28   2017-04-29
29   2017-04-30
Name: 日付, dtype: datetime64[ns]

In [8]:
# Store the parsed dates in a new 'date' column; this column is the key shared with
# the weather data when the two frames are merged later.
# pd.to_datetime replaces astype(np.datetime64), which pandas 2.x rejects because the
# dtype carries no unit.
df['date'] = pd.to_datetime(df['日付'])

In [9]:
# Give the health data its own name. This binds a second name to the same DataFrame
# object (no copy is made); `df` is re-bound to the weather table further down.
df_health = df

In [10]:
# Show a bounded preview instead of dumping the whole frame into the notebook output.
df_health.head(10)

Unnamed: 0,日付,歩数,摂取カロリー,date
0,2017-04-01,5439,2500,2017-04-01
1,2017-04-02,2510,2300,2017-04-02
2,2017-04-03,10238,1950,2017-04-03
3,2017-04-04,8209,1850,2017-04-04
4,2017-04-05,9434,1930,2017-04-05
5,2017-04-06,7593,1800,2017-04-06
6,2017-04-07,9320,1940,2017-04-07
7,2017-04-08,4873,2300,2017-04-08
8,2017-04-09,12045,1950,2017-04-09
9,2017-04-10,7493,1850,2017-04-10


# HTML表の読み込み

- 201704weather.html

以下のサイトから2017年4月の東京の気象データをダウンロード
http://www.data.jma.go.jp/obd/stats/etrn/view/daily_s1.php?prec_no=44&block_no=47662&year=2017&month=4&day=&view=

- 必要な表を選択
- 日にちと気温関係のデータのみにする
- ヘッダーの処理
- 日にちを、日付として扱えるように変換

In [11]:
# Parse the tables of the saved weather page into a list of DataFrames.
# As the displayed result shows, the page's multi-row header is not recognised as a
# header: it ends up as data rows 0-3 and the columns are simply numbered.
tables = pd.read_html("201704weather.html")

In [12]:
# Inspect the raw result of read_html (a list of DataFrames).
tables

[     0        1        2      3      4           5        6      7     8   \
 0     日  気圧(hPa)  降水量(mm)  気温(℃)  湿度(％)  風向・風速(m/s)  日照時間(h)  雪(cm)  天気概況   
 1    現地       海面      NaN    NaN    NaN         NaN      NaN    NaN   NaN   
 2    平均       平均       合計     最大     平均          最高       最低     平均    最小   
 3   1時間     10分間       風速     風向     風速          風向       合計      値   NaN   
 4     1   1010.9   1013.9    5.5    1.5         0.5      6.3    8.9   4.6   
 5     2   1011.0   1014.0     --     --          --      8.7   14.2   4.6   
 6     3   1013.5   1016.4    1.5    1.5         1.0      9.9   16.1   6.4   
 7     4   1021.3   1024.3     --     --          --     11.2   17.3   4.5   
 8     5   1023.3   1026.2     --     --          --     14.7   20.7   7.7   
 9     6   1019.7   1022.6    0.5    0.5         0.5     16.7   21.2  12.9   
 10    7   1012.6   1015.5    3.0    1.5         0.5     17.5   21.6  15.5   
 11    8   1009.5   1012.4    7.0    2.0         1.0     15.3   

In [13]:
# Count how many tables were extracted from the page.
len(tables)

2

In [14]:
# Look at the second table; it appears to contain only site navigation text, not weather data.
tables[1]

Unnamed: 0,0,1,2,3
0,利用される方へ,よくある質問（FAQ）,気象観測統計の解説,年・季節・各月の天候


In [15]:
# Work with the first table, which holds the daily observations.
# Note: this re-binds `df`; the health data remains reachable as df_health.
df = tables[0]

In [16]:
# Preview the raw weather table: rows 0-3 are header text, daily values start at row 4.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,日,気圧(hPa),降水量(mm),気温(℃),湿度(％),風向・風速(m/s),日照時間(h),雪(cm),天気概況,,...,,,,,,,,,,
1,現地,海面,,,,,,,,,...,,,,,,,,,,
2,平均,平均,合計,最大,平均,最高,最低,平均,最小,平均風速,...,最大瞬間風速,降雪,最深積雪,昼(06:00-18:00),夜(18:00-翌日06:00),,,,,
3,1時間,10分間,風速,風向,風速,風向,合計,値,,,...,,,,,,,,,,
4,1,1010.9,1013.9,5.5,1.5,0.5,6.3,8.9,4.6,80,...,2.7,5.1,北北東,9.4,北東,0.0,--,--,雨後一時曇,曇時々晴


In [17]:
# Build the working weather frame from four columns picked by position:
# column 0 and columns 6-8 (given readable names in a later cell).
df_weather = pd.DataFrame(
    data=df.iloc[:, [0, 6, 7, 8]],
)

In [18]:
# Discard the four leading rows (index labels 0-3), which hold header text rather
# than daily values.
df_weather = df_weather.drop(index=[0, 1, 2, 3])

In [19]:
# Check that only daily rows remain; the columns are still labelled by position.
df_weather.head()

Unnamed: 0,0,6,7,8
4,1,6.3,8.9,4.6
5,2,8.7,14.2,4.6
6,3,9.9,16.1,6.4
7,4,11.2,17.3,4.5
8,5,14.7,20.7,7.7


In [20]:
# Give the positional columns readable names: day of month, mean / max / min temperature.
df_weather = df_weather.rename(columns={0: '日', 6: '平均気温', 7: '最高気温', 8: '最低気温'})

# The header rows were parsed as data, so these columns were read as text. Convert the
# temperatures to real numbers so that later statistics and the exported files hold
# numeric values. Anything that is not a plain number becomes NaN (errors='coerce').
# The conversion is a no-op if the columns are already numeric.
df_weather = df_weather.assign(
    **{
        column: pd.to_numeric(df_weather[column], errors='coerce')
        for column in ['平均気温', '最高気温', '最低気温']
    }
)

In [21]:
# Confirm the new column names.
df_weather.head()

Unnamed: 0,日,平均気温,最高気温,最低気温
4,1,6.3,8.9,4.6
5,2,8.7,14.2,4.6
6,3,9.9,16.1,6.4
7,4,11.2,17.3,4.5
8,5,14.7,20.7,7.7


In [22]:
# Sanity check: build the date string for day 1 and confirm pandas parses it into a Timestamp.
pd.to_datetime("2017-4-{}".format(1))

Timestamp('2017-04-01 00:00:00')

In [23]:
# Preview the day-of-month column turned into full dates for April 2017.
# The strings are built for the whole column and parsed in a single vectorized
# pd.to_datetime call (explicit format) instead of one call per row via apply.
pd.to_datetime("2017-4-" + df_weather['日'].astype(str), format="%Y-%m-%d")

4    2017-04-01
5    2017-04-02
6    2017-04-03
7    2017-04-04
8    2017-04-05
9    2017-04-06
10   2017-04-07
11   2017-04-08
12   2017-04-09
13   2017-04-10
14   2017-04-11
15   2017-04-12
16   2017-04-13
17   2017-04-14
18   2017-04-15
19   2017-04-16
20   2017-04-17
21   2017-04-18
22   2017-04-19
23   2017-04-20
24   2017-04-21
25   2017-04-22
26   2017-04-23
27   2017-04-24
28   2017-04-25
29   2017-04-26
30   2017-04-27
31   2017-04-28
32   2017-04-29
33   2017-04-30
Name: 日, dtype: datetime64[ns]

In [24]:
# Add a 'date' column holding the full date for each day of April 2017; it is the key
# shared with the health data. The date strings are built for the whole column and
# parsed in one vectorized pd.to_datetime call (explicit format) rather than row by row.
df_weather['date'] = pd.to_datetime(
    "2017-4-" + df_weather['日'].astype(str),
    format="%Y-%m-%d",
)

In [25]:
# Confirm that the 'date' column was added.
df_weather.head()

Unnamed: 0,日,平均気温,最高気温,最低気温,date
4,1,6.3,8.9,4.6,2017-04-01
5,2,8.7,14.2,4.6,2017-04-02
6,3,9.9,16.1,6.4,2017-04-03
7,4,11.2,17.3,4.5,2017-04-04
8,5,14.7,20.7,7.7,2017-04-05


In [26]:
# Show a bounded preview instead of dumping the whole frame into the notebook output.
df_weather.head(10)

Unnamed: 0,日,平均気温,最高気温,最低気温,date
4,1,6.3,8.9,4.6,2017-04-01
5,2,8.7,14.2,4.6,2017-04-02
6,3,9.9,16.1,6.4,2017-04-03
7,4,11.2,17.3,4.5,2017-04-04
8,5,14.7,20.7,7.7,2017-04-05
9,6,16.7,21.2,12.9,2017-04-06
10,7,17.5,21.6,15.5,2017-04-07
11,8,15.3,17.7,12.6,2017-04-08
12,9,13.8,15.2,11.1,2017-04-09
13,10,10.4,15.7,6.9,2017-04-10


# データフレームの応用

- 日付データを元に同じ行に列を増やす
- 上記で使用した、CSVとHTMLデータを元に一つのデータフレームを作る



In [27]:
# Recap of the left-hand side of the upcoming merge: health data with its 'date' column.
df_health.head()

Unnamed: 0,日付,歩数,摂取カロリー,date
0,2017-04-01,5439,2500,2017-04-01
1,2017-04-02,2510,2300,2017-04-02
2,2017-04-03,10238,1950,2017-04-03
3,2017-04-04,8209,1850,2017-04-04
4,2017-04-05,9434,1930,2017-04-05


In [28]:
# Recap of the right-hand side of the upcoming merge: weather data with its 'date' column.
df_weather.head()

Unnamed: 0,日,平均気温,最高気温,最低気温,date
4,1,6.3,8.9,4.6,2017-04-01
5,2,8.7,14.2,4.6,2017-04-02
6,3,9.9,16.1,6.4,2017-04-03
7,4,11.2,17.3,4.5,2017-04-04
8,5,14.7,20.7,7.7,2017-04-05


In [29]:
# Combine health and weather data into one row per day.
# The join key is spelled out: without `on`, pandas joins on every column name the two
# frames happen to share, so adding or renaming a column would silently change the key.
# 'date' is the only column name shared by the two frames as displayed above.
df_201704 = pd.merge(df_health, df_weather, on='date')

In [30]:
# Show a bounded preview of the merged result instead of dumping the whole frame.
df_201704.head(10)

Unnamed: 0,日付,歩数,摂取カロリー,date,日,平均気温,最高気温,最低気温
0,2017-04-01,5439,2500,2017-04-01,1,6.3,8.9,4.6
1,2017-04-02,2510,2300,2017-04-02,2,8.7,14.2,4.6
2,2017-04-03,10238,1950,2017-04-03,3,9.9,16.1,6.4
3,2017-04-04,8209,1850,2017-04-04,4,11.2,17.3,4.5
4,2017-04-05,9434,1930,2017-04-05,5,14.7,20.7,7.7
5,2017-04-06,7593,1800,2017-04-06,6,16.7,21.2,12.9
6,2017-04-07,9320,1940,2017-04-07,7,17.5,21.6,15.5
7,2017-04-08,4873,2300,2017-04-08,8,15.3,17.7,12.6
8,2017-04-09,12045,1950,2017-04-09,9,13.8,15.2,11.1
9,2017-04-10,7493,1850,2017-04-10,10,10.4,15.7,6.9


In [31]:
# First rows of the merged frame: health columns followed by the weather columns.
df_201704.head()

Unnamed: 0,日付,歩数,摂取カロリー,date,日,平均気温,最高気温,最低気温
0,2017-04-01,5439,2500,2017-04-01,1,6.3,8.9,4.6
1,2017-04-02,2510,2300,2017-04-02,2,8.7,14.2,4.6
2,2017-04-03,10238,1950,2017-04-03,3,9.9,16.1,6.4
3,2017-04-04,8209,1850,2017-04-04,4,11.2,17.3,4.5
4,2017-04-05,9434,1930,2017-04-05,5,14.7,20.7,7.7


In [32]:
# Keep only the columns wanted in the final table, in presentation order.
# Label-based selection raises a KeyError if a name is misspelled or missing, whereas
# pd.DataFrame(frame, columns=[...]) would silently create an all-NaN column.
# .copy() makes the result independent of df_201704.
df_2017_fix = df_201704.loc[
    :, ['date', '歩数', '摂取カロリー', '平均気温', '最高気温', '最低気温']
].copy()

In [33]:
# Show a bounded preview of the final table instead of dumping the whole frame.
df_2017_fix.head(10)

Unnamed: 0,date,歩数,摂取カロリー,平均気温,最高気温,最低気温
0,2017-04-01,5439,2500,6.3,8.9,4.6
1,2017-04-02,2510,2300,8.7,14.2,4.6
2,2017-04-03,10238,1950,9.9,16.1,6.4
3,2017-04-04,8209,1850,11.2,17.3,4.5
4,2017-04-05,9434,1930,14.7,20.7,7.7
5,2017-04-06,7593,1800,16.7,21.2,12.9
6,2017-04-07,9320,1940,17.5,21.6,15.5
7,2017-04-08,4873,2300,15.3,17.7,12.6
8,2017-04-09,12045,1950,13.8,15.2,11.1
9,2017-04-10,7493,1850,10.4,15.7,6.9


# ExcelやCSVにデータを保存

In [34]:
# Write the final table to an Excel workbook (openpyxl is listed in the setup notes for this).
# NOTE(review): to_excel writes the row index as the first column by default; pass
# index=False if that extra column is not wanted by whoever consumes the file.
df_2017_fix.to_excel('運動量と天気201704.xlsx')

In [35]:
# Write the final table to a CSV file.
# NOTE(review): to_csv also writes the row index as an unnamed first column by default;
# pass index=False if consumers should not see it.
df_2017_fix.to_csv('運動量と天気201704.csv')

# Pickle形式でデータフレームを保存

In [36]:
# Save the frame in pickle format; it can be loaded back with pd.read_pickle.
# NOTE(review): the file is a pickle despite its ".db" extension — consider a clearer
# name, and only unpickle files that come from a trusted source.
df_2017_fix.to_pickle("./df_2017.db")