# pandasでデータ連結など

## 内容

- データの読み込み
- データ連結(列方向)
- 欠損値
- データ連結(行方向)

In [1]:
import numpy as np
import pandas as pd

## 5章で保存したデータを読み出し

In [2]:
# Load the April 2017 health DataFrame that was pickled in chapter 5.
# NOTE: only unpickle files from a trusted source; unpickling can execute arbitrary code.
df = pd.read_pickle("data/df_201704health.pickle")

In [3]:
df

Unnamed: 0_level_0,歩数,摂取カロリー,歩数/カロリー,運動指数
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-04-01,5439,2500.0,2.1756,Low
2017-04-02,2510,2300.0,1.091304,Low
2017-04-03,10238,1950.0,5.250256,Mid
2017-04-04,8209,1850.0,4.437297,Mid
2017-04-05,9434,1930.0,4.888083,Mid
2017-04-06,7593,1800.0,4.218333,Mid
2017-04-07,9320,1940.0,4.804124,Mid
2017-04-08,4873,2300.0,2.118696,Low
2017-04-09,12045,1950.0,6.176923,High
2017-04-10,7493,1850.0,4.05027,Mid


In [4]:
# Load the companion pickled frame; its columns (運動_High / 運動_Low / 運動_Mid)
# are 0/1 indicator columns indexed by the same dates.
df_moved = pd.read_pickle("data/df_201704moved.pickle")

In [5]:
df_moved

Unnamed: 0_level_0,運動_High,運動_Low,運動_Mid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-04-01,0,1,0
2017-04-02,0,1,0
2017-04-03,0,0,1
2017-04-04,0,0,1
2017-04-05,0,0,1
2017-04-06,0,0,1
2017-04-07,0,0,1
2017-04-08,0,1,0
2017-04-09,1,0,0
2017-04-10,0,0,1


## データの連結(列方向)

In [6]:
# Column-wise concatenation (axis=1): the two frames are placed side by side,
# with rows aligned on their shared date index.
# NOTE(review): "marged" looks like a typo for "merged"; the name is kept
# because later cells refer to it.
df_marged = pd.concat([df, df_moved], axis=1)

In [7]:
df_marged

Unnamed: 0_level_0,歩数,摂取カロリー,歩数/カロリー,運動指数,運動_High,運動_Low,運動_Mid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-01,5439,2500.0,2.1756,Low,0,1,0
2017-04-02,2510,2300.0,1.091304,Low,0,1,0
2017-04-03,10238,1950.0,5.250256,Mid,0,0,1
2017-04-04,8209,1850.0,4.437297,Mid,0,0,1
2017-04-05,9434,1930.0,4.888083,Mid,0,0,1
2017-04-06,7593,1800.0,4.218333,Mid,0,0,1
2017-04-07,9320,1940.0,4.804124,Mid,0,0,1
2017-04-08,4873,2300.0,2.118696,Low,0,1,0
2017-04-09,12045,1950.0,6.176923,High,1,0,0
2017-04-10,7493,1850.0,4.05027,Mid,0,0,1


In [8]:
# Read the May 2017 CSV, using the 日付 column as the index and parsing it as dates.
# Days without a record come in as NaN.
df_201705 = pd.read_csv("data/201705health.csv", encoding="utf-8", index_col='日付', parse_dates=True)

新たに5月分のデータを読み込み

In [9]:
df_201705

Unnamed: 0_level_0,歩数,摂取カロリー
日付,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-01,1439.0,4500.0
2017-05-02,8120.0,2420.0
2017-05-03,,
2017-05-04,2329.0,1500.0
2017-05-05,,
2017-05-06,3233.0,1800.0
2017-05-07,9593.0,2200.0
2017-05-08,9213.0,1800.0
2017-05-09,5593.0,2500.0


## 欠損値処理

In [10]:
# Drop every row that contains a missing value.
# This returns a new frame; df_201705 itself is left unchanged.
df_201705.dropna()

Unnamed: 0_level_0,歩数,摂取カロリー
日付,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-01,1439.0,4500.0
2017-05-02,8120.0,2420.0
2017-05-04,2329.0,1500.0
2017-05-06,3233.0,1800.0
2017-05-07,9593.0,2200.0
2017-05-08,9213.0,1800.0
2017-05-09,5593.0,2500.0


In [11]:
# Replace missing values with 0.
# This also returns a new frame; df_201705 itself is not modified.
df_201705.fillna(0)

Unnamed: 0_level_0,歩数,摂取カロリー
日付,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-01,1439.0,4500.0
2017-05-02,8120.0,2420.0
2017-05-03,0.0,0.0
2017-05-04,2329.0,1500.0
2017-05-05,0.0,0.0
2017-05-06,3233.0,1800.0
2017-05-07,9593.0,2200.0
2017-05-08,9213.0,1800.0
2017-05-09,5593.0,2500.0


In [12]:
# Forward-fill: each missing value is replaced by the value from the previous row.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in pandas 2.1+.
df_201705_fill = df_201705.ffill()

In [13]:
df_201705_fill

Unnamed: 0_level_0,歩数,摂取カロリー
日付,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-01,1439.0,4500.0
2017-05-02,8120.0,2420.0
2017-05-03,8120.0,2420.0
2017-05-04,2329.0,1500.0
2017-05-05,2329.0,1500.0
2017-05-06,3233.0,1800.0
2017-05-07,9593.0,2200.0
2017-05-08,9213.0,1800.0
2017-05-09,5593.0,2500.0


## データの連結(行方向)

In [14]:
# Row-wise concatenation (axis=0): the May rows are appended below the April rows.
# The result has the union of both frames' columns; columns that exist only in
# the April frame are NaN for the May rows, which is why the integer columns
# are shown as floats in the output.
pd.concat([df_marged, df_201705_fill], axis=0)

Unnamed: 0,摂取カロリー,歩数,歩数/カロリー,運動_High,運動_Low,運動_Mid,運動指数
2017-04-01,2500.0,5439.0,2.1756,0.0,1.0,0.0,Low
2017-04-02,2300.0,2510.0,1.091304,0.0,1.0,0.0,Low
2017-04-03,1950.0,10238.0,5.250256,0.0,0.0,1.0,Mid
2017-04-04,1850.0,8209.0,4.437297,0.0,0.0,1.0,Mid
2017-04-05,1930.0,9434.0,4.888083,0.0,0.0,1.0,Mid
2017-04-06,1800.0,7593.0,4.218333,0.0,0.0,1.0,Mid
2017-04-07,1940.0,9320.0,4.804124,0.0,0.0,1.0,Mid
2017-04-08,2300.0,4873.0,2.118696,0.0,1.0,0.0,Low
2017-04-09,1950.0,12045.0,6.176923,1.0,0.0,0.0,High
2017-04-10,1850.0,7493.0,4.05027,0.0,0.0,1.0,Mid
