## 要約統計量

In [1]:
# Load the iris sample dataset through seaborn.
# `df` is the DataFrame that every later cell in this notebook operates on.
import seaborn as sns
df = sns.load_dataset('iris')

In [2]:
# Preview the first five rows (head() returns 5 rows when no count is given).
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Show the summary statistics of the DataFrame in one table:
# count, mean, std, min, quartiles (25% / 50% / 75%) and max for each numeric column.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
# Mean of every numeric column.
# - numeric_only=True is passed explicitly: letting pandas silently skip the
#   non-numeric `species` column is deprecated (FutureWarning on older versions)
#   and raises TypeError on pandas >= 2.0.
df.mean(numeric_only=True)

  df.mean()


sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [5]:
# `species` holds strings rather than numbers, so remove that column
# (columns=... is the keyword form of labels=..., axis=1) and preview the result.
df.drop(columns=['species']).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Mean of each measurement column, with the string `species` column removed first.
df.drop(columns=['species']).mean()

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [7]:
# Median of each measurement column, with the string `species` column removed first.
df.drop(columns=['species']).median()

sepal_length    5.80
sepal_width     3.00
petal_length    4.35
petal_width     1.30
dtype: float64

In [8]:
# Minimum of every column.
# The string `species` column is included, so the result has dtype object
# and its entry for `species` is a label rather than a number.
df.min()

sepal_length       4.3
sepal_width        2.0
petal_length       1.0
petal_width        0.1
species         setosa
dtype: object

In [9]:
# Maximum of every column.
# The string `species` column is included, so the result has dtype object
# and its entry for `species` is a label rather than a number.
df.max()

sepal_length          7.9
sepal_width           4.4
petal_length          6.9
petal_width           2.5
species         virginica
dtype: object

In [10]:
# Mode (most frequent value) of every column.
# A column can have several modes, so the result is a DataFrame with one row per
# mode; columns with fewer modes are left empty in the remaining rows.
df.mode()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.0,3.0,1.4,0.2,setosa
1,,,1.5,,versicolor
2,,,,,virginica


In [11]:
# Count how often each sepal_length value occurs (value_counts) and
# show only the first five entries of that table (head).
df['sepal_length'].value_counts().head(5)

5.0    10
5.1     9
6.3     9
5.7     8
6.7     8
Name: sepal_length, dtype: int64

In [12]:
# petal_length has two modes: the two most frequent values occur equally often.
df['petal_length'].value_counts().head(5)

1.4    13
1.5    13
5.1     8
4.5     8
1.6     7
Name: petal_length, dtype: int64

In [13]:
# 50th percentile (median) of every numeric column; q defaults to 0.5.
# - numeric_only=True keeps the string `species` column out of the computation:
#   pandas >= 2.0 no longer excludes non-numeric columns by default and raises TypeError.
df.quantile(numeric_only=True)

sepal_length    5.80
sepal_width     3.00
petal_length    4.35
petal_width     1.30
Name: 0.5, dtype: float64

In [14]:
# 25th percentile (first quartile) of every numeric column.
# - numeric_only=True keeps the string `species` column out of the computation:
#   pandas >= 2.0 no longer excludes non-numeric columns by default and raises TypeError.
df.quantile(0.25, numeric_only=True)

sepal_length    5.1
sepal_width     2.8
petal_length    1.6
petal_width     0.3
Name: 0.25, dtype: float64

In [15]:
# First, second and third quartiles of every numeric column (one row per quantile).
# - numeric_only=True keeps the string `species` column out of the computation:
#   pandas >= 2.0 no longer excludes non-numeric columns by default and raises TypeError.
df.quantile([0.25, 0.5, 0.75], numeric_only=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0.25,5.1,2.8,1.6,0.3
0.5,5.8,3.0,4.35,1.3
0.75,6.4,3.3,5.1,1.8


In [16]:
# Summary statistics for a single column (petal_length) in one call.
df['petal_length'].describe()

count    150.000000
mean       3.758000
std        1.765298
min        1.000000
25%        1.600000
50%        4.350000
75%        5.100000
max        6.900000
Name: petal_length, dtype: float64

In [17]:
# Mean of the petal_length column.
df['petal_length'].mean()

3.7580000000000027

In [18]:
# Median of the petal_length column.
df['petal_length'].median()

4.35

In [19]:
# Minimum of the petal_length column.
df['petal_length'].min()

1.0

In [20]:
# Maximum of the petal_length column.
df['petal_length'].max()

6.9

In [21]:
# Mode(s) of the petal_length column; returned as a Series because ties are possible.
df['petal_length'].mode()

0    1.4
1    1.5
Name: petal_length, dtype: float64

In [22]:
# 50th percentile (median) of the petal_length column; q defaults to 0.5.
df['petal_length'].quantile()

4.35

## ばらつき

分散、標準偏差、偏差値、変動係数

In [23]:
# Variance of each measurement column.
# - ddof=0: the sum of squared deviations is divided by N - ddof, i.e. by N itself.
df.drop(columns=['species']).var(ddof=0)

sepal_length    0.681122
sepal_width     0.188713
petal_length    3.095503
petal_width     0.577133
dtype: float64

In [24]:
# Standard deviation of each measurement column.
# - ddof=0: the sum of squared deviations is divided by N - ddof, i.e. by N itself.
df.drop(columns=['species']).std(ddof=0)

sepal_length    0.825301
sepal_width     0.434411
petal_length    1.759404
petal_width     0.759693
dtype: float64

In [25]:
# Coefficient of variation
# - the standard deviation (ddof=0) divided by the absolute value of the mean,
#   computed per column on the frame with the string `species` column removed.
(
    df.drop(columns=['species'])
      .pipe(lambda numeric: numeric.std(ddof=0) / numeric.mean().abs())
)

sepal_length    0.141238
sepal_width     0.142088
petal_length    0.468176
petal_width     0.633429
dtype: float64

<img src='slides/26.png'>

In [26]:
# Deviation score (hensachi) of every petal_length value:
# standardize with the column mean and the ddof=0 standard deviation,
# then rescale so the mean maps to 50 and one standard deviation to 10 points.
pl_mean = df['petal_length'].mean()
pl_std = df['petal_length'].std(ddof=0)
for x in df['petal_length']:
    pl_dev = (x - pl_mean) / pl_std * 10 + 50
    # Print the raw value followed by its score rounded to one decimal place.
    print(f'{x}  {pl_dev:.1f}')

1.4  36.6
1.4  36.6
1.3  36.0
1.5  37.2
1.4  36.6
1.7  38.3
1.4  36.6
1.5  37.2
1.4  36.6
1.5  37.2
1.5  37.2
1.6  37.7
1.4  36.6
1.1  34.9
1.2  35.5
1.5  37.2
1.3  36.0
1.4  36.6
1.7  38.3
1.5  37.2
1.7  38.3
1.5  37.2
1.0  34.3
1.7  38.3
1.9  39.4
1.6  37.7
1.6  37.7
1.5  37.2
1.4  36.6
1.6  37.7
1.6  37.7
1.5  37.2
1.5  37.2
1.4  36.6
1.5  37.2
1.2  35.5
1.3  36.0
1.4  36.6
1.3  36.0
1.5  37.2
1.3  36.0
1.3  36.0
1.3  36.0
1.6  37.7
1.9  39.4
1.4  36.6
1.6  37.7
1.4  36.6
1.5  37.2
1.4  36.6
4.7  55.4
4.5  54.2
4.9  56.5
4.0  51.4
4.6  54.8
4.5  54.2
4.7  55.4
3.3  47.4
4.6  54.8
3.9  50.8
3.5  48.5
4.2  52.5
4.0  51.4
4.7  55.4
3.6  49.1
4.4  53.6
4.5  54.2
4.1  51.9
4.5  54.2
3.9  50.8
4.8  55.9
4.0  51.4
4.9  56.5
4.7  55.4
4.3  53.1
4.4  53.6
4.8  55.9
5.0  57.1
4.5  54.2
3.5  48.5
3.8  50.2
3.7  49.7
3.9  50.8
5.1  57.6
4.5  54.2
4.5  54.2
4.7  55.4
4.4  53.6
4.1  51.9
4.0  51.4
4.4  53.6
4.6  54.8
4.0  51.4
3.3  47.4
4.2  52.5
4.2  52.5
4.2  52.5
4.3  53.1
3.0  45.7
4.1  51.9
