# 推測統計の基本

- 全生徒数 : 400人
- Aさんの点数 : 80点
- ランダムに選んだ20人の平均点 : 70.4点

Aさんの点数は全生徒の平均より上と言えるか？

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%precision 3
%matplotlib inline
from matplotlib import rcParams
# Use a sans-serif family and list Japanese-capable fonts as candidates so the
# Japanese axis labels used in the plots below can be rendered.
# The Windows font family is spelled 'Meiryo'; the previous 'Meirio' entry
# could never match an installed font.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = [
    'Hiragino Maru Gothic Pro',
    'Yu Gothic',
    'Meiryo',
    'Takao',
    'IPAexGothic',
    'IPAPGothic',
    'VL PGothic',
    'Noto Sans CJK JP',
]

In [None]:
# Load the score table and keep the '点数' column as a standalone NumPy array.
df = pd.read_csv('data/ch4_scores400.csv')
scores = df['点数'].to_numpy(copy=True)
# Show the first ten scores as a quick sanity check.
scores[:10]

## 標本の抽出方法

In [None]:
# Random sampling: draw three values from [1, 2, 3] in two different ways.

# With replacement: the same value may be drawn more than once.
print('復元抽出')
print(np.random.choice([1, 2, 3], size=3, replace=True))

# Without replacement: every value is drawn at most once.
print('非復元抽出')
print(np.random.choice([1, 2, 3], size=3, replace=False))

### Aさんの行った無作為抽出の再現

In [None]:
# Fix the global seed so the 20-score draw below is reproducible.
np.random.seed(seed=0)
# Draw 20 scores (np.random.choice samples with replacement by default).
sample = np.random.choice(scores, size=20)
print('標本平均')
np.mean(sample)

In [None]:
# Mean over the full `scores` array, for comparison with the sample mean.
print('母平均')
np.mean(scores)

In [None]:
# Repeat the 20-score draw five times to show how the sample mean varies
# from one random sample to the next.
for i in range(5):
    sample = np.random.choice(scores, size=20)
    print(f'{i+1}回目の無作為抽出で得た標本平均', np.mean(sample))

## 確率モデル

### 確率分布

In [None]:
# Loaded die: the probability of each face is proportional to its value.
dice = list(range(1, 7))  # values the random variable X can take
prob = [face / sum(dice) for face in dice]  # face k comes up with probability k/21

# Roll the loaded die once.
np.random.choice(dice, p=prob)

In [None]:
# Roll the loaded die 100 times and display the outcomes.
num_trial = 100
sample = np.random.choice(dice, size=num_trial, p=prob)
sample

In [None]:
# Count how often each face occurred: six unit-width bins covering 1..6.
freq, _ = np.histogram(sample, bins=6, range=(1, 7))
# Tabulate the counts and the relative frequencies, indexed by face value.
pd.DataFrame(
    data={'度数': freq, '相対度数': freq / num_trial},
    index=pd.Index(np.arange(1, 7), name='出目'),
)

In [None]:
# Compare the relative frequencies in the current `sample` with the true
# probabilities of the loaded die.
fig, ax = plt.subplots(figsize=(10, 6))
# Unit-width bins with density=True make each bar height a relative frequency.
ax.hist(sample, bins=6, range=(1, 7), density=True, rwidth=0.8)
# Overlay the true probability of each face as a horizontal line over its bin.
ax.hlines(prob, np.arange(1, 7), np.arange(1, 7) + 1, colors='gray')
# Place each face's tick at the centre of its bin.
ax.set_xticks(np.arange(1, 7) + 0.5)
ax.set_xticklabels(np.arange(1, 7))
ax.set(xlabel='出目', ylabel='相対度数')
print('100回試行して得た出目の相対度数')
plt.show()

In [None]:
# Repeat the loaded-die experiment with 1000 rolls and compare the observed
# relative frequencies against the true probabilities.
num_trial = 1000
sample = np.random.choice(dice, num_trial, p=prob)

fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
# Unit-width bins with density=True make each bar height the relative
# frequency of that face.
ax.hist(sample, bins=6, range=(1, 7), density=True, rwidth=0.8)
# Overlay the true probability of each face as a horizontal line over its bin.
ax.hlines(prob, np.arange(1, 7), np.arange(2, 8), colors='gray')
# Place each face's tick at the centre of its bin.
ax.set_xticks(np.linspace(1.5, 6.5, 6))
ax.set_xticklabels(np.arange(1, 7))
ax.set_xlabel('出目')
ax.set_ylabel('相対度数')
# Build the caption from num_trial so it cannot drift from the actual count.
print(f'{num_trial}回試行して得た出目の相対度数')
plt.show()

In [None]:
# Repeat the loaded-die experiment with 10000 rolls and compare the observed
# relative frequencies against the true probabilities.
num_trial = 10000
sample = np.random.choice(dice, num_trial, p=prob)

fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
# Unit-width bins with density=True make each bar height the relative
# frequency of that face.
ax.hist(sample, bins=6, range=(1, 7), density=True, rwidth=0.8)
# Overlay the true probability of each face as a horizontal line over its bin.
ax.hlines(prob, np.arange(1, 7), np.arange(2, 8), colors='gray')
# Place each face's tick at the centre of its bin.
ax.set_xticks(np.linspace(1.5, 6.5, 6))
ax.set_xticklabels(np.arange(1, 7))
ax.set_xlabel('出目')
ax.set_ylabel('相対度数')
# Build the caption from num_trial so it cannot drift from the actual count.
print(f'{num_trial}回試行して得た出目の相対度数')
plt.show()

In [None]:
# Repeat the loaded-die experiment with 100000 rolls and compare the observed
# relative frequencies against the true probabilities.
num_trial = 100000
sample = np.random.choice(dice, num_trial, p=prob)

fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
# Unit-width bins with density=True make each bar height the relative
# frequency of that face.
ax.hist(sample, bins=6, range=(1, 7), density=True, rwidth=0.8)
# Overlay the true probability of each face as a horizontal line over its bin.
ax.hlines(prob, np.arange(1, 7), np.arange(2, 8), colors='gray')
# Place each face's tick at the centre of its bin.
ax.set_xticks(np.linspace(1.5, 6.5, 6))
ax.set_xticklabels(np.arange(1, 7))
ax.set_xlabel('出目')
ax.set_ylabel('相対度数')
# Build the caption from num_trial so it cannot drift from the actual count.
print(f'{num_trial}回試行して得た出目の相対度数')
plt.show()

In [None]:
# Histogram of `scores` in one-point bins over 0..100, normalised with
# density=True so bar heights are relative frequencies.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(scores, bins=100, range=(0, 100), density=True)
# Fix the visible window with explicit axis limits.
ax.set_xlim(20, 100)
ax.set_ylim(0, 0.042)
ax.set(xlabel='点数', ylabel='相対度数')
plt.show()

In [None]:
# Draw a single score at random from `scores`.
np.random.choice(scores, size=None)

In [None]:
# Draw 10000 scores (with replacement, the np.random.choice default) and plot
# their histogram with the same bins and axis limits as the histogram of
# `scores` itself.
sample = np.random.choice(scores, size=10000)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sample, bins=100, range=(0, 100), density=True)
ax.set_xlim(20, 100)
ax.set_ylim(0, 0.042)
ax.set(xlabel='点数', ylabel='相対度数')
plt.show()

In [None]:
# Sample means: draw 20 scores 10000 times and record the mean of each draw.
sample_means = [
    np.mean(np.random.choice(scores, size=20))
    for _ in range(10000)
]

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sample_means, bins=100, range=(0, 100), density=True)
# Mark the mean of `scores` with a vertical line.
ax.vlines(scores.mean(), 0, 1, 'gray')
ax.set_xlim(50, 90)
ax.set_ylim(0, 0.13)
ax.set(xlabel='点数', ylabel='相対度数')
plt.show()

In [None]:
# Average of the recorded sample means.
np.asarray(sample_means).mean()