In [1]:
import numpy as np

In [2]:
# numpyの配列処理速度
%timeit np.arange(10000)+1

113 µs ± 10.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [3]:
# pythonのリスト処理速度
%timeit [i+1 for i in range(10000)]

5.24 ms ± 463 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
a=np.array([1,2,3,4])
b=2

In [5]:
a+b

array([3, 4, 5, 6])

In [6]:
a**2

array([ 1,  4,  9, 16], dtype=int32)

In [7]:
#　arange メソッドで０から８までの連番の配列を生成
#　reshape　メソッドで３行３列の２次元配列に変更
a=np.arange(9).reshape(3,3)

In [8]:
a

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [9]:
b=np.arange(2,7,2)

In [10]:
a+b

array([[ 2,  5,  8],
       [ 5,  8, 11],
       [ 8, 11, 14]])

In [11]:
a+b

array([[ 2,  5,  8],
       [ 5,  8, 11],
       [ 8, 11, 14]])

In [12]:
np.savetxt('data.txt',a)

In [13]:
!ls data.txt

data.txt


In [14]:
np.loadtxt('data.txt',dtype=np.uint8)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]], dtype=uint8)

In [15]:
np.save('data',a)

In [16]:
np.load('data.npy')

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [17]:
np.savez('data2',x=a,y=b)

In [18]:
c=np.load('data2.npz')

In [19]:
c['x']

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [20]:
c['y']

array([2, 4, 6])

In [21]:
# 関数操作
# precision で　表示有効桁数を2桁に設定
# edgeitemsで表示列を先頭末尾それぞれ2桁に設定
# suppressで指数を非表示に設定

np.set_printoptions(precision=2,edgeitems=2,suppress=True)

In [22]:
np.random.seed(0)

In [23]:
a=np.random.randint(0,100,9)

In [24]:
a

array([44, 47, 64, 67, 67,  9, 83, 21, 36])

In [25]:
a.max()

83

In [26]:
np.random.choice(a,3)

array([21, 83, 36])

In [27]:
b=a.reshape(3,3)

In [28]:
b

array([[44, 47, 64],
       [67, 67,  9],
       [83, 21, 36]])

In [29]:
# ｂの逆行列を計算
np.linalg.inv(b)

array([[-0.01,  0.  ,  0.02],
       [ 0.01,  0.02, -0.02],
       [ 0.02, -0.01,  0.  ]])

In [30]:
#　scipyでＱＲ分解
from scipy import random, linalg, allclose

In [31]:
# ランダムな値をとる3行2列を生成
a=random.rand(3,2)

In [32]:
a


array([[0.96, 0.38],
       [0.79, 0.53],
       [0.57, 0.93]])

In [33]:
q,r=linalg.qr(a)

In [34]:
q

array([[-0.7 ,  0.48,  0.52],
       [-0.58,  0.04, -0.82],
       [-0.41, -0.88,  0.25]])

In [35]:
r

array([[-1.37, -0.96],
       [ 0.  , -0.6 ],
       [ 0.  ,  0.  ]])

In [36]:
q.shape,r.shape

((3, 3), (3, 2))

In [37]:
allclose(a,np.dot(q,r))

True

In [38]:
import pandas as pd

In [39]:
# 表示桁数を2桁に設定
pd.set_option('precision',2)

In [40]:
# 事前にabalonデータをダウンロードする。
# abalonデータの読み込み
# namesは列名、usecolは読み込む列番号の設定
df=pd.read_csv('abalone.data', names=('性別','殻長','殻幅','高さ','重さ','年輪'), usecols=[0,1,2,3,4,8])

In [41]:
df

Unnamed: 0,性別,殻長,殻幅,高さ,重さ,年輪
0,M,0.46,0.36,0.10,0.51,15
1,M,0.35,0.27,0.09,0.23,7
2,F,0.53,0.42,0.14,0.68,9
3,M,0.44,0.36,0.12,0.52,10
4,I,0.33,0.26,0.08,0.20,7
5,I,0.42,0.30,0.10,0.35,8
6,F,0.53,0.41,0.15,0.78,20
7,F,0.55,0.42,0.12,0.77,16
8,M,0.47,0.37,0.12,0.51,9
9,F,0.55,0.44,0.15,0.89,19


In [42]:
df.dtypes


性別     object
殻長    float64
殻幅    float64
高さ    float64
重さ    float64
年輪      int64
dtype: object

In [43]:
df['年齢']=df['年輪']+1.5

In [44]:
df.head()

Unnamed: 0,性別,殻長,殻幅,高さ,重さ,年輪,年齢
0,M,0.46,0.36,0.1,0.51,15,16.5
1,M,0.35,0.27,0.09,0.23,7,8.5
2,F,0.53,0.42,0.14,0.68,9,10.5
3,M,0.44,0.36,0.12,0.52,10,11.5
4,I,0.33,0.26,0.08,0.2,7,8.5


In [45]:
df2=df.loc[:,['殻長','殻幅','高さ']]

In [46]:
df2.describe()

Unnamed: 0,殻長,殻幅,高さ
count,4177.0,4177.0,4177.0
mean,0.52,0.41,0.14
std,0.12,0.1,0.04
min,0.07,0.06,0.0
25%,0.45,0.35,0.12
50%,0.55,0.42,0.14
75%,0.61,0.48,0.17
max,0.81,0.65,1.13


In [47]:
df_sex=pd.get_dummies(df['性別'],prefix='性別')

In [48]:
df_sex.head()

Unnamed: 0,性別_F,性別_I,性別_M
0,0,0,1
1,0,0,1
2,1,0,0
3,0,0,1
4,0,1,0


In [56]:
# df_sex と dfをjoin、不必要な列を削除
# axsis=1は行ではなく、列であることを示す。
train_data=df_sex.join(df).drop(['性別','年輪'],axis=1)

In [57]:
train_data.head()

Unnamed: 0,性別_F,性別_I,性別_M,殻長,殻幅,高さ,重さ,年齢
0,0,0,1,0.46,0.36,0.1,0.51,16.5
1,0,0,1,0.35,0.27,0.09,0.23,8.5
2,1,0,0,0.53,0.42,0.14,0.68,10.5
3,0,0,1,0.44,0.36,0.12,0.52,11.5
4,0,1,0,0.33,0.26,0.08,0.2,8.5


In [58]:
# x_train と　y_train　の分離
# ilocは行と列をしていてデータを抽出するメソッド
# x_trainはすべての行の0から6列目
#y_trainはすべての行の7列目
x_train=train_data.iloc[:,:7]
y_train=train_data.iloc[:,7]

In [59]:
x_train.head()

Unnamed: 0,性別_F,性別_I,性別_M,殻長,殻幅,高さ,重さ
0,0,0,1,0.46,0.36,0.1,0.51
1,0,0,1,0.35,0.27,0.09,0.23
2,1,0,0,0.53,0.42,0.14,0.68
3,0,0,1,0.44,0.36,0.12,0.52
4,0,1,0,0.33,0.26,0.08,0.2


In [60]:
y_train.head()

0    16.5
1     8.5
2    10.5
3    11.5
4     8.5
Name: 年齢, dtype: float64

In [61]:
# 回帰モデルの学習と推定
# ランダムフォレストによる学習と推定
from sklearn.ensemble import RandomForestRegressor

In [64]:
# モデルの定義
model=RandomForestRegressor()

In [65]:
# 訓練データを使ってモデルを構築
model.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [66]:
# 訓練データに対する推定　★評価などの詳細は割愛　→　別紙で勉強か？
prediction=model.predict(x_train)

In [67]:
# 学習済みモデルの評価　MSI（平均２乗誤差）
# MSIの計算
from sklearn.metrics import mean_squared_error

In [68]:
mean_squared_error(y_train,prediction)

1.317376298779028

In [69]:
# モデルとデータの永続化
# モデルとデータの保存
from sklearn.externals import joblib

In [70]:
joblib.dump(model,'abalone_randomforest.pkl')

['abalone_randomforest.pkl']

In [72]:
# データの保存　
y_train_prediction=np.array([y_train,prediction])
np.save('y_train_prediction',y_train_prediction)