<a href="https://colab.research.google.com/github/suyasuyao/Kikagaku-Tyouki-Colab/blob/main/%E9%95%B7%E6%9C%9F%E3%82%B3%E3%83%BC%E3%82%B904_16%E3%83%81%E3%83%BC%E3%83%A0%E5%AD%A6%E7%BF%92.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

アヤメの計測データから三種類のアヤメの品種を分類するモデルを作成していただきます

最も有名なアヤメの品種を分類する問題です。初学者はまずはここからチャレンジしましょう。

本コンペを活用して、SIGNATEでのデータ解析・モデル構築を体験してください。


データ概要

課題種別：分類
データ種別：多変量
学習データサンプル数：75
説明変数の数：4
欠損値：無し

- 流れ
  - データの準備
  - データの前処理
  - データの切り分け（入力データ　目標値）
  - モデルの学習
  - モデルの評価 

In [1]:
## データの準備
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris


In [2]:
df_train = pd.read_table('/content/drive/MyDrive/Colab Notebooks/kikagaku_long/0416_team/train.tsv')
df_test = pd.read_table('/content/drive/MyDrive/Colab Notebooks/kikagaku_long/0416_team/test.tsv')

In [3]:
df_train

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,0,5.3,3.7,1.5,0.2,Iris-setosa
1,1,6.8,2.8,4.8,1.4,Iris-versicolor
2,3,6.1,3.0,4.9,1.8,Iris-virginica
3,4,6.4,3.2,5.3,2.3,Iris-virginica
4,5,6.3,3.3,4.7,1.6,Iris-versicolor
...,...,...,...,...,...,...
70,142,6.4,3.2,4.5,1.5,Iris-versicolor
71,143,5.4,3.9,1.7,0.4,Iris-setosa
72,145,5.6,3.0,4.1,1.3,Iris-versicolor
73,148,5.2,2.7,3.9,1.4,Iris-versicolor


In [4]:
df_test

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm
0,2,6.1,2.8,4.7,1.2
1,7,6.3,2.5,4.9,1.5
2,8,6.2,3.4,5.4,2.3
3,10,6.7,3.1,4.7,1.5
4,13,5.0,3.4,1.6,0.4
...,...,...,...,...,...
70,137,5.8,2.6,4.0,1.2
71,141,5.1,3.5,1.4,0.2
72,144,6.2,2.2,4.5,1.5
73,146,6.5,3.0,5.2,2.0


In [5]:
# データの前処理
df_train =  df_train.drop_duplicates()

df_train

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,0,5.3,3.7,1.5,0.2,Iris-setosa
1,1,6.8,2.8,4.8,1.4,Iris-versicolor
2,3,6.1,3.0,4.9,1.8,Iris-virginica
3,4,6.4,3.2,5.3,2.3,Iris-virginica
4,5,6.3,3.3,4.7,1.6,Iris-versicolor
...,...,...,...,...,...,...
70,142,6.4,3.2,4.5,1.5,Iris-versicolor
71,143,5.4,3.9,1.7,0.4,Iris-setosa
72,145,5.6,3.0,4.1,1.3,Iris-versicolor
73,148,5.2,2.7,3.9,1.4,Iris-versicolor


In [6]:
# データの切り分け（入力データ　目標値）
x_train = df_train.drop('class', axis=1).values
t_train = df_train['class'].values

In [7]:
x_train

array([[0.00e+00, 5.30e+00, 3.70e+00, 1.50e+00, 2.00e-01],
       [1.00e+00, 6.80e+00, 2.80e+00, 4.80e+00, 1.40e+00],
       [3.00e+00, 6.10e+00, 3.00e+00, 4.90e+00, 1.80e+00],
       [4.00e+00, 6.40e+00, 3.20e+00, 5.30e+00, 2.30e+00],
       [5.00e+00, 6.30e+00, 3.30e+00, 4.70e+00, 1.60e+00],
       [6.00e+00, 6.10e+00, 2.80e+00, 4.00e+00, 1.30e+00],
       [9.00e+00, 6.10e+00, 2.90e+00, 4.70e+00, 1.40e+00],
       [1.10e+01, 7.30e+00, 2.90e+00, 6.30e+00, 1.80e+00],
       [1.20e+01, 6.40e+00, 2.80e+00, 5.60e+00, 2.10e+00],
       [1.40e+01, 7.20e+00, 3.20e+00, 6.00e+00, 1.80e+00],
       [1.70e+01, 6.10e+00, 3.00e+00, 4.60e+00, 1.40e+00],
       [1.90e+01, 5.90e+00, 3.00e+00, 5.10e+00, 1.80e+00],
       [2.10e+01, 6.70e+00, 3.00e+00, 5.20e+00, 2.30e+00],
       [2.30e+01, 7.90e+00, 3.80e+00, 6.40e+00, 2.00e+00],
       [2.50e+01, 4.80e+00, 3.00e+00, 1.40e+00, 3.00e-01],
       [2.80e+01, 4.90e+00, 3.60e+00, 1.40e+00, 1.00e-01],
       [2.90e+01, 5.90e+00, 3.00e+00, 4.20e+00, 1.50e+00

In [8]:
t_train

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-

In [9]:
# モデルの構築

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)

In [10]:
# モデルの学習

%%time
model.fit(x_train,t_train)

CPU times: user 134 ms, sys: 0 ns, total: 134 ms
Wall time: 137 ms


RandomForestClassifier(random_state=0)

In [11]:
# モデルの定義
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=0)

In [12]:
# モデルの学習
dtree.fit(x_train, t_train)

DecisionTreeClassifier(random_state=0)

In [13]:
# モデルの検証
print('train score : ', dtree.score(x_train, t_train))
# print('test score : ', dtree.score(x_test, t_test))

train score :  1.0


In [14]:
# データの切り分け（入力データ　目標値）
x_test = df_train.drop('class', axis=1).values

In [15]:
x_test

array([[0.00e+00, 5.30e+00, 3.70e+00, 1.50e+00, 2.00e-01],
       [1.00e+00, 6.80e+00, 2.80e+00, 4.80e+00, 1.40e+00],
       [3.00e+00, 6.10e+00, 3.00e+00, 4.90e+00, 1.80e+00],
       [4.00e+00, 6.40e+00, 3.20e+00, 5.30e+00, 2.30e+00],
       [5.00e+00, 6.30e+00, 3.30e+00, 4.70e+00, 1.60e+00],
       [6.00e+00, 6.10e+00, 2.80e+00, 4.00e+00, 1.30e+00],
       [9.00e+00, 6.10e+00, 2.90e+00, 4.70e+00, 1.40e+00],
       [1.10e+01, 7.30e+00, 2.90e+00, 6.30e+00, 1.80e+00],
       [1.20e+01, 6.40e+00, 2.80e+00, 5.60e+00, 2.10e+00],
       [1.40e+01, 7.20e+00, 3.20e+00, 6.00e+00, 1.80e+00],
       [1.70e+01, 6.10e+00, 3.00e+00, 4.60e+00, 1.40e+00],
       [1.90e+01, 5.90e+00, 3.00e+00, 5.10e+00, 1.80e+00],
       [2.10e+01, 6.70e+00, 3.00e+00, 5.20e+00, 2.30e+00],
       [2.30e+01, 7.90e+00, 3.80e+00, 6.40e+00, 2.00e+00],
       [2.50e+01, 4.80e+00, 3.00e+00, 1.40e+00, 3.00e-01],
       [2.80e+01, 4.90e+00, 3.60e+00, 1.40e+00, 1.00e-01],
       [2.90e+01, 5.90e+00, 3.00e+00, 4.20e+00, 1.50e+00

In [16]:
# 推論
t_test = dtree.predict(x_test)

In [17]:


t_test

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-

In [18]:
type(t_test),type(x_test)

(numpy.ndarray, numpy.ndarray)

In [19]:
x_test.shape,t_test.shape

((75, 5), (75,))

In [26]:
x_test[:,0]

array([  0.,   1.,   3.,   4.,   5.,   6.,   9.,  11.,  12.,  14.,  17.,
        19.,  21.,  23.,  25.,  28.,  29.,  31.,  32.,  34.,  35.,  36.,
        38.,  39.,  41.,  42.,  46.,  47.,  49.,  53.,  55.,  57.,  58.,
        65.,  67.,  68.,  70.,  72.,  74.,  75.,  77.,  79.,  81.,  82.,
        87.,  88.,  96.,  98.,  99., 102., 103., 104., 105., 109., 110.,
       113., 115., 117., 118., 120., 122., 124., 128., 129., 130., 131.,
       136., 138., 139., 140., 142., 143., 145., 148., 149.])

In [28]:
t_test

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-

In [48]:
# まずデータフレームに変換

#id = [1, 2, 3]
#test = ['あ', 'い', 'う']

#pd.DataFrame({'id': id, 'test': test})
# ０列を整数のリストにする
x_test[:,:0].astype(int)
# idとあわせてデータフレームに変換

output = pd.DataFrame({'id': x_test[:,0].astype(int), 'test': t_test})

In [49]:
output

Unnamed: 0,id,test
0,0,Iris-setosa
1,1,Iris-versicolor
2,3,Iris-virginica
3,4,Iris-virginica
4,5,Iris-versicolor
...,...,...
70,142,Iris-versicolor
71,143,Iris-setosa
72,145,Iris-versicolor
73,148,Iris-versicolor


In [51]:
# ヘッダとインデックスなし出力
outpath= '/content/drive/MyDrive/Colab Notebooks/kikagaku_long/0416_team/0417_submit.csv'
output.to_csv(outpath, header=False, index=False)