In [106]:
#!/usr/bin/env python
#coding=utf-8

### **資料預處理：**
- 遺漏值：取眾數填補Station (同座標)之遺漏值
- 類別資料：one-hot encoding

### **建模：**
- 隨機森林

In [107]:
# Connect with google drive
try:
    from google.colab import drive
    drive.mount('/content/drive')
except:
    pass

In [108]:
# import package
# 資料處理用
import pandas as pd
import numpy as np

# 數學統計函式
import statistics as stat

# 選擇隨機森林建模
from sklearn.ensemble import RandomForestClassifier # 隨機森林
from sklearn.model_selection import GridSearchCV, train_test_split # splite training data to train, test data for training 
from sklearn import metrics # 看精確度用
from sklearn.model_selection import cross_val_score # 交叉驗證用
from sklearn.metrics import confusion_matrix #混淆矩陣

# 繪圖函式庫
import matplotlib.pyplot as plt

# 繪圖函式庫
import seaborn as sns 

%matplotlib inline

In [109]:
# load data
try:
    train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Game/OceanWast_data/train.csv')
    test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Game/OceanWast_data/test2.csv')
except:
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test2.csv')

In [110]:
train.tail()

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
314,TT02,4,台東縣長濱鄉中成,長光,23.32287,121.46538,14,5,4,3,...,0,1,0,0,0,0,0,0,0,2
315,TT03,1,台東縣長濱鄉,白桑安,23.24933,121.41867,14,5,4,4,...,0,0,0,0,0,0,0,0,0,3
316,TT03,2,台東縣長濱鄉,白桑安/長濱觀景平台,23.24933,121.41867,14,5,4,4,...,0,0,0,0,0,0,0,0,0,2
317,TT03,3,台東縣長濱鄉,白桑安/長濱觀景平台,23.24933,121.41867,14,5,4,4,...,0,0,1,0,0,0,0,0,0,3
318,TT03,4,台東縣長濱鄉,寜埔,23.24933,121.41867,14,5,4,4,...,0,0,0,0,0,0,0,0,0,1


In [111]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 35 columns):
Station                               319 non-null object
Season                                319 non-null int64
County                                319 non-null object
Location                              319 non-null object
Lat                                   319 non-null float64
Lon                                   319 non-null float64
縣市                                    319 non-null int64
海岸段                                   319 non-null int64
Region                                319 non-null int64
Seat                                  319 non-null int64
Shore shape                           319 non-null int64
Substrate type                        319 non-null int64
1暴露岩岸                                 243 non-null float64
2暴露人造結構物                              243 non-null float64
3暴露岩盤                                 243 non-null float64
4沙灘                        

In [112]:
test.tail()

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Plastic bag,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others
158,TT01,4,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,0,0,0,0,0,0,0,0,0,0
159,TT04,1,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,1,1,0,1,0,0,0,0,0,1
160,TT04,2,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,0,0,1,0,0,0,0,0,1
161,TT04,3,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,1,0,1,0,0,0,0,0,0
162,TT04,4,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,1,0,1,0,0,0,0,0,1


In [113]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 34 columns):
Station                               163 non-null object
Season                                163 non-null int64
County                                163 non-null object
Location                              163 non-null object
Lat                                   163 non-null float64
Lon                                   163 non-null float64
縣市                                    163 non-null int64
海岸段                                   163 non-null int64
Region                                163 non-null int64
Seat                                  163 non-null int64
Shore shape                           163 non-null int64
Substrate type                        163 non-null int64
1暴露岩岸                                 124 non-null float64
2暴露人造結構物                              124 non-null float64
3暴露岩盤                                 124 non-null float64
4沙灘                        

### **補遺漏值**
取眾數填補Station (同座標)之遺漏值

In [114]:
# 將train, test 做outer join, 以利補遺漏值。
mix = pd.concat([train, test], axis=0, join='outer' , ignore_index=True, sort = True)

# 重整cloumn順序
mix = mix[['Station', 'Season', 'County', 'Location', 'Lat', 'Lon', '縣市', '海岸段', 'Region', 'Seat', 'Shore shape', 'Substrate type', \
           '1暴露岩岸', '2暴露人造結構物', '3暴露岩盤', '4沙灘', '5砂礫混合灘', '6礫石灘', '7開闊潮間帶', '8遮蔽岩岸', '9遮蔽潮間帶', '10遮蔽濕地', \
           'Plastic bottle container', 'Disposable cup / straw / tableware', 'Plastic bag', 'Foam material', 'Float', \
           'Fishing nets and ropes', 'Fishing equipment', 'Cigarette and lighter', 'Glass jar', 'Metal', 'Paper', 'Others', 'LEVEL']]
mix.head(8)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,1,0,0,0,0,0,2.0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,1,0,0,0,0,0,0,5.0
2,E02,3,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,1,0,0,0,2.0
3,E02,4,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,1.0
4,E03,1,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,0,0,0,0,0,0,0,0,5.0
5,E03,2,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,1,0,0,0,0,0,0,1,5.0
6,E03,3,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,0,1,0,0,0,0,0,0,7.0
7,E03,4,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,0,0,0,0,1,0,0,0,2.0


In [115]:
mix.tail(8)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
474,TT01,1,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,1,0,0,0,0,0,0,0,0,
475,TT01,2,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,0,0,0,0,0,0,0,0,0,
476,TT01,3,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,0,0,1,0,0,0,0,0,0,
477,TT01,4,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,0,0,0,0,0,0,0,0,0,
478,TT04,1,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,1,0,1,0,0,0,0,0,1,
479,TT04,2,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,0,1,0,0,0,0,0,1,
480,TT04,3,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,1,0,1,0,0,0,0,0,0,
481,TT04,4,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,1,0,1,0,0,0,0,0,1,


In [116]:
# 取得station清單
Station_set = set()
for i in mix['Station']:
  Station_set.add(i)

# 地形清單
terrain_list = ['1暴露岩岸', '2暴露人造結構物', '3暴露岩盤', '4沙灘','5砂礫混合灘', '6礫石灘', '7開闊潮間帶', '8遮蔽岩岸', '9遮蔽潮間帶', '10遮蔽濕地'] 

In [117]:
for sat in Station_set:
  for terr in terrain_list:
    try:
      # 計算該Station之眾數
      mode_ = stat.mode(mix.loc[mix.Station == sat, terr])
      # 補該Station之遺漏值
      mix.loc[mix.Station == sat, terr]= mix.loc[mix.Station == sat, terr].fillna(value= mode_)
    except:
      # 若無眾數，則補999
      mix.loc[mix.Station == sat, terr]= mix.loc[mix.Station == sat, terr].fillna(value= 999)

In [118]:
# 找出地形為999的row
mix.loc[(mix['1暴露岩岸'] == 999) | (mix['2暴露人造結構物'] == 999) | (mix['3暴露岩盤'] == 999) | (mix['4沙灘'] == 999) | (mix['5砂礫混合灘'] == 999) \
        | (mix['6礫石灘'] == 999) | (mix['7開闊潮間帶'] == 999) | (mix['8遮蔽岩岸'] == 999) | (mix['9遮蔽潮間帶'] == 999)| (mix['10遮蔽濕地'] == 999)]

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
102,M17,4,台中市清水區,高美溼地,24.31156,120.526,7,2,2,8,...,1,1,0,0,0,0,0,0,0,7.0
349,LI10,4,宜蘭縣,漢本,24.34039,121.77126,16,5,1,3,...,0,0,0,0,0,0,0,0,0,


In [119]:
# 找出 Station == M17 來補遺漏值
mix.loc[mix.Station == 'M17']

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
100,M17,2,台中市清水,高美溼地,24.31156,120.526,7,2,2,8,...,0,0,0,0,0,0,0,0,1,3.0
101,M17,3,台中市清水,高美溼地,24.31156,120.526,7,2,2,8,...,0,1,0,0,0,0,0,0,0,5.0
102,M17,4,台中市清水區,高美溼地,24.31156,120.526,7,2,2,8,...,1,1,0,0,0,0,0,0,0,7.0


In [120]:
# 覺得高美濕地'暴露人造結構物'不會很多，因此補0
mix.loc[102, '2暴露人造結構物'] = 0
# 檢查一下
mix.loc[mix.Station == 'M17']

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
100,M17,2,台中市清水,高美溼地,24.31156,120.526,7,2,2,8,...,0,0,0,0,0,0,0,0,1,3.0
101,M17,3,台中市清水,高美溼地,24.31156,120.526,7,2,2,8,...,0,1,0,0,0,0,0,0,0,5.0
102,M17,4,台中市清水區,高美溼地,24.31156,120.526,7,2,2,8,...,1,1,0,0,0,0,0,0,0,7.0


In [121]:
# 找出 Station == LI10 來補遺漏值
mix.loc[mix.Station == 'LI10']

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
347,LI10,2,宜蘭縣,漢本,24.34039,121.77126,16,5,1,3,...,0,0,1,0,0,0,0,0,0,
348,LI10,3,宜蘭縣,漢本,24.34039,121.77126,16,5,1,3,...,0,1,1,0,0,0,0,0,0,
349,LI10,4,宜蘭縣,漢本,24.34039,121.77126,16,5,1,3,...,0,0,0,0,0,0,0,0,0,


In [122]:
# '暴露人造結構物'補0, '5砂礫混合灘', '6礫石灘' 補1
mix.loc[349, '2暴露人造結構物'] = 0
mix.loc[349, '5砂礫混合灘'] = 1
mix.loc[349, '6礫石灘'] = 1
# 檢查一下
mix.loc[mix.Station == 'LI10']

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Foam material,Float,Fishing nets and ropes,Fishing equipment,Cigarette and lighter,Glass jar,Metal,Paper,Others,LEVEL
347,LI10,2,宜蘭縣,漢本,24.34039,121.77126,16,5,1,3,...,0,0,1,0,0,0,0,0,0,
348,LI10,3,宜蘭縣,漢本,24.34039,121.77126,16,5,1,3,...,0,1,1,0,0,0,0,0,0,
349,LI10,4,宜蘭縣,漢本,24.34039,121.77126,16,5,1,3,...,0,0,0,0,0,0,0,0,0,


### **類別資料：**
1. 類別資料（Season, 海岸段、Region、Seat、Shore shape、Substrate type）編碼並無順序性，須使用one-hot encoding。
2. 類別資料，Station、County，直接使用one-hot encoding。

In [123]:
# 先處理Station之one-hot encoding
oneHot = pd.get_dummies(mix['Station'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,SW20,SW21,SW22,SW23,SW24,SW25,TT01,TT02,TT03,TT04
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,0


In [124]:
# 處理County之one-hot encoding
oneHot = pd.get_dummies(mix['County'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,雲林縣四湖鄉,雲林縣麥寮鄉,高雄小港區,高雄市,高雄彌陀區,高雄旗津區,高雄林園,高雄梓官區,高雄永安區,高雄鼓山區
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,0


In [125]:
# 處理Season之one-hot encoding
oneHot = pd.get_dummies(mix['Season'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,高雄彌陀區,高雄旗津區,高雄林園,高雄梓官區,高雄永安區,高雄鼓山區,1,2,3,4
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,1,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,1,0,0


In [126]:
# 修改column 名
mix = mix.rename(columns = {1:'Season1', 2:'Season2', 3:'Season3', 4:'Season4'})
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,高雄彌陀區,高雄旗津區,高雄林園,高雄梓官區,高雄永安區,高雄鼓山區,Season1,Season2,Season3,Season4
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,1,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,1,0,0


In [127]:
# 處理縣市之one-hot encoding
oneHot = pd.get_dummies(mix['縣市'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,7,8,9,10,11,12,13,14,15,16
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,1
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,1


In [128]:
# 修改column 名
mix = mix.rename(columns = {1:'縣市1',    2:'縣市2',   3:'縣市3',   4:'縣市4',
                            5:'縣市5',    6:'縣市6',   7:'縣市7',   8:'縣市8',
                            9:'縣市9',   10:'縣市10', 11:'縣市11', 12:'縣市12',
                            13:'縣市13', 14:'縣市14', 15:'縣市15', 16:'縣市16'})
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,縣市7,縣市8,縣市9,縣市10,縣市11,縣市12,縣市13,縣市14,縣市15,縣市16
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,1
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,0,0,0,0,1


In [129]:
# 處理海岸段之one-hot encoding
oneHot = pd.get_dummies(mix['海岸段'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,縣市12,縣市13,縣市14,縣市15,縣市16,1,2,3,4,5
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,0,0,0,0,1
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,0,0,0,0,1


In [130]:
# 修改column 名
mix = mix.rename(columns = {1:'海岸段1', 2:'海岸段2', 3:'海岸段3', 4:'海岸段4', 5:'海岸段5'})
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,縣市12,縣市13,縣市14,縣市15,縣市16,海岸段1,海岸段2,海岸段3,海岸段4,海岸段5
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,0,0,0,0,1
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,0,0,0,0,1


In [131]:
# 處理Region之one-hot encoding
oneHot = pd.get_dummies(mix['Region'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,海岸段1,海岸段2,海岸段3,海岸段4,海岸段5,1,2,3,4,5
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,1,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,1,0,0,0,0


In [132]:
# 修改column 名
mix = mix.rename(columns = {1:'Region1', 2:'Region2', 3:'Region3', 4:'Region4', 5:'Region5'})
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,海岸段1,海岸段2,海岸段3,海岸段4,海岸段5,Region1,Region2,Region3,Region4,Region5
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,1,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,1,1,0,0,0,0


In [133]:
# 處理Seat之one-hot encoding
oneHot = pd.get_dummies(mix['Seat'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Region4,Region5,1,2,3,4,5,6,7,8
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0


In [134]:
# 修改column 名
mix = mix.rename(columns = {1:'Seat1', 2:'Seat2', 3:'Seat3', 4:'Seat4', 5:'Seat5', 
                            6:'Seat6', 7:'Seat7', 8:'Seat8'})
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Region4,Region5,Seat1,Seat2,Seat3,Seat4,Seat5,Seat6,Seat7,Seat8
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0


In [135]:
# 處理Shore shape之one-hot encoding
oneHot = pd.get_dummies(mix['Shore shape'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Seat2,Seat3,Seat4,Seat5,Seat6,Seat7,Seat8,1,2,3
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,1,0,0,0,0,0,1,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,1,0,0,0,0,0,1,0


In [136]:
# 修改column 名
mix = mix.rename(columns = {1:'Shore shape1', 2:'Shore shape2', 3:'Shore shape3'})
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Seat2,Seat3,Seat4,Seat5,Seat6,Seat7,Seat8,Shore shape1,Shore shape2,Shore shape3
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,1,0,0,0,0,0,1,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,0,0,1,0,0,0,0,0,1,0


In [137]:
# 處理Substrate type之one-hot encoding
oneHot = pd.get_dummies(mix['Substrate type'])

# mix 與 oneHot 合併
mix = pd.concat([mix, oneHot], axis=1)
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Shore shape2,Shore shape3,1,2,3,4,5,6,7,8
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0


In [138]:
# 修改column 名
mix = mix.rename(columns = {1:'Substrate type1', 2:'Substrate type2', 3:'Substrate type3', 
                            4:'Substrate type4', 5:'Substrate type5', 6:'Substrate type6', 
                            7:'Substrate type7', 8:'Substrate type8'})
mix.head(2)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Shore shape2,Shore shape3,Substrate type1,Substrate type2,Substrate type3,Substrate type4,Substrate type5,Substrate type6,Substrate type7,Substrate type8
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0


In [139]:
# 存個檔
file = 'mix_v1.1.csv'
try:
    mix.to_csv('/content/drive/My Drive/Colab Notebooks/Game/OceanWast_data/' + file)
except:
     mix.to_csv(file)

In [140]:
mix.head(8)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Shore shape2,Shore shape3,Substrate type1,Substrate type2,Substrate type3,Substrate type4,Substrate type5,Substrate type6,Substrate type7,Substrate type8
0,E02,1,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0
1,E02,2,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0
2,E02,3,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0
3,E02,4,宜蘭縣,大溪,24.92528,121.88569,16,5,1,4,...,1,0,0,0,1,0,0,0,0,0
4,E03,1,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0
5,E03,2,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0
6,E03,3,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0
7,E03,4,宜蘭縣,頭城,24.8573,121.83342,16,5,1,4,...,0,0,0,0,0,1,0,0,0,0


In [141]:
mix.tail(8)

Unnamed: 0,Station,Season,County,Location,Lat,Lon,縣市,海岸段,Region,Seat,...,Shore shape2,Shore shape3,Substrate type1,Substrate type2,Substrate type3,Substrate type4,Substrate type5,Substrate type6,Substrate type7,Substrate type8
474,TT01,1,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,1,0,0,0,0,0,1,0,0,0
475,TT01,2,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,1,0,0,0,0,0,1,0,0,0
476,TT01,3,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,1,0,0,0,0,0,1,0,0,0
477,TT01,4,台東縣長濱鄉樟原村,八仙北,23.40975,121.48345,14,5,4,3,...,1,0,0,0,0,0,1,0,0,0
478,TT04,1,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,0,0,0,0,1,0,0,0,0
479,TT04,2,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,0,0,0,0,1,0,0,0,0
480,TT04,3,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,0,0,0,0,1,0,0,0,0
481,TT04,4,台東縣長濱鄉,石雨傘,23.17844,121.40177,14,5,4,2,...,0,0,0,0,0,1,0,0,0,0


In [142]:
# 將train and test 分開
drop_ = ['Station', 'Season', 'County', 'Location', '縣市', '海岸段', 'Region', 
         'Seat', 'Shore shape', 'Substrate type'] #不想放進建模的因子
mix = mix.drop(drop_, axis = 1)
mix.columns.values.tolist() # 看資料的column有哪些

['Lat',
 'Lon',
 '1暴露岩岸',
 '2暴露人造結構物',
 '3暴露岩盤',
 '4沙灘',
 '5砂礫混合灘',
 '6礫石灘',
 '7開闊潮間帶',
 '8遮蔽岩岸',
 '9遮蔽潮間帶',
 '10遮蔽濕地',
 'Plastic bottle container',
 'Disposable cup / straw / tableware',
 'Plastic bag',
 'Foam material',
 'Float',
 'Fishing nets and ropes',
 'Fishing equipment',
 'Cigarette and lighter',
 'Glass jar',
 'Metal',
 'Paper',
 'Others',
 'LEVEL',
 'E01',
 'E02',
 'E03',
 'E04',
 'E05',
 'E06',
 'E07',
 'E08',
 'E09',
 'HL01',
 'HL02',
 'HL03',
 'HL04',
 'HL05',
 'HL06',
 'HL07',
 'HL08',
 'HL09',
 'HL10',
 'HL11',
 'HL12',
 'LI10',
 'M01',
 'M02',
 'M03',
 'M04',
 'M05',
 'M06',
 'M07',
 'M08',
 'M09',
 'M10',
 'M11',
 'M12',
 'M13',
 'M14',
 'M15',
 'M16',
 'M17',
 'M18',
 'M19',
 'M20',
 'M21',
 'M22',
 'M23',
 'M24',
 'N01',
 'N02',
 'N03',
 'N04',
 'N05',
 'N06',
 'N07',
 'N08',
 'N09',
 'N10',
 'N11',
 'N12',
 'N13',
 'N14',
 'N15',
 'N16',
 'N17',
 'N18',
 'N19',
 'N20',
 'N21',
 'N22',
 'N23',
 'N24',
 'SE03',
 'SE04',
 'SE05',
 'SE06',
 'SE07',
 'SE08',
 'SE09',
 '

In [143]:
train = mix.iloc[:319]
test = mix.iloc[319:].reset_index(drop=True)

In [144]:
train.tail(8)

Unnamed: 0,Lat,Lon,1暴露岩岸,2暴露人造結構物,3暴露岩盤,4沙灘,5砂礫混合灘,6礫石灘,7開闊潮間帶,8遮蔽岩岸,...,Shore shape2,Shore shape3,Substrate type1,Substrate type2,Substrate type3,Substrate type4,Substrate type5,Substrate type6,Substrate type7,Substrate type8
311,23.32287,121.46538,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
312,23.32287,121.46538,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
313,23.32287,121.46538,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
314,23.32287,121.46538,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
315,23.24933,121.41867,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0
316,23.24933,121.41867,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0
317,23.24933,121.41867,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0
318,23.24933,121.41867,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0


In [145]:
test.tail(8)

Unnamed: 0,Lat,Lon,1暴露岩岸,2暴露人造結構物,3暴露岩盤,4沙灘,5砂礫混合灘,6礫石灘,7開闊潮間帶,8遮蔽岩岸,...,Shore shape2,Shore shape3,Substrate type1,Substrate type2,Substrate type3,Substrate type4,Substrate type5,Substrate type6,Substrate type7,Substrate type8
155,23.40975,121.48345,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
156,23.40975,121.48345,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
157,23.40975,121.48345,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
158,23.40975,121.48345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
159,23.17844,121.40177,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
160,23.17844,121.40177,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
161,23.17844,121.40177,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
162,23.17844,121.40177,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


#### **檢查各因子相關性**
因為個因子都為分類，所以不檢查個因子相關性。

#### **建模**

##### 1. 先隨意建立一個模型，找出哪些因子重要性較高

In [146]:
# 設定建模的x,y

x = train.drop('LEVEL', axis = 1)          
y = train['LEVEL'] # 建模的y

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=56) #8, 2分，隨機種子為56，確保每次隨機可得到相同的結果

In [147]:
X_train.head(5) # 確認隨機抽的情況，發現為不規則抽。

Unnamed: 0,Lat,Lon,1暴露岩岸,2暴露人造結構物,3暴露岩盤,4沙灘,5砂礫混合灘,6礫石灘,7開闊潮間帶,8遮蔽岩岸,...,Shore shape2,Shore shape3,Substrate type1,Substrate type2,Substrate type3,Substrate type4,Substrate type5,Substrate type6,Substrate type7,Substrate type8
135,25.05134,121.08195,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
168,25.13981,121.80207,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
20,24.4595,121.81998,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
75,23.75403,120.17959,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
23,24.4595,121.81998,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [148]:
# 隨意建模，參數幾乎皆為預設
clf = RandomForestClassifier()
clf = RandomForestClassifier(n_estimators = 100, oob_score=True, random_state=56)
clf = clf.fit(X_train, y_train)
print(clf.oob_score_)

0.27058823529411763


**隨機森林參數：**

n_estimators: 樹數
- version 0.20: 預設10顆。
- version 0.22: 預設100顆 。

min_samples_split:
- 某節點的樣本數少於min_samples_split，則不會繼續再嘗試選擇最優特徵來進行劃分。

oob_score: bool (default=False)
- Whether to use out-of-bag samples to estimate the generalization accuracy. #利用out-of-bag samples (袋外樣本)評估泛化精確度。

- if True, 即採用袋外樣本來評估模型的好壞。
- 多單個模型的參數訓練中，通常用cross validation（cv）來進行交叉驗證，但是特別消耗時間，因此可使用這類的數據對決策樹模型進行驗證，算是一個簡單的交叉驗證。性能消耗小，但是效果不錯。

random_state: 類似隨機種子，可每次切分資料的結果都相同。

**資料料來源：**
1. https://www.itread01.com/content/1549571767.html (各參數說明)
2. https://www.cnblogs.com/pinard/p/6160412.html
3. https://zhuanlan.zhihu.com/p/31322257

In [149]:
# 找出各參數重要性
x_factor = X_train.columns.values.tolist()
importance = []

for i in range(len(clf.feature_importances_)): # 將因子與重要性配對
  importance.append([x_factor[i], clf.feature_importances_[i]])

importance.sort(key=lambda x:x[1], reverse=True) # 將重要性依大至小排列

print('各參數重要性') # 將各因子印出
for i in importance:
  print(i)

各參數重要性
['Lon', 0.051361291113373904]
['Lat', 0.042990906947493725]
['Season2', 0.038528833946049904]
['Season1', 0.03786540040662681]
['Season4', 0.036699475952553975]
['Season3', 0.029707063116580463]
['Plastic bottle container', 0.029636897723925717]
['Foam material', 0.02747223040136645]
['Fishing nets and ropes', 0.02663832033107695]
['Float', 0.021835601513174333]
['Disposable cup / straw / tableware', 0.01755867437228665]
['4沙灘', 0.01753777207243437]
['Others', 0.016738384831377788]
['2暴露人造結構物', 0.01601249470245443]
['Glass jar', 0.015662769326917677]
['5砂礫混合灘', 0.01472252996792244]
['6礫石灘', 0.013163979969551506]
['Substrate type4', 0.012654653977598422]
['海岸段5', 0.012015972705939182]
['Seat3', 0.011999243528205266]
['Shore shape1', 0.011877478178257886]
['Shore shape2', 0.011422298289448688]
['Region4', 0.009403734794186325]
['Seat8', 0.009329059170459295]
['Substrate type6', 0.009225571146933228]
['Plastic bag', 0.009172784574101128]
['Seat7', 0.009159084149456678]
['Substrate 

得知'Lon'重要性最高，'Lat'次等，再來為'Season2'。

##### **2. 開始認真建模，找出噪聲因子**

用隨意建模中，最重要的兩個因子來建模，並以此為基準。再依重要性增加因子建模，若準確度或分數下降則表示該因子為噪聲，或是模型參數已經調過頭導致了overfitting，此時該做的就是回到最簡單的Base Model，一項一項特徵慢慢地加入。

參考來源：https://medium.com/@yulongtsai/https-medium-com-yulongtsai-titanic-top3-8e64741cc11f

In [150]:
# 設定建模的x,y，排除不想要放入模型的變數，如座標，可用測站編號代替。
x_factor = ['Lon', 'Lat', 'Season2']
           
x = train[x_factor]       
y = train['LEVEL'] # 建模的y

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=56) #8, 2分，隨機種子為56，確保每次隨機可得到相同的結果

# 建模
clf = RandomForestClassifier()
clf = RandomForestClassifier(n_estimators = 300, oob_score=True, random_state=56)
clf = clf.fit(X_train, y_train)
print('oob_score_:', clf.oob_score_)

oob_score_: 0.23529411764705882


以oob_score_: 0.2325為基準，依序將因子依序加入，若oob_score_下降則將該因子排除。

In [151]:
# 得依重要性排序的變數
x_factor = []
for i in range(len(importance)):
    x_factor.append(importance[i][0])
x_factor

['Lon',
 'Lat',
 'Season2',
 'Season1',
 'Season4',
 'Season3',
 'Plastic bottle container',
 'Foam material',
 'Fishing nets and ropes',
 'Float',
 'Disposable cup / straw / tableware',
 '4沙灘',
 'Others',
 '2暴露人造結構物',
 'Glass jar',
 '5砂礫混合灘',
 '6礫石灘',
 'Substrate type4',
 '海岸段5',
 'Seat3',
 'Shore shape1',
 'Shore shape2',
 'Region4',
 'Seat8',
 'Substrate type6',
 'Plastic bag',
 'Seat7',
 'Substrate type5',
 '3暴露岩盤',
 'Seat4',
 '縣市14',
 'Seat2',
 '1暴露岩岸',
 '7開闊潮間帶',
 'Region3',
 'Metal',
 '海岸段3',
 '雲林縣口湖鄉',
 '海岸段4',
 'Region1',
 'Substrate type2',
 'Region5',
 '海岸段1',
 '海岸段2',
 'Region2',
 '縣市3',
 '縣市13',
 '縣市10',
 'SW19',
 '8遮蔽岩岸',
 '花蓮縣',
 '縣市11',
 '縣市15',
 'M04',
 '宜蘭縣',
 'Seat6',
 'M01',
 'SW25',
 '縣市8',
 'Seat1',
 '縣市16',
 '新北',
 'N16',
 'Shore shape3',
 'Substrate type1',
 'Seat5',
 '縣市9',
 '縣市7',
 '縣市12',
 'Substrate type8',
 '嘉義縣東石鄉',
 'SE13',
 'Fishing equipment',
 'N14',
 '屏東',
 '台南市',
 'Paper',
 '嘉義東石',
 '高雄鼓山區',
 'M11',
 'Substrate type3',
 '雲林縣麥寮鄉',
 '縣市1',
 '新北市貢寮區',
 

In [152]:
# 以迴圈方式找出會增加oob_score_的變數

x_x = [] # 將欲放進model的變數
x_notin = [] # 沒被放入model的變數
pre_oob_score_ = 0 # 先將前一次的袋外分數設為0


for i in x_factor:
    
    x_x.append(i) # 變數
    x = train[x_x]          
    y = train['LEVEL'] # 建模的y

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=56) #8, 2分，隨機種子為56，確保每次隨機可得到相同的結果
    
    # 建模
    clf = RandomForestClassifier(n_estimators = 300, oob_score=True, random_state=56)
    clf = clf.fit(X_train, y_train)
    oob_score_ = clf.oob_score_
    
    if oob_score_ > pre_oob_score_:
        pre_oob_score_ = oob_score_
        continue
        
    elif oob_score_ < pre_oob_score_:
        x_notin.append([x_x[-1],pre_oob_score_, "->", oob_score_]) # 將讓oob_score_下降的變數放入該list
        pre_oob_score_ = pre_oob_score_ # pre_oob_score_ 維持原狀
        x_x = x_x[:-1] # 將讓oob_score_下降的變數排除

# 得最終結果的oob_score
x = train[x_x]          
y = train['LEVEL'] # 建模的y

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=56) #8, 2分，隨機種子為56，確保每次隨機可得到相同的結果

# 建模
clf = RandomForestClassifier(n_estimators = 300, oob_score=True, random_state=56)
clf = clf.fit(X_train, y_train)
oob_score_ = clf.oob_score_

print("讓oob_score_下降的變數及下降分數：")
for i in x_notin: #印出讓oob_score_下降的變數及下降分數
    print(i)
    
print("-"*50)
print("讓oob_score_上升的變數及最後的oob_score_：")
print(x_x, oob_score_) # 印出會讓oob_score_上升的變數及最後的oob_score_

讓oob_score_下降的變數及下降分數：
['Season2', 0.28627450980392155, '->', 0.23529411764705882]
['Season1', 0.28627450980392155, '->', 0.26666666666666666]
['Season4', 0.28627450980392155, '->', 0.25882352941176473]
['Season3', 0.28627450980392155, '->', 0.20784313725490197]
['Plastic bottle container', 0.28627450980392155, '->', 0.24705882352941178]
['Foam material', 0.28627450980392155, '->', 0.24313725490196078]
['Fishing nets and ropes', 0.28627450980392155, '->', 0.25098039215686274]
['Float', 0.28627450980392155, '->', 0.24705882352941178]
['4沙灘', 0.2901960784313726, '->', 0.23921568627450981]
['Others', 0.2901960784313726, '->', 0.28627450980392155]
['2暴露人造結構物', 0.2901960784313726, '->', 0.27450980392156865]
['Glass jar', 0.2901960784313726, '->', 0.28627450980392155]
['5砂礫混合灘', 0.2901960784313726, '->', 0.27450980392156865]
['6礫石灘', 0.2901960784313726, '->', 0.27058823529411763]
['Substrate type4', 0.2901960784313726, '->', 0.2784313725490196]
['海岸段5', 0.2901960784313726, '->', 0.2862745098

## 建模嘍

GridSearchCV 參數：

iid：
- If True, return the average score across folds, weighted by the number of samples in each test set. In this case, the data is assumed to be identically distributed across the folds, and the loss minimized is the total loss per sample, and not the mean loss across the folds. If False, return the average score across folds. Default is True, but will change to False in version 0.22, to correspond to the standard definition of cross-validation.
- If Ture, 回傳的每折疊之平均分數，會因樣本數的差異而進行加權。If false, 僅回傳的每折疊之平均分數，並不會因樣本數的差異而進行加權。這個參數預設從0.22版的True改為False in version 0.24.

In [153]:
# 利用GridSearchCV找出最適合的參數
x = train[x_x]          
y = train['LEVEL'] # 建模的y

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=56) #8, 2分，隨機種子為56，確保每次隨機可得到相同的結果

p = {
    'n_estimators': range(100,500, 10),
    'max_depth': range(5, 100)
}

grid = GridSearchCV(clf, p, cv = 7, iid = False)
grid.fit(X_train, y_train)
grid.best_params_
max_depth= dict(grid .best_params_)['max_depth']
n_estimators = dict(grid.best_params_)['n_estimators']
print("'max_depth':{0}, \n'n_estimators':{1}".format(max_depth, n_estimators))



'max_depth':15, 
'n_estimators':160


In [165]:
# 建模
clf = RandomForestClassifier()
clf = RandomForestClassifier(max_depth = 15, n_estimators = 160, oob_score=True, random_state=56)
clf = clf.fit(X_train, y_train)
print('oob_score_:', clf.oob_score_)

oob_score_: 0.3137254901960784


In [166]:
# 用X_test 預測 y_pred，之後再跟y_test比對精確度(正確率)
y_pred=clf.predict(X_test)
y_pred

array([3., 9., 6., 1., 1., 4., 5., 1., 6., 6., 7., 5., 7., 2., 5., 6., 6.,
       6., 4., 4., 6., 2., 5., 2., 2., 8., 6., 6., 6., 5., 6., 5., 5., 3.,
       7., 5., 6., 7., 2., 2., 3., 8., 4., 6., 9., 7., 3., 2., 9., 2., 2.,
       8., 4., 2., 6., 7., 4., 6., 7., 6., 7., 5., 4., 4.])

In [167]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.359375


In [169]:
score = cross_val_score(clf, X_test, y_test, cv = 5) #cv = 5, 分5組
print('十次分數：', score)
print('平均分數：',score.mean())



十次分數： [0.38461538 0.30769231 0.23076923 0.38461538 0.16666667]
平均分數： 0.2948717948717949


In [170]:
# 混淆矩陣，看哪些分錯
# '列' or 左邊為正確標籤，'行' or '上面'為預測
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,0,0,1,0,0,0,0,0,0
1,2,3,1,1,0,1,0,0,0,0
2,1,2,0,1,0,0,0,0,0,0
3,0,3,1,1,0,1,1,0,0,0
4,0,2,0,3,5,3,0,0,0,0
5,0,0,2,1,2,5,1,0,0,0
6,0,0,0,0,1,3,6,1,0,0
7,0,0,0,0,1,2,0,2,0,0
8,0,0,0,0,0,1,0,0,1,0
9,0,0,0,0,0,0,0,0,2,0


In [171]:
test

Unnamed: 0,Lat,Lon,1暴露岩岸,2暴露人造結構物,3暴露岩盤,4沙灘,5砂礫混合灘,6礫石灘,7開闊潮間帶,8遮蔽岩岸,...,Shore shape2,Shore shape3,Substrate type1,Substrate type2,Substrate type3,Substrate type4,Substrate type5,Substrate type6,Substrate type7,Substrate type8
0,24.97876,121.94786,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,24.97876,121.94786,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,24.97876,121.94786,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,24.97876,121.94786,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,24.78206,121.81792,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,23.40975,121.48345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
159,23.17844,121.40177,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
160,23.17844,121.40177,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
161,23.17844,121.40177,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


In [172]:
# 真正來預測嘍
y_pred = clf.predict(test[x_x])
y_pred

array([ 5.,  5.,  5.,  5.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  4.,  4.,
        4.,  4.,  2.,  2.,  2.,  8.,  8.,  8.,  8.,  6.,  6.,  6.,  6.,
        4.,  4.,  4.,  4.,  6.,  6.,  6.,  6.,  7.,  8.,  7.,  8.,  7.,
        7.,  7.,  7.,  2.,  7.,  7.,  7.,  6.,  6.,  6.,  2.,  2.,  2.,
        2.,  2.,  9.,  9.,  9.,  9.,  8.,  8.,  8.,  8.,  2.,  9.,  9.,
        9.,  5.,  5.,  5.,  5.,  9.,  9.,  9.,  9., 10., 10., 10., 10.,
        8.,  8.,  8.,  8.,  2.,  2.,  2.,  2.,  1.,  2.,  1.,  2.,  2.,
        2.,  2.,  2.,  3.,  3.,  3.,  3.,  5.,  5.,  5.,  5.,  7.,  7.,
        7.,  7.,  5.,  5.,  5.,  5.,  6.,  6.,  6.,  6.,  7.,  6.,  7.,
        7.,  7.,  3.,  3.,  7.,  7.,  7.,  3.,  3.,  7.,  7.,  7.,  7.,
        2.,  2.,  2.,  2.,  9.,  9.,  9.,  9.,  7.,  7.,  7.,  7.,  2.,
        2.,  2.,  2.,  3.,  3.,  3.,  3.])

In [177]:
submission = pd.read_csv('submission/submission.csv')
submission['LEVEL'] = y_pred

# 存檔
file = 'submission_v1.2.csv'
try:
    submission.to_csv('/content/drive/My Drive/Colab Notebooks/Game/OceanWast_data/' + file)
except:
    submission.to_csv('submission/'+file, index=False)
    
submission

Unnamed: 0,ID,LEVEL
0,E01_1,5.0
1,E01_2,5.0
2,E01_3,5.0
3,E01_4,5.0
4,E04_1,6.0
...,...,...
158,TT01_4,2.0
159,TT04_1,3.0
160,TT04_2,3.0
161,TT04_3,3.0


## **提交**
得0.558分，第14/24名。
<img src="Aidea_海洋廢棄物_v1.2_排名191208.png" width="80%">

之後針對遺漏值填補進行修改。