# 제조 공정 내 가스 혼합물의 개별 가스 성분 분류

## 01. 데이터 소개 및 분석 프로세스 수립

### 데이터 수집  
데이터 소스: https://archive.ics.uci.edu/static/public/270/gas+sensor+array+drift+dataset+at+different+concentrations.zip

< 데이터 >  

|GAS|1-128|  
|--|--|  
|혼합가스종류|금속 산화물 가스 센서 계측값들|

### 데이터 전처리
- Target Feature 생성
- 상관성 분석

### 모델 생성  
- PCA 적용
- Random Forest, AdaBoost, Bagging with KNN, Decision Tree, Voting Ensemble

### 모델 평가 및 의사결정
- Feature Selection
- 재모델링

matplotlib 한글 깨짐 현상 해결
> 셀 실행 후 런타임 재시작 필요

In [1]:
# Fix broken Korean glyphs in matplotlib figures.
# Install the Nanum font packages with apt (shell commands run through the
# notebook's `!` escape).
! apt-get update -qq
! apt-get install fonts-nanum* -qq

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Font file to register and the family name it will be registered under.
# NOTE(review): the file is the "Eco" variant but is registered under the name
# 'NanumGothic' -- confirm this pairing is intentional.
path = '/usr/share/fonts/truetype/nanum/NanumGothicEco.ttf'
font_name = 'NanumGothic'
# Build a font entry for the file and put it at the front of matplotlib's
# TrueType font list.
fe = fm.FontEntry(fname=path, name=font_name)
fm.fontManager.ttflist.insert(0, fe)
# Make the registered family (and a base size of 10) the global plot default.
plt.rcParams.update({'font.size': 10, 'font.family': font_name})

Selecting previously unselected package fonts-nanum.
(Reading database ... 121749 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Selecting previously unselected package fonts-nanum-coding.
Preparing to unpack .../fonts-nanum-coding_2.5-3_all.deb ...
Unpacking fonts-nanum-coding (2.5-3) ...
Selecting previously unselected package fonts-nanum-eco.
Preparing to unpack .../fonts-nanum-eco_1.000-7_all.deb ...
Unpacking fonts-nanum-eco (1.000-7) ...
Selecting previously unselected package fonts-nanum-extra.
Preparing to unpack .../fonts-nanum-extra_20200506-1_all.deb ...
Unpacking fonts-nanum-extra (20200506-1) ...
Setting up fonts-nanum-extra (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Setting up fonts-nanum-coding (2.5-3) ...
Setting up fonts-nanum-eco (1.000-7) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...


## 02. 데이터 준비를 위한 EDA 및 전처리

### 0. 데이터 불러오기

In [2]:
########## 필요한 파이썬 라이브러리 불러오기 ##########
import os
import time
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
%matplotlib inline

from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [8]:
# Directory that contains the dataset's *.dat files.
path = "/content/drive/MyDrive/MLStudy/Data_Anaylsis_And_Machine_Learning/11_gas_sensor_array_drift_at_different_concentrations/dataset"
# glob gives no ordering guarantee (it depends on the file system), so sort the
# matches to keep the order in which files are later concatenated reproducible.
all_files = sorted(glob.glob(os.path.join(path, "*.dat")))

In [36]:
# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error when the glob matched nothing.
if not all_files:
    raise FileNotFoundError("No .dat files were found; check the dataset path.")

# Read each file as whitespace-separated tokens; the first token of every line
# becomes the index. A raw string is used so "\s" is not an invalid escape.
df_from_each_file = (pd.read_csv(f, sep=r"\s+", index_col=0, header=None) for f in all_files)
df = pd.concat(df_from_each_file, sort=True)

# Every cell has the form "feature:value"; keep only the value as a float
# (e.g. "1:15596.16" -> 15596.16).
for col in df.columns:
    df[col] = df[col].str.split(":").str[1].astype(float)

# Cells that are missing or lack a ":" come out as NaN above; reject them
# explicitly so malformed records are not carried forward silently.
if df.isna().to_numpy().any():
    raise ValueError("Malformed sensor record: every cell must look like 'feature:value'.")

# The index token has the form "gas;..."; keep the part before ";" as the
# numeric gas label, then order the rows by that label and renumber them.
df = df.rename_axis("Gas").reset_index()
df["Gas"] = df["Gas"].str.split(";").str[0].astype(float)
df.sort_values(by=["Gas"], inplace=True)
df.reset_index(drop=True, inplace=True)

In [37]:
# Display the assembled DataFrame (the notebook renders the last expression).
df

Unnamed: 0,Gas,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,1.0,34181.6942,42.217819,8.077428,11.060225,15.324717,-6.237308,-8.640350,-22.773236,76987.3013,...,-18.088229,-30.107785,31626.1782,7.214432,14.421883,33.202304,40.598880,-8.920710,-15.434603,-28.125227
1,1.0,3008.7476,2.462359,0.790187,1.495105,3.545801,-0.402571,-0.806028,-3.984063,13496.0776,...,-2.466937,-5.849443,6707.0798,3.133532,4.428225,12.515252,16.296656,-1.397067,-2.223667,-4.842768
2,1.0,2872.7572,2.404817,0.803031,1.508790,3.572785,-0.397897,-0.753870,-3.771515,13525.2793,...,-2.429918,-6.307317,6697.3631,3.112266,4.443709,12.618117,16.957225,-1.399658,-2.184855,-5.350035
3,1.0,2838.5399,2.428299,0.788218,1.466609,3.962417,-0.381463,-0.779135,-4.069448,13500.1397,...,-2.444341,-5.607505,6637.2874,3.094251,4.620435,13.243091,17.279677,-1.368559,-2.113937,-5.843864
4,1.0,3099.6742,2.527231,0.842334,1.511417,3.055021,-0.431201,-0.808438,-3.819882,13428.2017,...,-2.352689,-5.552964,6695.1635,3.097356,4.433018,12.514647,16.905624,-1.368128,-2.234458,-5.394663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,6.0,12600.4721,3.967865,3.094982,4.540664,8.101418,-1.924777,-2.897625,-9.353642,45335.6582,...,-4.386727,-9.119150,10938.9927,3.156375,3.338902,6.332657,9.225503,-2.459694,-3.892331,-9.953861
13906,6.0,1374.5815,1.423290,1.266835,2.217778,4.394369,-0.168291,-0.458974,-3.206636,1989.8376,...,-0.462312,-3.355550,-39.9263,0.986410,0.837301,2.425313,5.468411,-0.106302,-0.404082,-3.297575
13907,6.0,17730.3342,9.446026,4.501150,6.539394,9.898564,-2.601076,-3.776140,-11.457366,61575.4526,...,-8.117530,-15.673747,20061.1811,4.336805,6.812591,11.914944,15.068176,-4.704891,-7.561662,-16.321429
13908,6.0,12703.2036,6.410769,2.990252,4.343527,7.120345,-1.886126,-2.828974,-8.134370,49808.0839,...,-5.604095,-12.472530,13913.5664,3.269042,4.668716,8.604283,12.107818,-3.065823,-5.095964,-11.282160


### 1. 데이터 탐색

#### 1) Basic

In [38]:
# Number of distinct values in the "Gas" label column.
df["Gas"].nunique()

6

In [39]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Gas,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,1.0,34181.6942,42.217819,8.077428,11.060225,15.324717,-6.237308,-8.64035,-22.773236,76987.3013,...,-18.088229,-30.107785,31626.1782,7.214432,14.421883,33.202304,40.59888,-8.92071,-15.434603,-28.125227
1,1.0,3008.7476,2.462359,0.790187,1.495105,3.545801,-0.402571,-0.806028,-3.984063,13496.0776,...,-2.466937,-5.849443,6707.0798,3.133532,4.428225,12.515252,16.296656,-1.397067,-2.223667,-4.842768
2,1.0,2872.7572,2.404817,0.803031,1.50879,3.572785,-0.397897,-0.75387,-3.771515,13525.2793,...,-2.429918,-6.307317,6697.3631,3.112266,4.443709,12.618117,16.957225,-1.399658,-2.184855,-5.350035
3,1.0,2838.5399,2.428299,0.788218,1.466609,3.962417,-0.381463,-0.779135,-4.069448,13500.1397,...,-2.444341,-5.607505,6637.2874,3.094251,4.620435,13.243091,17.279677,-1.368559,-2.113937,-5.843864
4,1.0,3099.6742,2.527231,0.842334,1.511417,3.055021,-0.431201,-0.808438,-3.819882,13428.2017,...,-2.352689,-5.552964,6695.1635,3.097356,4.433018,12.514647,16.905624,-1.368128,-2.234458,-5.394663


In [40]:
# (number of rows, number of columns) of the DataFrame.
df.shape

(13910, 129)

#### 2) 데이터 타입

In [41]:
# Distinct column dtypes, and the count of non-object columns minus one
# (presumably to leave out the "Gas" label column -- confirm).
(
    df.dtypes.unique(),
    df.select_dtypes(exclude="object").shape[1] - 1,
)

(array([dtype('float64')], dtype=object), 128)

#### 3) 데이터 통계값

In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,Gas,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
count,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,...,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0,13910.0
mean,3.387994,50435.066174,6.638156,12.936688,18.743953,26.890695,-9.158655,-14.402383,-59.927598,57340.104585,...,-9.601927,-19.1365,19688.565373,6.072066,7.138634,14.929364,19.09098,-4.901016,-8.167792,-16.089791
std,1.728602,69844.785952,13.486391,17.610061,24.89945,38.107685,12.729206,21.304606,131.017675,64045.265134,...,9.220031,26.516679,14281.652395,4.642192,5.248573,12.437311,14.39181,4.19536,7.637701,20.958479
min,1.0,-16757.5986,0.088287,0.0001,0.0001,0.0001,-131.332873,-227.627758,-1664.735576,-16119.4609,...,-76.0692,-482.278033,-8297.5488,0.712112,0.003238,0.011488,0.118849,-30.205911,-58.844076,-410.152297
25%,2.0,6694.72595,2.284843,1.63335,2.386836,4.967988,-11.587169,-17.292559,-48.492764,13287.301875,...,-13.212575,-22.363498,8837.83875,3.007381,3.059178,5.407551,8.039227,-6.789599,-11.162406,-18.93869
50%,3.0,19364.43935,3.871227,4.977123,7.250892,11.680725,-3.3387,-4.956917,-14.040088,37764.2632,...,-7.33885,-13.527887,16313.9673,4.973783,5.809107,11.325214,14.560676,-3.881763,-6.305962,-11.747499
75%,5.0,63104.837125,8.400619,17.189166,26.411109,34.843226,-1.126897,-1.670327,-5.212213,70300.782575,...,-3.26008,-7.358031,27009.592425,7.389566,10.222169,21.207572,26.547437,-1.804032,-2.874532,-6.42969
max,6.0,670687.3477,1339.879283,167.079751,226.619457,993.605306,-0.006941,22.201589,115.273147,502202.8125,...,9.270956,11.516418,96706.7927,45.574835,32.203601,297.22588,195.242555,-0.003817,6.851792,8.357968
