## Random Forest를 이용한 Classification
- Classification 또는 Regression 모델에 사용
- Missing value(결측치)를 다루기 쉬움
- 대용량 데이터 처리에 효과적

## 결론
- Y_Quality value에 따른 Y_Class value 지정

## 작업 순서
- 데이터 불러오기 및 데이터 결측치 처리
- 데이터 전처리
- 데이터 분류
- 데이터 분류모델 정의
- 데이터 학습
- 데이터 검증 및 테스트

In [7]:
#Import Required packages
pip install -U scikit-learn

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting scikit-learn
  Downloading scikit_learn-1.2.1-cp39-cp39-macosx_10_9_x86_64.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.3.2
  Downloading scipy-1.10.0-cp39-cp39-macosx_10_15_x86_64.whl (35.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloa

In [61]:
%matplotlib inline
import pandas as pd
import random
import os
import numpy as np

In [62]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [63]:
# Pin the random sources used in this notebook to one seed
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`.

    The seed is also written, as a string, to the PYTHONHASHSEED
    environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


seed_everything(37)

In [64]:
# Load the train / test CSV files.
# The data directory defaults to the original local path, but can be overridden
# with the LG_DATA_DIR environment variable so the notebook runs on other machines.
DATA_DIR = os.environ.get("LG_DATA_DIR", "/Users/yujinkim/Desktop/lg-auto-factory/data")

train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Sanity check: (rows, columns) of each loaded frame
print(train_data.shape)
print(test_data.shape)

(598, 2881)
(310, 2879)


In [65]:
# Features: drop the identifier, timestamp and both target columns from the training frame
train_input = train_data.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
# Target: the class label to predict
train_target = train_data['Y_Class']

# Fixed: the test frame is loaded as `test_data`; the name `test` is never
# defined in this notebook and raised a NameError on a fresh kernel.
test_input = test_data.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

# Replace missing values with 0 in both feature frames
train_input = train_input.fillna(0)
test_input = test_input.fillna(0)

In [66]:
# Preview the first five rows of the training features
train_input.head(n=5)

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0
3,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,37.74,39.17,52.17,30.58,71.78,0.0,0.0,0.0,0.0,0.0
4,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.7,41.89,46.93,33.09,76.97,0.0,0.0,0.0,0.0,0.0


In [67]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training features
train_input.describe()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
count,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,...,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0
mean,1.406355,55.51505,0.0,26.262542,6.065217,0.0,28.481605,5.864548,24.202341,1.167224,...,8.496204,8.964498,8.295351,8.638763,11.145435,0.165552,0.0,0.0,0.0,0.0
std,4.655332,47.03581,0.0,22.201747,5.14097,0.0,24.308159,4.960485,21.978767,0.986744,...,19.189554,20.294317,18.755287,19.632753,24.961071,0.371989,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,91.0,0.0,45.0,10.0,0.0,45.0,10.0,31.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,95.0,0.0,45.0,10.0,0.0,50.0,10.0,52.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,103.0,102.0,0.0,45.0,11.0,0.0,62.0,11.0,52.0,2.0,...,60.24,68.66,60.41,59.93,79.75,1.0,0.0,0.0,0.0,0.0


In [68]:
# Categorical columns to convert into integer codes
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    # Fit the encoder on the training values only, then encode the training column
    le = LabelEncoder().fit(train_input[col])
    train_input[col] = le.transform(train_input[col])

    # Labels found only in the test data are appended to the encoder's known
    # classes, one at a time, before the test column is transformed
    for label in np.unique(test_input[col]):
        if label in le.classes_:
            continue
        le.classes_ = np.append(le.classes_, label)
    test_input[col] = le.transform(test_input[col])
print('Done.')

Done.


In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Show the feature frame and the target series that go into the model
print(train_input)
print(train_target)

# Random forest with a fixed random_state, using all available cores
model = RandomForestClassifier(random_state=37, n_jobs=-1)

# Fit on the full training set
model.fit(train_input, train_target)

# Cross-validate with the default splitting, also recording the training-fold scores
scores = cross_validate(
    model,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Mean training score and mean validation score across the folds
train_mean = np.mean(scores["train_score"])
test_mean = np.mean(scores["test_score"])
print(train_mean, test_mean)

     LINE  PRODUCT_CODE   X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
1       3             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
2       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
3       3             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
4       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
..    ...           ...   ...   ...  ...   ...   ...  ...   ...   ...  ...   
593     5             2   2.0  95.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
594     2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
595     2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
596     4             1  40.0  94.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
597     5             1  21.0  87.0  0.0  45.0  10.0  0.0  61.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  X_2872  X_

In [71]:
# Predict class labels for the test features.
# Fixed: `test_x` is never defined in this notebook; the prepared (NaN-filled,
# label-encoded) test features are held in `test_input`.
pred = model.predict(test_input)
print('Done.')
print(pred)

Done.
[1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [73]:
# Build the submission file from the sample template.
# Paths default to the original local locations; set the LG_DATA_DIR
# environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get("LG_DATA_DIR", "/Users/yujinkim/Desktop/lg-auto-factory/data")
# The submission is written one level above the data directory, as before
OUTPUT_DIR = os.path.dirname(os.path.normpath(DATA_DIR))

submit = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
# Fill the template's Y_Class column with the model predictions
submit['Y_Class'] = pred
submit.to_csv(os.path.join(OUTPUT_DIR, 'best_submission.csv'), index=False)
print("Done.")

Done.
