In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import random
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

In [4]:
train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/LG Aimers Dacon/open/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/LG Aimers Dacon/open/test.csv")

In [5]:
# data에서 모든 값이 결측치인 X 삭제

train = train_df.copy()

null = train.isnull().sum()
null = null.to_frame().transpose()
Xs = null.columns[6:]
Nan_list = []

for x in Xs:
  if float(null[x]) == len(train):
    Nan_list.append(x)

train.drop(Nan_list, axis = 1, inplace = True)

test = test_df.copy()
test.drop(Nan_list, axis = 1, inplace = True)

***

In [6]:
# test용
x = train.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
y = train["Y_Class"]

x = x.fillna(0)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state=37)

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 

In [None]:
x_train, y_train = train_x.copy(), train_y.copy()
x_test = test_x.copy()
# remove_features_count = []

for i in range(39):
  model = RandomForestClassifier(random_state = 37).fit(x_train, y_train)

  f_i = pd.Series(model.feature_importances_, index = x_train.columns)
  f_i = f_i.sort_values(ascending = False)
  features = list(pd.DataFrame(f_i).transpose().columns)

  features_df = pd.DataFrame(f_i, columns = ["importance"])

  # 중요도가 낮은 (importance = 0.0) feature 제거
  remove_Xs = list(features_df.where(features_df["importance"] == 0).dropna().transpose().columns)
  x_train.drop(remove_Xs, axis = 1, inplace = True)
  x_test.drop(remove_Xs, axis = 1, inplace = True)

  important_features = list(set(features) - set(remove_Xs))

  # print(i+1, "번째 total features : ", len(features))
  # print(i+1, "번째 remove features : ", len(remove_Xs))
  # print(i+1, "번째 remain features : ", len(important_features))
  # print()

  # remove_features_count.append(len(remove_Xs))

model = RandomForestClassifier(random_state = 37).fit(x_train, y_train)

In [None]:
# 모델 평가를 위한 metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, plot_confusion_matrix

re_test_x = x_test.copy()
removes = list(set(re_test_x.columns) - set(important_features))
re_test_x.drop(removes, axis = 1, inplace = True)

preds = model.predict(re_test_x)

train_acc = model.score(x_train, y_train)
test_acc = model.score(x_test, test_y)
y_pred = model.predict(x_test)

print(f"Accuracy: {accuracy_score(test_y, y_pred):.3f}") # 정확도

pre_macro = precision_score(test_y, y_pred, average = "macro")
print(f"Precision: {pre_macro:.3f}") # 정밀도

recall_macro = recall_score(test_y, y_pred, average = "macro")
print(f"Recall: {recall_macro:.3f}") # 재현율

f1_macro = f1_score(test_y, y_pred, average = "macro")
print(f"F1-score: {f1_macro:.3f}") # F1 스코어

Accuracy: 0.789
Precision: 0.757
Recall: 0.530
F1-score: 0.578


1번  
Accuracy: 0.778  
Precision: 0.814  
Recall: 0.492  
F1-score: 0.539  

10번  
Accuracy: 0.761  
Precision: 0.796  
Recall: 0.453  
F1-score: 0.484  

15번  
Accuracy: 0.772  
Precision: 0.839  
Recall: 0.471  
F1-score: 0.509  

25번  
Accuracy: 0.778  
Precision: 0.825  
Recall: 0.496  
F1-score: 0.540  

30번  
Accuracy: 0.789  
Precision: 0.853  
Recall: 0.511  
F1-score: 0.564  

35번  
Accuracy: 0.756  
Precision: 0.770  
Recall: 0.463  
F1-score: 0.492  

36번  
Accuracy: 0.756  
Precision: 0.715  
Recall: 0.473  
F1-score: 0.506  

37번  
Accuracy: 0.772  
Precision: 0.759  
Recall: 0.480  
F1-score: 0.520  

38번  
Accuracy: 0.772  
Precision: 0.824  
Recall: 0.484  
F1-score: 0.521   

39번  
Accuracy: 0.789  
Precision: 0.757  
Recall: 0.530  
F1-score: 0.578  

40번  
Accuracy: 0.789  
Precision: 0.757  
Recall: 0.530  
F1-score: 0.578  

45번  
Accuracy: 0.789  
Precision: 0.757  
Recall: 0.530  
F1-score: 0.578  

50번  
Accuracy: 0.789  
Precision: 0.757  
Recall: 0.530  
F1-score: 0.578  

***

In [None]:
# 데이터 로드
train_x = train.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality']) #original code
# train_x = train.drop(columns=['PRODUCT_ID', 'Y_Class', 'Y_Quality']) 
train_y = train['Y_Class']

test_x = test.drop(columns=['PRODUCT_ID', 'TIMESTAMP']) #original code
# test_x = test.drop(columns=['PRODUCT_ID']) #original code

# 결측치 처리 (마이너스로도 한번 해보셈)
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE'] #original code
# qual_col = ['LINE', 'PRODUCT_CODE', "TIMESTAMP"]

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 

In [8]:
def repeat_RF(x, y, i):
  re_x = x.copy()
  model = RandomForestClassifier(random_state = 37).fit(x, y)

  # 특성별 중요도 
  f_i = pd.Series(model.feature_importances_, index = x.columns)
  f_i = f_i.sort_values() # 내림차순

  # 중요도 그래프 저장
  # f_i.sort_values().plot(kind = "barh", figsize = (60, 600))
  # plt.savefig("/content/drive/MyDrive/Colab Notebooks/LG Aimers Dacon/plots/RF" + str(i) + ".png")

  features_df = pd.DataFrame(f_i, columns = ["importance"])

  # 중요도가 낮은 (importance = 0.0) feature 제거
  remove_Xs = list(features_df.where(features_df["importance"] == 0).dropna().transpose().columns)
  re_x.drop(remove_Xs, axis = 1, inplace = True)

  return re_x, remove_Xs, model

In [None]:
x, y = train_x.copy(), train_y.copy()
# remove_features_count = []

for i in range(3):
  model = RandomForestClassifier(random_state = 37).fit(x, y)

  f_i = pd.Series(model.feature_importances_, index = x.columns)
  f_i = f_i.sort_values(ascending = False)
  features = list(pd.DataFrame(f_i).transpose().columns)

  features_df = pd.DataFrame(f_i, columns = ["importance"])

  # 중요도가 낮은 (importance = 0.0) feature 제거
  remove_Xs = list(features_df.where(features_df["importance"] == 0).dropna().transpose().columns)
  x.drop(remove_Xs, axis = 1, inplace = True)

  important_features = list(set(features) - set(remove_Xs))

  print(i+1, "번째 total features : ", len(features))
  print(i+1, "번째 remove features : ", len(remove_Xs))
  print(i+1, "번째 remain features : ", len(important_features))
  print()

  # remove_features_count.append(len(remove_Xs))

model = RandomForestClassifier(random_state = 37).fit(x, y)

1 번째 total features :  2795
1 번째 remove features :  1093
1 번째 remain features :  1702

2 번째 total features :  1702
2 번째 remove features :  206
2 번째 remain features :  1496

3 번째 total features :  1496
3 번째 remove features :  90
3 번째 remain features :  1406



In [None]:
re_test_x = test_x.copy()
removes = list(set(re_test_x.columns) - set(important_features))
re_test_x.drop(removes, axis = 1, inplace = True)

preds = model.predict(re_test_x)

In [9]:
"""처음 RandomForest 실시 """

x, y, remove_Xs = train_x.copy(), train_y.copy(), []
# re_test_x = test_x.copy()

re_x, remove_Xs, model = repeat_RF(x, y, 1)
# re_x.to_csv("re_x_timestamp.csv")

# re_test_x.drop(remove_Xs, axis = 1, inplace = True)
preds = model.predict(test_x)

In [11]:
"""2 RandomForest 실시 """

x, y, remove_Xs = train_x.copy(), train_y.copy(), []
re_test_x = test_x.copy()

re_x, remove_Xs, model = repeat_RF(x, y, 2)
# re_x.to_csv("re_x_timestamp.csv")

re_test_x.drop(remove_Xs, axis = 1, inplace = True)
preds = model.predict(test_x)

In [13]:
len(re_x.columns)

1605

In [None]:
submit = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/LG Aimers Dacon/open/sample_submission.csv")

submit['Y_Class'] = preds

submit.to_csv('3_sample_submission.csv', index = False)