In [1]:
from tools.preprocess import LoadDataset

import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest

train_path = "../data/train.csv"
test_path = "../data/valid.csv"
valid_path = "../data/valid.csv"

# do with train, test, valid
data = LoadDataset(train_path,
                   test_path, 
                   valid_path)

data.nan_processing({"ALL": "DROP COLUMN"})
data.sampling("OFF")
data.scaling({"ALL": "STANDARD"})
data.encoding({"ALL": "LABEL"})
data.feature_engineering("PCA PER PROCESS", {"min_proba": 0.9999})

Missing Value Processing...

(before processing)The number of nan
- train: 59028
- valid: 13149

  1. Processing <ALL> Columns with 'DROP COLUMN'

Finish!
(after processing)The number of nan
- train: 0
- valid: 0

Sampling with OFF...

(before sampling)Value count
target
0    31156
1     2000
Name: count, dtype: int64 

Finish!
(after sampling)Value count
target
0    31156
1     2000
Name: count, dtype: int64

Scaling our dataset...

  1. Scaling <ALL> Columns with 'STANDARD'

Finish! (the number of columns: 130)

Encoding our dataset...

  1. Encoding <ALL> Columns with 'LABEL'

Finish! (the number of columns: 12)

Feature Engineering with PCA PER PROCESS...

  Execute PCA PER PROCESS with minimum probability 0.999

  Stage 'Dam': the number of target cols is '68'
  => 2_components with Explained Variance Ratio '0.9990371866976341'

  Stage 'Fill1': the number of target cols is '32'
  => 2_components with Explained Variance Ratio '0.9995381646499276'

  Stage 'Fill2': the number of ta

In [2]:
RANDOM_SEED = 42
Dam_model = RandomForestClassifier(random_state=RANDOM_SEED)
Dam_model.fit(data.train.get('Dam'), data.train.y)

Fill1_model = RandomForestClassifier(random_state=RANDOM_SEED)
Fill1_model.fit(data.train.get('Fill1'), data.train.y)

Fill2_model = RandomForestClassifier(random_state=RANDOM_SEED)
Fill2_model.fit(data.train.get('Fill2'), data.train.y)

Auto_model = RandomForestClassifier(random_state=RANDOM_SEED)
Auto_model.fit(data.train.get('AutoClave'), data.train.y)

In [3]:
dam_proba = Dam_model.predict_proba(data.train.get('Dam'))
fill1_proba = Fill1_model.predict_proba(data.train.get('Fill1'))
fill2_proba = Fill2_model.predict_proba(data.train.get('Fill2'))
auto_proba = Auto_model.predict_proba(data.train.get('AutoClave'))

probability = pd.DataFrame(np.column_stack((dam_proba, fill1_proba, fill2_proba, auto_proba)))
Main_Model = IsolationForest(contamination=0.0275, random_state=RANDOM_SEED)
Main_Model.fit(probability)

In [4]:
dam_proba = Dam_model.predict_proba(data.valid.get('Dam'))
fill1_proba = Fill1_model.predict_proba(data.valid.get('Fill1'))
fill2_proba = Fill2_model.predict_proba(data.valid.get('Fill2'))
auto_proba = Auto_model.predict_proba(data.valid.get('AutoClave'))

probability = pd.DataFrame(np.column_stack((dam_proba, fill1_proba, fill2_proba, auto_proba)))

test_pred = Main_Model.predict(probability)
test_pred = pd.DataFrame(test_pred).replace({1:"Normal", -1:"AbNormal"})
test_pred.value_counts()

Normal      7116
AbNormal     234
Name: count, dtype: int64

In [5]:
answer = data.valid.y.replace({0:"Normal", 1:"AbNormal"})
f1 = f1_score(answer, test_pred, pos_label = "AbNormal")
print(f1)

0.23972602739726026


In [6]:
0.2360248447204969

0.2360248447204969