# Preparation 

## [MLBox framework](https://mlbox.readthedocs.io/en/latest/installation.html) setup
Please make sure that you are running this notebook in a standalone Python virtual environment and that the interactive Python kernel you are using belongs to that environment. This should vastly reduce the number of dependency clashes.

MLBox depends on [OpenMP](https://www.openmp.org/) and [LightGBM](https://lightgbm.readthedocs.io/en/latest/).

In [None]:
%%bash
# Install cmake and the OpenMP runtime (libomp) with Homebrew, one formula at a time.
for formula in cmake libomp; do
    brew install "$formula"
done

In [None]:
%%bash 
pip install setuptools
pip install wheel
pip install pandas
pip install numpy
pip install mlbox

## Environment variables setup

In [None]:
# CSV files handed to MLBox: the training set first, the evaluation set second.
paths = [
    "tmp_mlbox/train_mlbox.csv",
    "tmp_mlbox/eval_mlbox.csv",
]
# Name of the feature ("column") that holds the result.
target_name = "y"
# CSV file the generated data set is written to and later read back from.
input_file = "tmp_mlbox/input_file.csv"

## New "random" data generation

In [None]:
import os
import random
from random import randint
from random import uniform

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Generate a synthetic binary-classification data set and store it as CSV.
row_num = 400      # number of data rows to generate
min_num = 0        # inclusive lower bound of every feature value
max_num = 100      # inclusive upper bound of every feature value
n_features = 9     # feature columns x1 .. x9
SEED = 42          # fixed seed so that "Restart & Run All" reproduces the same file

# A private generator keeps the seeding local instead of changing the global
# state of the `random` module.
rng = random.Random(SEED)
feature_names = ["x{}".format(idx) for idx in range(1, n_features + 1)]

# open() does not create missing directories, so make sure the output
# directory exists before writing (a fresh checkout has no tmp_mlbox/ yet).
output_dir = os.path.dirname(input_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(input_file, "w") as f:
    # Header: the feature columns followed by the configured target column.
    f.write(",".join(feature_names + [target_name]) + "\n")
    for _ in range(row_num):
        features = [rng.randint(min_num, max_num) for _col in range(n_features)]
        # Only x1, x2 and x3 determine the label; x4 .. x9 do not influence it.
        label = 1 if features[0] + features[1] > features[2] else 0
        f.write(",".join(str(value) for value in features + [label]) + "\n")

In [None]:
# Load the generated CSV: comma separated, first line is the header, no index column.
df = pd.read_csv(input_file, sep=",", header=0, index_col=None)

In [None]:
# Preview the first five rows of the generated data.
df.head(n=5)

In [None]:
# The whole frame (target column included) is split, so the training file keeps the target.
y = df[target_name]
X = df
# MLBox does not seem to be able to do the proper splitting, thus it is done manually here:
# a 90/10 split, stratified on the target, with a fixed random state.
X_train, X_test = train_test_split(X, test_size=0.1, random_state=42, stratify=y)
X_train.to_csv(paths[0], index=False, encoding="utf8")

# As per the MLBox documentation, the test dataset must NOT contain the target feature!
X_test = X_test.drop(columns=[target_name])
X_test.to_csv(paths[1], index=False, encoding="utf8")

## Training with MLBox

In [None]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

In [None]:
# Reading: let the MLBox Reader load the files listed in `paths`, with `target_name` as the target.
reader = Reader(sep=",")
data = reader.train_test_split(paths, target_name)

In [None]:
# Deleting non-stable variables.
drift_filter = Drift_thresholder()
data = drift_filter.fit_transform(data)

[Optimizer documentation](https://mlbox.readthedocs.io/en/latest/features.html#optimisation)

[Scoring options](https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules)

In [None]:
# Optimiser that scores with accuracy over 5 folds.
opt = Optimiser(
    scoring="accuracy",
    n_folds=5,
)
# No parameter set is passed (None) -- presumably this evaluates MLBox's default
# configuration; confirm against the Optimizer documentation.
baseline_score = opt.evaluate(None, data)
baseline_score

In [None]:
# Number of evaluations the optimiser is allowed to run.
MAX_EVALS = 100

# Search space handed to the optimiser. Keys prefixed with "fs__" presumably configure
# the feature-selection step and keys prefixed with "est__" the estimator -- see the
# Optimizer documentation for the exact meaning.
space = {
    "fs__strategy": {"space": ["variance", "rf_feature_importance"]},
    "fs__threshold": {"search": "choice", "space": [0.1, 0.2, 0.3, 0.4]},
    "est__strategy": {"space": ["LightGBM"]},
    "est__max_depth": {"search": "choice", "space": [2, 5, 6, 10, 14, 16, 20, 50]},
    "est__subsample": {"search": "uniform", "space": [0.6, 0.9]},
}

best = opt.optimise(space, data, max_evals=MAX_EVALS)

In [None]:
# Fit with the best parameters found above and predict.
predictor = Predictor()
predictor.fit_predict(best, data)