# Preparation 

## [Ludwig framework](https://uber.github.io/ludwig/?from=%40) setup
Please make sure that you are running this notebook in a standalone Python virtual environment and that your interactive Python session uses that environment. This should vastly reduce the number of dependency clashes.

In [None]:
!pip install matplotlib
!pip install seaborn
!pip install -U ludwig[all]

## Environment variables setup

In [None]:
from random import uniform
from random import randint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
target_name = "y"  # name of the CSV column that holds the label the model is trained to predict
input_file = "tmp_ludwig/input_file_ludwig.csv"  # path of the CSV dataset this notebook writes and then reads back

## New "random" data generation

In [None]:
import os

# Number of rows to generate and the inclusive bounds of every feature value.
row_num = 40000
min_num = 0
max_num = 100

# open() does not create missing parent directories, so make sure the
# folder that will hold the CSV exists before writing to it.
output_dir = os.path.dirname(input_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(input_file, "w") as f:
    f.write("x1,x2,x3,x4,x5,x6,x7,x8,x9,y\n")
    for _ in range(row_num):
        # Nine independent random integer features, drawn in order x1..x9.
        features = [randint(min_num, max_num) for _ in range(9)]
        x1, x2, x3 = features[:3]
        # The label depends on the first three features only; x4..x9 are
        # pure noise columns.
        y = 1 if x1 + x2 > x3 else 0

        f.write(",".join(map(str, features + [y])) + "\n")

In [None]:
# Load the generated CSV: the first line is the header row and no column
# is used as the index.
df = pd.read_csv(
    input_file,
    header=0,
    index_col=None,
    delimiter=",",
)

In [None]:
# Show the first rows of the loaded data as a quick sanity check.
df.head()

## Training with Ludwig

In [None]:
import os
from ludwig.api import LudwigModel
from ludwig.visualize import learning_curves, confusion_matrix

Column type information is available [here](https://uber.github.io/ludwig/getting_started/)

In [None]:
# Re-read the generated CSV so the feature list is derived from its header.
df = pd.read_csv(input_file, index_col=None, header=0, delimiter=",")

# Every column except the target is declared as a numerical input feature.
input_features = [
    {'name': column_name, 'type': 'numerical'}
    for column_name in df.columns.values
    if column_name != target_name
]
print(input_features)

# The target column is the only output feature and is declared as a category.
model_definition = {
    'input_features': input_features,
    'output_features': [{'name': target_name, 'type': 'category'}],
}

In [None]:
# Create the Ludwig model from the definition dict built above.
# 20 is the numeric value of logging.INFO in the standard logging module.
model = LudwigModel(model_definition, logging_level=20)

In [None]:
# Files deleted before training when they already exist (presumably
# preprocessing artifacts Ludwig writes next to the CSV -- confirm).
tmp_file_hdf5 = "tmp_ludwig/input_file_ludwig.hdf5"
tmp_file_json = "tmp_ludwig/input_file_ludwig.json"

# Remove leftovers from a previous run; anything that is not a regular
# file is left untouched.
for stale_path in (tmp_file_hdf5, tmp_file_json):
    if os.path.isfile(stale_path):
        os.remove(stale_path)

# Train on the generated CSV and keep the returned statistics for plotting.
train_stats = model.train(
    data_csv=input_file,
    data_hdf5=tmp_file_hdf5,
    output_directory='ludwig/results',
    skip_save_preprocessing=False,
)

## Visualization

In [None]:
from ludwig import visualize

In [None]:
# Plot learning curves for the target column from the statistics returned by
# model.train().
field = [target_name]
# NOTE(review): these labels refer to runs 8 and 9 under 'results/', while the
# training cell writes to 'ludwig/results' and produces a single train_stats
# object -- confirm the labels (and their count) match what learning_curves
# expects for this run.
model_names = ['results/api_experiment_run_8/model/','results/api_experiment_run_9/model/']
# NOTE(review): train_stats and field are passed as-is; verify against the
# installed Ludwig version whether a list of stats / a plain string is expected.
visualize.learning_curves(train_stats, field, model_names, file_format='png')