### Dataset Names

#### Training Dataset:
   -> `iris_setosa_train`, `iris_versicolor_train`, `iris_virginica_train`
   
#### Testing Dataset:
   -> `iris_test_features`, `iris_setosa_test`, `iris_versicolor_test`, `iris_virginica_test`

In [386]:
import pandas as pd

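The function `import_data` has two parameters. The `file_name` parameter is the dataset name (without the `.csv` extension, which the function appends). The `data_type` parameter is either `'train'` or `'test'` and selects whether the file is read from the `Training Dataset/` or the `Testing Dataset/` folder.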

In [387]:
def import_data(file_name, data_type):
    """Load one dataset CSV file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Dataset name without the ``.csv`` extension (it is appended here).
    data_type : str
        ``'train'`` reads from the ``Training Dataset/`` folder and
        ``'test'`` reads from the ``Testing Dataset/`` folder. Both paths
        are relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed comma-separated file.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``'train'`` nor ``'test'``.
    """
    if data_type == 'train':
        folder = 'Training Dataset/'
    elif data_type == 'test':
        folder = 'Testing Dataset/'
    else:
        # Fail loudly here instead of returning None, which would only
        # surface later as an AttributeError far from the actual mistake.
        raise ValueError(
            "data_type must be 'train' or 'test', got %r" % (data_type,)
        )
    return pd.read_csv(folder + file_name + '.csv', delimiter=',')

In [388]:
iris_versicolor_train = import_data('iris_versicolor_train', 'train')
iris_virginica_train = import_data('iris_virginica_train', 'train')

In [389]:
iris_setosa_train = import_data('iris_setosa_train', 'train')

In [390]:
iris_setosa_train.head()

Unnamed: 0,SL,SW,PL,PW,CLASS
0,4.9,3.0,1.4,0.2,1
1,4.7,3.2,1.3,0.2,1
2,4.6,3.1,1.5,0.2,1
3,4.6,3.4,1.4,0.3,1
4,5.0,3.4,1.5,0.2,1


In [391]:
iris_setosa_test_features = import_data('iris_setosa_test_features', 'test')
iris_versicolor_test_features = import_data('iris_versicolor_test_features', 'test')
iris_virginica_test_features = import_data('iris_virginica_test_features', 'test')

In [392]:
iris_setosa_test_features.head()

Unnamed: 0,SL,SW,PL,PW,CLASS
0,5.4,3.4,1.7,0.2,Iris-setosa
1,4.9,3.1,1.5,0.1,Iris-setosa
2,4.8,3.1,1.6,0.2,Iris-setosa
3,5.2,3.4,1.4,0.2,Iris-setosa
4,5.2,4.1,1.5,0.1,Iris-setosa


### Binarizing Dataset

In [393]:
iris_setosa_train.describe()

Unnamed: 0,SL,SW,PL,PW,CLASS
count,99.0,99.0,99.0,99.0,99.0
mean,5.874747,3.041414,3.79697,1.20303,0.333333
std,0.862169,0.379852,1.802971,0.759895,0.473804
min,4.4,2.2,1.0,0.1,0.0
25%,5.1,2.8,1.5,0.3,0.0
50%,5.8,3.0,4.5,1.3,0.0
75%,6.5,3.2,5.1,1.8,1.0
max,7.7,4.0,6.9,2.5,1.0


In [394]:
iris_setosa_train.std()

SL       0.862169
SW       0.379852
PL       1.802971
PW       0.759895
CLASS    0.473804
dtype: float64

In [395]:
number_of_SL_bins = (iris_setosa_train.SL.max() - iris_setosa_train.SL.min()) // iris_setosa_train.SL.std()
number_of_SL_bins

3.0

In [396]:
number_of_SW_bins = (iris_setosa_train.SW.max() - iris_setosa_train.SW.min()) // iris_setosa_train.SW.std()
number_of_SW_bins

4.0

In [397]:
number_of_PL_bins = (iris_setosa_train.PL.max() - iris_setosa_train.PL.min()) // iris_setosa_train.PL.std()
number_of_PL_bins

3.0

In [398]:
number_of_PW_bins = (iris_setosa_train.PW.max() - iris_setosa_train.PW.min()) // iris_setosa_train.PW.std()
number_of_PW_bins

3.0

#### Parameter values for bin function
#### data for setosa:

`iris_setosa_train.SL`, `iris_setosa_train.SW`, `iris_setosa_train.PL`, and `iris_setosa_train.PW`

#### data for versicolor:

`iris_versicolor_train.SL`, `iris_versicolor_train.SW`, `iris_versicolor_train.PL`, and `iris_versicolor_train.PW`

#### data for virginica:

`iris_virginica_train.SL`, `iris_virginica_train.SW`, `iris_virginica_train.PL`, and `iris_virginica_train.PW`

#### col:
`'SL'`, `'SW'`, `'PL'`, and `'PW'`

In [399]:
def bins(data, col):
    """One-hot encode a numeric column into standard-deviation-wide bins.

    Bin edges start at the column minimum and advance by one standard
    deviation; the column maximum is always the final edge. With
    ``number_of_bins = int((max - min) // std) + 1``, every value is mapped
    to at most one indicator column:

    * ``col + '0'`` -- the value equals the column minimum;
    * ``col + str(i)`` for ``1 <= i < number_of_bins`` -- the value lies in
      the half-open interval ``(edges[i - 1], edges[i]]``.

    Values greater than ``edges[number_of_bins - 1]`` match no column and
    are therefore encoded as an all-zero row. A column is only created when
    the first value falling into it is seen, so both the set of columns and
    their order depend on the data.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to binarize. Values are read by position, so the
        index does not need to be ``0..n-1``.
    col : str
        Prefix used to build the indicator column names.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``data`` holding 0/1 integers. The frame
        carries ``data``'s index when it has one, so it can be joined back
        onto the source frame.
    """
    col_min = data.min()
    col_max = data.max()
    std = data.std()
    number_of_bins = int((col_max - col_min) // std) + 1

    # Build the edges: min, min + std, min + 2*std, ... capped by the max.
    edges = [col_min]
    for _ in range(number_of_bins):
        next_edge = edges[-1] + std
        if next_edge < col_max:
            edges.append(next_edge)
        else:
            edges.append(col_max)
            break

    # Iterate positionally: label-based ``data[j]`` breaks (or silently
    # reads the wrong row) when the Series index is not 0..n-1.
    values = list(data)
    n_rows = len(values)

    bin_vals = dict()
    for row, value in enumerate(values):
        for i in range(1, number_of_bins + 1):
            if value == edges[i - 1]:
                # Exactly on the lower edge of interval i.
                label = i - 1
            elif i < number_of_bins and edges[i - 1] < value <= edges[i]:
                label = i
            else:
                continue
            key = col + str(label)
            if key not in bin_vals:
                bin_vals[key] = [0] * n_rows
            bin_vals[key][row] = 1

    # Keep the caller's index (if any) so ``.join`` aligns row for row.
    return pd.DataFrame(bin_vals, index=getattr(data, 'index', None))

#### Binarized Setosa Training Data

In [400]:
SL_S_bins = bins(iris_setosa_train.SL, 'S_SL')
SL_S_bins.head()

Unnamed: 0,S_SL1,S_SL0,S_SL2,S_SL3
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [401]:
SW_S_bins = bins(iris_setosa_train.SW, 'S_SW')
SW_S_bins.head()

Unnamed: 0,S_SW3,S_SW4,S_SW2,S_SW1,S_SW0
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0


In [402]:
PL_S_bins = bins(iris_setosa_train.PL, 'S_PL')
PL_S_bins.head()

Unnamed: 0,S_PL1,S_PL0,S_PL3,S_PL2
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [403]:
PW_S_bins = bins(iris_setosa_train.PW, 'S_PW')
PW_S_bins.head()

Unnamed: 0,S_PW1,S_PW0,S_PW2,S_PW3
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [404]:
binarized_setosa_train = iris_setosa_train[['CLASS']].join(SL_S_bins).join(SW_S_bins).join( PL_S_bins).join(PW_S_bins)
binarized_setosa_train.head()

Unnamed: 0,CLASS,S_SL1,S_SL0,S_SL2,S_SL3,S_SW3,S_SW4,S_SW2,S_SW1,S_SW0,S_PL1,S_PL0,S_PL3,S_PL2,S_PW1,S_PW0,S_PW2,S_PW3
0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
2,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
3,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
4,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0


#### Binarizing Versicolor training data

In [405]:
SL_ver_bins = bins(iris_versicolor_train.SL, 'Ver_SL')
SW_ver_bins = bins(iris_versicolor_train.SW, 'Ver_SW')
PL_ver_bins = bins(iris_versicolor_train.PL, 'Ver_PL')
PW_ver_bins = bins(iris_versicolor_train.PW, 'Ver_PW')

In [406]:
binarized_versicolor_train = iris_versicolor_train[['CLASS']].join(SL_ver_bins).join(SW_ver_bins).join( PL_ver_bins).join(PW_ver_bins)
binarized_versicolor_train.head()

Unnamed: 0,CLASS,Ver_SL1,Ver_SL0,Ver_SL2,Ver_SL3,Ver_SW3,Ver_SW4,Ver_SW2,Ver_SW1,Ver_SW0,Ver_PL1,Ver_PL0,Ver_PL3,Ver_PL2,Ver_PW1,Ver_PW0,Ver_PW2,Ver_PW3
0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
2,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
3,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
4,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0


#### Binarizing Virginica training data

In [407]:
SL_vir_bins = bins(iris_virginica_train.SL, 'Vir_SL')
SW_vir_bins = bins(iris_virginica_train.SW, 'Vir_SW')
PL_vir_bins = bins(iris_virginica_train.PL, 'Vir_PL')
PW_vir_bins = bins(iris_virginica_train.PW, 'Vir_PW')

In [408]:
binarized_virginica_train = iris_virginica_train[['CLASS']].join(SL_vir_bins).join(SW_vir_bins).join( PL_vir_bins).join(PW_vir_bins)
binarized_virginica_train.head()

Unnamed: 0,CLASS,Vir_SL1,Vir_SL0,Vir_SL2,Vir_SL3,Vir_SW3,Vir_SW4,Vir_SW2,Vir_SW1,Vir_SW0,Vir_PL1,Vir_PL0,Vir_PL3,Vir_PL2,Vir_PW1,Vir_PW0,Vir_PW2,Vir_PW3
0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
2,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
3,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
4,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0


#### Binarizing Setosa testing data

In [409]:
SL_S_bins_test = bins(iris_setosa_test_features.SL, 'S_SL')
SW_S_bins_test = bins(iris_setosa_test_features.SW, 'S_SW')
PL_S_bins_test = bins(iris_setosa_test_features.PL, 'S_PL')
PW_S_bins_test = bins(iris_setosa_test_features.PW, 'S_PW')

In [410]:
binarized_setosa_test = iris_setosa_test_features[['CLASS']].join(SL_S_bins_test).join(SW_S_bins_test).join(PL_S_bins_test).join(PW_S_bins_test)
binarized_setosa_test.head()

Unnamed: 0,CLASS,S_SL4,S_SL2,S_SL3,S_SL1,S_SL0,S_SW1,S_SW3,S_SW2,S_SW0,S_PL4,S_PL3,S_PL2,S_PL0,S_PW2,S_PW0,S_PW3
0,Iris-setosa,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
1,Iris-setosa,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0
2,Iris-setosa,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0
3,Iris-setosa,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0
4,Iris-setosa,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0


#### Binarizing Versicolor testing data

In [411]:
SL_ver_bins_test = bins(iris_versicolor_test_features.SL, 'Ver_SL')
SW_ver_bins_test = bins(iris_versicolor_test_features.SW, 'Ver_SW')
PL_ver_bins_test = bins(iris_versicolor_test_features.PL, 'Ver_PL')
PW_ver_bins_test = bins(iris_versicolor_test_features.PW, 'Ver_PW')

In [412]:
binarized_versicolor_test = iris_versicolor_test_features[['CLASS']].join(SL_ver_bins_test).join(SW_ver_bins_test).join( PL_ver_bins_test).join(PW_ver_bins_test)
binarized_versicolor_test.head()

Unnamed: 0,CLASS,Ver_SL0,Ver_SL3,Ver_SL1,Ver_SL2,Ver_SW0,Ver_SW1,Ver_SW3,Ver_SW2,Ver_PL1,Ver_PL2,Ver_PL0,Ver_PL3,Ver_PW0,Ver_PW2,Ver_PW1,Ver_PW3
0,Iris-versicolor,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
1,Iris-versicolor,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0
2,Iris-versicolor,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,Iris-versicolor,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0
4,Iris-versicolor,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0


#### Binarizing Virginica testing data

In [413]:
SL_vir_bins_test = bins(iris_virginica_test_features.SL, 'Vir_SL')
SW_vir_bins_test = bins(iris_virginica_test_features.SW, 'Vir_SW')
PL_vir_bins_test = bins(iris_virginica_test_features.PL, 'Vir_PL')
PW_vir_bins_test = bins(iris_virginica_test_features.PW, 'Vir_PW')

In [414]:
binarized_virginica_test = iris_virginica_test_features[['CLASS']].join(SL_vir_bins_test).join(SW_vir_bins_test).join( PL_vir_bins_test).join(PW_vir_bins_test)
binarized_virginica_test.head()

Unnamed: 0,CLASS,Vir_SL1,Vir_SL2,Vir_SL0,Vir_SL3,Vir_SW2,Vir_SW3,Vir_SW1,Vir_SW0,Vir_PL1,Vir_PL2,Vir_PL0,Vir_PW2,Vir_PW3,Vir_PW4,Vir_PW0
0,Iris-virginica,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0
1,Iris-virginica,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Iris-virginica,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0
3,Iris-virginica,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
4,Iris-virginica,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0


### Creating Training and Validation sets for each flower type

In [415]:
from sklearn.model_selection import train_test_split

In [416]:
# Training and Validation set for Setosa
# A fixed random_state keeps the 80/20 split identical across kernel restarts,
# so the saved CSV files and any downstream results are reproducible.

bin_iris_setosa_train, bin_iris_setosa_validation = train_test_split(binarized_setosa_train, test_size=0.2, random_state=42)

In [417]:
bin_iris_setosa_train = bin_iris_setosa_train.reset_index(drop=True)
print(bin_iris_setosa_train.shape)
bin_iris_setosa_train.head()

(79, 18)


Unnamed: 0,CLASS,S_SL1,S_SL0,S_SL2,S_SL3,S_SW3,S_SW4,S_SW2,S_SW1,S_SW0,S_PL1,S_PL0,S_PL3,S_PL2,S_PW1,S_PW0,S_PW2,S_PW3
0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0
4,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [418]:
bin_iris_setosa_validation = bin_iris_setosa_validation.reset_index(drop=True)
print(bin_iris_setosa_validation.shape)
bin_iris_setosa_validation.head()

(20, 18)


Unnamed: 0,CLASS,S_SL1,S_SL0,S_SL2,S_SL3,S_SW3,S_SW4,S_SW2,S_SW1,S_SW0,S_PL1,S_PL0,S_PL3,S_PL2,S_PW1,S_PW0,S_PW2,S_PW3
0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0
2,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
3,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1
4,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0


In [419]:
# Training and Validation set for Versicolor
# A fixed random_state keeps the 80/20 split identical across kernel restarts,
# so the saved CSV files and any downstream results are reproducible.

bin_iris_versicolor_train, bin_iris_versicolor_validation = train_test_split(binarized_versicolor_train, test_size=0.2, random_state=42)

In [420]:
bin_iris_versicolor_train = bin_iris_versicolor_train.reset_index(drop=True)
print(bin_iris_versicolor_train.shape)
bin_iris_versicolor_train.head()

(79, 18)


Unnamed: 0,CLASS,Ver_SL1,Ver_SL0,Ver_SL2,Ver_SL3,Ver_SW3,Ver_SW4,Ver_SW2,Ver_SW1,Ver_SW0,Ver_PL1,Ver_PL0,Ver_PL3,Ver_PL2,Ver_PW1,Ver_PW0,Ver_PW2,Ver_PW3
0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
3,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
4,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0


In [421]:
bin_iris_versicolor_validation = bin_iris_versicolor_validation.reset_index(drop=True)
print(bin_iris_versicolor_validation.shape)
bin_iris_versicolor_validation.head()

(20, 18)


Unnamed: 0,CLASS,Ver_SL1,Ver_SL0,Ver_SL2,Ver_SL3,Ver_SW3,Ver_SW4,Ver_SW2,Ver_SW1,Ver_SW0,Ver_PL1,Ver_PL0,Ver_PL3,Ver_PL2,Ver_PW1,Ver_PW0,Ver_PW2,Ver_PW3
0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
2,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1
3,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0


In [422]:
# Training and Validation set for Virginica
# A fixed random_state keeps the 80/20 split identical across kernel restarts,
# so the saved CSV files and any downstream results are reproducible.

bin_iris_virginica_train, bin_iris_virginica_validation = train_test_split(binarized_virginica_train, test_size=0.2, random_state=42)

In [423]:
bin_iris_virginica_train = bin_iris_virginica_train.reset_index(drop=True)
print(bin_iris_virginica_train.shape)
bin_iris_virginica_train.head()

(79, 18)


Unnamed: 0,CLASS,Vir_SL1,Vir_SL0,Vir_SL2,Vir_SL3,Vir_SW3,Vir_SW4,Vir_SW2,Vir_SW1,Vir_SW0,Vir_PL1,Vir_PL0,Vir_PL3,Vir_PL2,Vir_PW1,Vir_PW0,Vir_PW2,Vir_PW3
0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
2,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0
3,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0


In [424]:
bin_iris_virginica_validation = bin_iris_virginica_validation.reset_index(drop=True)
print(bin_iris_virginica_validation.shape)
bin_iris_virginica_validation.head()

(20, 18)


Unnamed: 0,CLASS,Vir_SL1,Vir_SL0,Vir_SL2,Vir_SL3,Vir_SW3,Vir_SW4,Vir_SW2,Vir_SW1,Vir_SW0,Vir_PL1,Vir_PL0,Vir_PL3,Vir_PL2,Vir_PW1,Vir_PW0,Vir_PW2,Vir_PW3
0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0
4,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0


### Saving Binarized Training, Validation and Testing Data to Computer

#### Training Data

In [428]:
bin_iris_setosa_train.to_csv('Binarized Data/Training Data/bin_iris_setosa_train.csv', index=False)
bin_iris_versicolor_train.to_csv('Binarized Data/Training Data/bin_iris_versicolor_train.csv', index=False)
bin_iris_virginica_train.to_csv('Binarized Data/Training Data/bin_iris_virginica_train.csv', index=False)

#### Validation Data

In [429]:
bin_iris_setosa_validation.to_csv('Binarized Data/Validation Data/bin_iris_setosa_validation.csv', index=False)
bin_iris_versicolor_validation.to_csv('Binarized Data/Validation Data/bin_iris_versicolor_validation.csv', index=False)
bin_iris_virginica_validation.to_csv('Binarized Data/Validation Data/bin_iris_virginica_validation.csv', index=False)

#### Testing Data

In [430]:
binarized_setosa_test.to_csv('Binarized Data/Testing Data/binarized_setosa_test.csv', index=False)
binarized_versicolor_test.to_csv('Binarized Data/Testing Data/binarized_versicolor_test.csv', index=False)
binarized_virginica_test.to_csv('Binarized Data/Testing Data/binarized_virginica_test.csv', index=False)

In [3]:
!moses

  -h [ --help ]                         Produce help message.
                                        
  -K [ --ip_kld_weight ] arg (=1)       Interesting patterns (ip). Weight of 
                                        the KLD.
                                        
  -J [ --ip_skewness_weight ] arg (=1)  Interesting patterns (ip). Weight of 
                                        skewness.
                                        
  -U [ --ip_stdU_weight ] arg (=1)      Interesting patterns (ip). Weight of 
                                        stdU.
                                        
  -X [ --ip_skew_U_weight ] arg (=1)    Interesting patterns (ip). Weight of 
                                        skew_U.
                                        
  -i [ --input-file ] arg               Input table file in DSV format (with 
                                        comma, whitespace and tabulation as 
                                        seperator). Colum

### Generating COMBO program using MOSES

In [376]:
! moses -i binarized_setosa_train.csv -m 100000 -u CLASS -W1

0 or($PL1 $PW1) 
0 or($PL1 $PL0 $PW1) 
0 or(and(or($SL1 $PW1) $PL1) $PW1) 
-1 $PL1 
-1 and($PL1 !$PL3) 
-1 or($PL1 $PW0) 
-1 and(!$SL4 $PL1 !$PW2) 
-1 and(!$SL3 $PL1 !$PL2) 
-1 and(or($SL1 $PW1) $PL1) 
-1 or(and(!$SW4 $PW1) $PL1) 


### Evaluating generated COMBO programs using MOSES

In [385]:
! eval-table -i bin_iris_setosa_validation.csv -c 'or(and(!$SW4 $PW1) $PL1)' -u CLASS

CLASS
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
