In [87]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [110]:
import evalml
from evalml.automl import AutoMLSearch

In [88]:
df = pd.read_csv("pizza_v2.csv")

In [90]:
df.isnull().sum()

company            0
price_rupiah       0
diameter           0
topping            0
variant            0
size               0
extra_sauce        0
extra_cheese       0
extra_mushrooms    0
dtype: int64

In [91]:
df.shape

(129, 9)

In [92]:
df["price_rupiah"] = df["price_rupiah"].str.replace(",", "")

In [93]:
df["price_rupiah"] = df["price_rupiah"].str.replace("Rp", "")

In [94]:
df["price_rupiah"] = df["price_rupiah"].str.replace("000", "")

In [95]:
def int_rupi(text):
    return int(text)

In [96]:
df["price_rupiah"] = df["price_rupiah"].apply(int_rupi)

In [98]:
df["diameter"] = df["diameter"].str.replace(" inch", "")
df["diameter"] = df["diameter"].str.replace("\n", "")

In [100]:
def float_diameter(text):
    return float(text)

In [101]:
df["diameter"] = df["diameter"].apply(float_diameter)

In [103]:
cols = ["topping", "variant", "size", "extra_sauce", "extra_cheese", "extra_mushrooms"]

In [104]:
def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

In [105]:
for col in cols:
    df[col] = label_encoder(df[col])

topping ['beef' 'black_papper' 'chicken' 'meat' 'mozzarella' 'mushrooms' 'onion'
 'papperoni' 'sausage' 'smoked_beef' 'tuna' 'vegetables']
variant ['BBQ_meat_fiesta' 'BBQ_sausage' 'american_classic' 'american_favorite'
 'classic' 'crunchy' 'double_decker' 'double_mix' 'double_signature'
 'extravaganza' 'gournet_greek' 'italian_veggie' 'meat_eater'
 'meat_lovers' 'neptune_tuna' 'new_york' 'spicy tuna' 'spicy_tuna'
 'super_supreme' 'thai_veggie']
size ['XL' 'jumbo' 'large' 'medium' 'reguler' 'small']
extra_sauce ['no' 'yes']
extra_cheese ['no' 'yes']
extra_mushrooms ['no' 'yes']


In [107]:
X = df.drop(["price_rupiah"], axis=1)
y = df["price_rupiah"]

In [109]:
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(X, y, problem_type="regression")

In [111]:
automl = AutoMLSearch(X_train=X_train, y_train=y_train, problem_type="regression")

In [112]:
automl.search()

numpy.ufunc size changed, may indicate binary incompatibility. Expected 216 from C header, got 232 from PyObject
	High coefficient of variation (cv >= 0.5) within cross validation scores.
	Random Forest Regressor w/ Imputer + One Hot Encoder may not perform as estimated on unseen data.
	High coefficient of variation (cv >= 0.5) within cross validation scores.
	Random Forest Regressor w/ Imputer + One Hot Encoder + RF Regressor Select From Model may not perform as estimated on unseen data.
	High coefficient of variation (cv >= 0.5) within cross validation scores.
	Decision Tree Regressor w/ Select Columns By Type Transformer + Imputer + Select Columns Transformer + Select Columns Transformer + Imputer + One Hot Encoder may not perform as estimated on unseen data.
	High coefficient of variation (cv >= 0.5) within cross validation scores.
	Extra Trees Regressor w/ Select Columns By Type Transformer + Imputer + Select Columns Transformer + Select Columns Transformer + Imputer + One Hot Enc

{1: {'Random Forest Regressor w/ Imputer + One Hot Encoder': 0.27617478370666504,
  'Total time of batch': 0.38944268226623535},
 2: {'Random Forest Regressor w/ Imputer + One Hot Encoder + RF Regressor Select From Model': 0.31792211532592773,
  'Total time of batch': 0.4376201629638672},
 3: {'Decision Tree Regressor w/ Select Columns By Type Transformer + Imputer + Select Columns Transformer + Select Columns Transformer + Imputer + One Hot Encoder': 0.11572027206420898,
  'Extra Trees Regressor w/ Select Columns By Type Transformer + Imputer + Select Columns Transformer + Select Columns Transformer + Imputer + One Hot Encoder': 0.2624850273132324,
  'XGBoost Regressor w/ Select Columns By Type Transformer + Imputer + Select Columns Transformer + Select Columns Transformer + Imputer + One Hot Encoder': 0.29027318954467773,
  'CatBoost Regressor w/ Select Columns By Type Transformer + Imputer + Select Columns Transformer + Select Columns Transformer + Imputer': 0.1862189769744873,
  'L

In [113]:
automl.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,6,CatBoost Regressor w/ Select Columns By Type T...,6,-0.323261,-0.323261,0.383174,15.668888,False,{'Numeric Pipeline - Select Columns By Type Tr...
1,0,Mean Baseline Regression Pipeline,0,-0.383323,-0.383323,0.418643,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}
2,7,LightGBM Regressor w/ Select Columns By Type T...,7,-0.49221,-0.49221,0.642095,-28.406126,True,{'Numeric Pipeline - Select Columns By Type Tr...
3,1,Random Forest Regressor w/ Imputer + One Hot E...,1,-1.727918,-1.727918,1.996635,-350.773273,True,{'Imputer': {'categorical_impute_strategy': 'm...
4,8,Elastic Net Regressor w/ Select Columns By Typ...,8,-1.770648,-1.770648,1.691809,-361.920777,True,{'Numeric Pipeline - Select Columns By Type Tr...
5,2,Random Forest Regressor w/ Imputer + One Hot E...,2,-3.047527,-3.047527,3.293028,-695.028632,True,{'Imputer': {'categorical_impute_strategy': 'm...
6,5,XGBoost Regressor w/ Select Columns By Type Tr...,5,-4.664351,-4.664351,8.90698,-1116.819993,True,{'Numeric Pipeline - Select Columns By Type Tr...
7,4,Extra Trees Regressor w/ Select Columns By Typ...,4,-7.062258,-7.062258,8.637438,-1742.37782,True,{'Numeric Pipeline - Select Columns By Type Tr...
8,3,Decision Tree Regressor w/ Select Columns By T...,3,-9.899798,-9.899798,17.500026,-2482.625665,True,{'Numeric Pipeline - Select Columns By Type Tr...


In [114]:
automl.best_pipeline

pipeline = RegressionPipeline(component_graph={'Numeric Pipeline - Select Columns By Type Transformer': ['Select Columns By Type Transformer', 'X', 'y'], 'Numeric Pipeline - Imputer': ['Imputer', 'Numeric Pipeline - Select Columns By Type Transformer.x', 'y'], 'Numeric Pipeline - Select Columns Transformer': ['Select Columns Transformer', 'Numeric Pipeline - Imputer.x', 'y'], 'Categorical Pipeline - Select Columns Transformer': ['Select Columns Transformer', 'X', 'y'], 'Categorical Pipeline - Imputer': ['Imputer', 'Categorical Pipeline - Select Columns Transformer.x', 'y'], 'CatBoost Regressor': ['CatBoost Regressor', 'Numeric Pipeline - Select Columns Transformer.x', 'Categorical Pipeline - Imputer.x', 'y']}, parameters={'Numeric Pipeline - Select Columns By Type Transformer':{'column_types': ['category', 'EmailAddress', 'URL'], 'exclude': True}, 'Numeric Pipeline - Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'boolean_impute_strategy': 

In [115]:
pipe = automl.best_pipeline

In [116]:
automl.describe_pipeline(automl.rankings.iloc[0]["id"])


******************************************************************************************************************************************
* CatBoost Regressor w/ Select Columns By Type Transformer + Imputer + Select Columns Transformer + Select Columns Transformer + Imputer *
******************************************************************************************************************************************

Problem Type: regression
Model Family: CatBoost

Pipeline Steps
1. Select Columns By Type Transformer
	 * column_types : ['category', 'EmailAddress', 'URL']
	 * exclude : True
2. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
3. Select Columns Transformer
	 * columns : ['diameter', 'topping', 'variant', 'size', 'extra_cheese']
4. Select Columns Transformer
	 * columns : ['company']
5. Imputer
	 *

In [122]:
pipe.score(X_test, y_test, objectives=["mse", "mae"])

OrderedDict([('MSE', 10458941.82064853), ('MAE', 3085.999996468896)])