## Load the Libraries

In [1]:
# Install the warning filter first: it has to be active before mlbox is imported,
# otherwise warnings raised while mlbox and its dependencies load are still printed.
import warnings
warnings.filterwarnings("ignore")

# importing the required libraries
# (star imports kept as-is; later cells rely on Reader, Optimiser and Predictor)
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Preprocessing

In [2]:
# Read and clean the train/test CSV files; the result is a dict holding the
# 'train' and 'test' frames together with the 'target' column.
csv_paths = ['train_bm.csv', 'test_bm.csv']
target_column = 'Item_Outlet_Sales'

reader = Reader(sep=",")
df = reader.train_test_split(csv_paths, target_column)


reading csv : train_bm.csv ...
cleaning data ...
CPU time: 0.39519381523132324 seconds

reading csv : test_bm.csv ...
cleaning data ...
CPU time: 0.1332106590270996 seconds

> Number of common features : 11

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 7
> Number of numerical features: 4
> Number of training samples : 8523
> Number of test samples : 5681

> Top sparse features (% missing values on train set):
Outlet_Size    28.3
Item_Weight    17.2
dtype: float64

> Task : regression
count     8523.000000
mean      2181.288914
std       1706.499616
min         33.290000
25%        834.247400
50%       1794.331000
75%       3101.296400
max      13086.964800
Name: Item_Outlet_Sales, dtype: float64


In [3]:
# check which kind of container train_test_split handed back
type(df)

dict

In [4]:
# list the entries stored in the container returned by train_test_split
df.keys()

dict_keys(['train', 'test', 'target'])

In [5]:
# preview the first five rows of the cleaned training frame
df["train"].head(5)

Unnamed: 0,Item_Fat_Content,Item_Identifier,Item_MRP,Item_Type,Item_Visibility,Item_Weight,Outlet_Establishment_Year,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type
0,Low Fat,FDA15,249.8092,Dairy,0.016047,9.3,1999.0,OUT049,Tier 1,Medium,Supermarket Type1
1,Regular,DRC01,48.2692,Soft Drinks,0.019278,5.92,2009.0,OUT018,Tier 3,Medium,Supermarket Type2
2,Low Fat,FDN15,141.618,Meat,0.01676,17.5,1999.0,OUT049,Tier 1,Medium,Supermarket Type1
3,Regular,FDX07,182.095,Fruits and Vegetables,0.0,19.2,1998.0,OUT010,Tier 3,,Grocery Store
4,Low Fat,NCD19,53.8614,Household,0.0,8.93,1987.0,OUT013,Tier 3,High,Supermarket Type1


In [6]:
# count the missing values in each training column
df["train"].isna().sum()

Item_Fat_Content                0
Item_Identifier                 0
Item_MRP                        0
Item_Type                       0
Item_Visibility                 0
Item_Weight                  1463
Outlet_Establishment_Year       0
Outlet_Identifier               0
Outlet_Location_Type            0
Outlet_Size                  2410
Outlet_Type                     0
dtype: int64

## Optimisation

In [7]:
# Hyper-parameter search space, keyed as "<pipeline step>__<parameter>".
# Entries without a "search" key offer a single fixed value.
space = {
    # ne: missing-value encoder
    "ne__numerical_strategy": {"space": ["median"]},
    "ne__categorical_strategy": {"space": ["mode"]},

    # ce: categorical encoder
    "ce__strategy": {
        "search": "choice",
        "space": ["label_encoding", "entity_embedding"],
    },

    # fs: feature selector
    "fs__strategy": {
        "search": "choice",
        "space": ["variance", "rf_feature_importance"],
    },
    "fs__threshold": {"search": "uniform", "space": [0.01, 0.3]},

    # est: estimator
    "est__strategy": {"space": ["RandomForest"]},
    "est__max_depth": {"search": "choice", "space": [3, 5, 7, 9]},
    "est__n_estimators": {"search": "choice", "space": [250, 500, 700, 1000]},
}

In [8]:
# Search the hyper-parameter space, scoring candidates by r2 over 5 folds.
# The third positional argument (10) is the evaluation budget (max_evals in MLBox).
optimiser = Optimiser(scoring="r2", n_folds=5)
best = optimiser.optimise(space, df, 10)

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': 'mode'}
>>> CA ENCODER :{'strategy': 'entity_embedding'}                               
>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.057413996965743154}
>>> ESTIMATOR :{'strategy': 'RandomForest', 'max_depth': 5, 'n_estimators': 250, 'bootstrap': True, 'criterion': 'mse', 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


MEAN SCORE : r2 = 0.4959252502798286                                           
VARIANCE : 0.0119494352256

VARIANCE : 0.011352049394388745 (fold 1 = 0.4941184201454283, fold 2 = 0.48258843303806576, fold 3 = 0.4879805769481118, fold 4 = 0.5048380270434719, fold 5 = 0.47084967585756676)
CPU time: 5.41963267326355 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': 'mode'}
>>> CA ENCODER :{'strategy': 'entity_embedding'}                               
>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.013096644987007706}
>>> ESTIMATOR :{'strategy': 'RandomForest', 'max_depth': 9, 'n_estimators': 250, 'bootstrap': True, 'criterion': 'mse', 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1, 'oob_score': False, 'random_

In [10]:
# show the hyper-parameter combination returned by the optimiser
best

{'ce__strategy': 'label_encoding',
 'est__max_depth': 7,
 'est__n_estimators': 700,
 'est__strategy': 'RandomForest',
 'fs__strategy': 'rf_feature_importance',
 'fs__threshold': 0.23006026541623578,
 'ne__categorical_strategy': 'mode',
 'ne__numerical_strategy': 'median'}

## Predictions

In [11]:
# Fit the pipeline with the selected hyper-parameters and predict on the test dataset.
# fit_predict returns the Predictor object itself, so the trailing semicolon keeps
# its bare repr out of the cell output; the fit/predict log is still printed.
Predictor().fit_predict(best, df);


fitting the pipeline ...
CPU time: 4.8787150382995605 seconds

predicting...
CPU time: 0.59377121925354 seconds

> Overview on predictions : 

   Item_Outlet_Sales_predicted
0                  1755.048998
1                  1542.491869
2                   595.397670
3                  2655.001251
4                  5371.522827
5                  1814.930857
6                  1044.839319
7                  2314.797057
8                  1668.260330
9                  2937.634611

dumping predictions into directory : save ...


<mlbox.prediction.predictor.Predictor at 0x1e14478f198>