In [1]:
from nicefitbro.ingestors.local_ingestor import LocalFileIngestor
from nicefitbro.preprocess.missing_value_processor import MissingValuePreprocessor
from nicefitbro.preprocess.outlier_detector import OutlierDetector
from nicefitbro.feature_engineering.categorical_encoding import CategoricalEncoder
from nicefitbro.feature_engineering.correlation_analysis import CorrelationAnalysis
from nicefitbro.feature_engineering.feature_scaling import FeatureScaler
from nicefitbro.feature_engineering.feature_selection import FeatureSelection
from nicefitbro.feature_engineering.feature_transformations import FeatureTransformer
from nicefitbro.pipeliners.fe_pipeliner import FtEngineeringPipeliner
from nicefitbro.pipeliners.preprocessor_pipeliner import PreprocessorPipeliner
from nicefitbro.pipeliners.prepper import DataPrepper
from nicefitbro.models.auto_model import AutoModel
from nicefitbro.config.run_config import RunConfig

## Config

In [2]:
# Central run settings: the cells below read their parameters from
# `run_config` rather than hard-coding them.
run_config = RunConfig(
    target="medv",
    file_path="../data/boston_housing.csv",
    missing_value_method="median",
    outlier_detector_method="lof",
    feature_transformer_method="log",
    feature_transformer_features=["dis", "rm", "crim"],
    feature_scaler_method="minmax",
    # Keys of the models that AutoModel is asked to evaluate.
    model_types=[
        "lr",
        "ridge",
        "lasso",
        "elastic",
        "bayesridge",
        "knn",
        "dtr",
        "rfr",
        "gbr",
        "xgb",
        "poly",
    ],
)

# Preprocessor Pipeline Example

First, choose the method of data ingestion. The target column and file path are already defined in `run_config` above.

In [3]:
# Ingestor instance; it is handed to DataPrepper as its first argument further down.
local_file_ingestor = LocalFileIngestor()

Next, define the preprocessing steps you'd like to perform.

In [4]:
# Both preprocessing steps take their method name from the run configuration.
missing_value_preprocessor = MissingValuePreprocessor(
    method=run_config.missing_value_method,
)
outlier_detector = OutlierDetector(
    method=run_config.outlier_detector_method,
)

Then, define the feature engineering steps you'd like to perform.

In [5]:
# Transformation is applied only to the features listed in the run configuration.
feature_transformer = FeatureTransformer(
    features=run_config.feature_transformer_features,
    method=run_config.feature_transformer_method,
)

# NOTE(review): `feature_selector` is created here but is not among the steps
# passed to FtEngineeringPipeliner in the next code cell, and
# `feature_selector_k` is not set explicitly in the RunConfig(...) call —
# presumably RunConfig provides a default; confirm.
feature_selector = FeatureSelection(k=run_config.feature_selector_k)

feature_scaler = FeatureScaler(method=run_config.feature_scaler_method)

Create the pipeliner objects.

In [6]:
# Each pipeliner receives a list of step objects.
# Presumably the steps run in list order — confirm against the pipeliner classes.
preprocessor_pipeliner = PreprocessorPipeliner(
    [
        missing_value_preprocessor,
        outlier_detector,
    ]
)
fe_pipeliner = FtEngineeringPipeliner(
    [
        feature_transformer,
        feature_scaler,
    ]
)

Finally, create a `DataPrepper` object that takes an ingestor, a preprocessing pipeline, a feature engineering pipeline, and the name of the target column.

In [7]:
# Positional arguments, in order: ingestor, preprocessing pipeline,
# feature-engineering pipeline, target column name.
data_prepper = DataPrepper(
    local_file_ingestor,
    preprocessor_pipeliner,
    fe_pipeliner,
    run_config.target,
)

Execute the pipeline.

In [8]:
# Run the prepper on the configured file path; the result is displayed in the
# next cell and passed to AutoModel afterwards.
# NOTE(review): the saved output of this cell shows a pandas
# SettingWithCopyWarning triggered by `data[col] = features_transformed[col]` —
# presumably inside the nicefitbro library; confirm and fix it there.
processed_data = data_prepper.load_and_preprocess_data(run_config.file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = features_transformed[col]


View the processed dataset.

In [10]:
# Bare last expression so the notebook renders the processed table as rich output.
processed_data

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.000000,0.18,0.067815,0.0,0.314815,0.665067,0.641607,0.511192,0.000000,0.206501,0.287234,1.000000,0.089680,0.414414
1,0.004795,0.00,0.242302,0.0,0.172840,0.638140,0.782698,0.604462,0.043478,0.103250,0.553191,1.000000,0.204470,0.360360
2,0.004791,0.00,0.242302,0.0,0.172840,0.766601,0.599382,0.604462,0.043478,0.103250,0.553191,0.989737,0.063466,0.655405
3,0.005937,0.00,0.063050,0.0,0.150206,0.736302,0.441813,0.703314,0.086957,0.065010,0.648936,0.994276,0.033389,0.626126
4,0.014047,0.00,0.063050,0.0,0.150206,0.760500,0.528321,0.703314,0.086957,0.065010,0.648936,1.000000,0.099338,0.689189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,0.012647,0.00,0.420455,0.0,0.386831,0.668178,0.681771,0.287877,0.000000,0.162524,0.893617,0.987619,0.219095,0.378378
462,0.008821,0.00,0.420455,0.0,0.386831,0.583858,0.760041,0.254728,0.000000,0.162524,0.893617,1.000000,0.202815,0.337838
463,0.012238,0.00,0.420455,0.0,0.386831,0.732691,0.907312,0.232913,0.000000,0.162524,0.893617,1.000000,0.107892,0.412162
464,0.022693,0.00,0.420455,0.0,0.386831,0.702431,0.889804,0.272550,0.000000,0.162524,0.893617,0.991301,0.131071,0.369369


## AutoML

In [9]:
# Positional arguments, in order: prepared data, model type keys, target column name.
am = AutoModel(
    processed_data,
    run_config.model_types,
    run_config.target,
)

# Last expression of the cell, so its return value is displayed; the saved
# output is a dict of R2 and RMSE values keyed by model type.
am.auto_model()

{'lr': {'R2': 0.6479612630397704, 'RMSE': 0.08449673880995816},
 'ridge': {'R2': 0.6724268497478131, 'RMSE': 0.08150774330671319},
 'lasso': {'R2': -0.202785091895185, 'RMSE': 0.156184790242858},
 'elastic': {'R2': 0.36289725727480415, 'RMSE': 0.11367094166392348},
 'bayesridge': {'R2': 0.6563192907049062, 'RMSE': 0.0834876618481901},
 'knn': {'R2': 0.6812685213593017, 'RMSE': 0.08040021297064938},
 'dtr': {'R2': 0.6925346393526808, 'RMSE': 0.07896648698562989},
 'rfr': {'R2': 0.8385891504732534, 'RMSE': 0.05721515542400262},
 'gbr': {'R2': 0.836598059800736, 'RMSE': 0.0575669638522388},
 'xgb': {'R2': 0.8688426542657365, 'RMSE': 0.051575212968683316},
 'poly': {'R2': 0.7152781023609055, 'RMSE': 0.07598977439849326}}