In [16]:
import json
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

In [17]:
# Load the Iris dataset with scikit-learn's load_iris(); later cells read
# its .data, .feature_names and .target attributes.
iris = load_iris()

In [18]:
# Build a DataFrame from the feature matrix (one column per feature name)
# and attach the class labels as a 'target' column in the same expression.
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

In [19]:
# Display the assembled DataFrame (the four feature columns plus 'target').
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [20]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the DataFrame.
data.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [21]:
# Declarative run configuration, kept JSON-shaped (only dicts, lists, strings
# and None). Later cells read the column lists, the optional text column, the
# target column name and the hyperparameter grid from here.
ml_steps = {
    "data_file": None,
    "target_column": "target",
    "feature_handling": {
        # Feature names as produced by load_iris()
        "numeric_columns": [
            "sepal length (cm)",
            "sepal width (cm)",
            "petal length (cm)",
            "petal width (cm)",
        ],
        "categorical_columns": [],
    },
    "feature_generation": {"text_column": None},
    "model_building": {
        "hyperparameters": {"n_estimators": [100, 200], "max_depth": [10, 20]},
    },
}

In [22]:
# Pull the numeric and categorical column lists out of the configuration
num_cols, cat_cols = (
    ml_steps['feature_handling'][key]
    for key in ('numeric_columns', 'categorical_columns')
)

In [23]:
# Inspect the numeric column names read from the configuration.
num_cols

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [24]:
# Inspect the categorical column names read from the configuration.
cat_cols

[]

In [25]:
# Pipeline for numeric features: a single StandardScaler step.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

In [26]:
# Pipeline for categorical features: one-hot encoding that ignores
# categories it did not see while fitting (handle_unknown='ignore').
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

In [27]:
# Route each column group through its own transformer pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [28]:
# Feature generation: read the text column name from the configuration
# (the configuration defined in this notebook sets it to None).
text_col = ml_steps['feature_generation']['text_column']

In [29]:
# Pipeline for a text feature: a single CountVectorizer step.
text_transformer = Pipeline([('count', CountVectorizer())])

In [30]:
# Feature generation: run text_transformer on the configured text column.
# The configuration may leave "text_column" as None (it does in this
# notebook); in that case no transformer is registered, so the object never
# tries to select a column called None.
feature_generator = ColumnTransformer(
    transformers=(
        [('text', text_transformer, text_col)] if text_col is not None else []
    )
)


In [31]:
# Combine feature handling and feature generation into one preprocessing step.
# The branches sit side by side in a single ColumnTransformer so that each one
# selects its columns from the original DataFrame. Chaining two
# ColumnTransformers in a Pipeline would pass the first one's array output to
# the second, where the named columns are no longer available.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
    # Add the text branch only when the configuration names a text column.
    + ([('text', text_transformer, text_col)] if text_col is not None else [])
)


In [32]:
# Numeric pipeline (rebinds the earlier name): fill missing values with the
# median via SimpleImputer, then apply StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [33]:
# Preprocessing as one ColumnTransformer: the numeric pipeline on num_cols
# and a one-hot encoder on cat_cols.
preprocessing_pipeline = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', OneHotEncoder(), cat_cols),
])

In [34]:
# Preprocessing used by the cells below (this is the last assignment to
# preprocessing_pipeline; an earlier cell assigns the same name).
# The categorical branch reuses `categorical_transformer`, whose encoder is
# built with handle_unknown='ignore', so the encoding options stay consistent
# with the transformer defined earlier in the notebook.
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])


In [35]:
# Define the model and the hyperparameter search.
# A fixed random_state seeds the forest's randomness so that the selected
# hyperparameters and scores can be reproduced when the notebook is re-run.
RANDOM_STATE = 42
model = RandomForestClassifier(random_state=RANDOM_STATE)

# Grid of candidate values, taken from the configuration
param_grid = ml_steps['model_building']['hyperparameters']

# Search every combination in param_grid using 5-fold cross-validation
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)

In [36]:
# Split the DataFrame into features (X) and labels (y): work on a copy so
# `data` is left untouched, and pop the configured target column out of it.
X = data.copy()
y = X.pop(ml_steps['target_column'])

In [37]:
# Fit the preprocessing on all rows of X and transform them in one call.
# NOTE(review): this runs before the cross-validated search below, so the
# preprocessing is fitted on rows that later act as validation folds.
# Consider passing a Pipeline of (preprocessing, model) to GridSearchCV instead.
preprocessed_X = preprocessing_pipeline.fit_transform(X)

In [38]:
# Run the grid search on the preprocessed features and the target labels.
grid_search.fit(preprocessed_X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 20], 'n_estimators': [100, 200]})

In [39]:
# Report the best parameter combination found by the search and its score
print(
    f"Best hyperparameters: {grid_search.best_params_}",
    f"Best accuracy: {grid_search.best_score_}",
    sep="\n",
)


Best hyperparameters: {'max_depth': 10, 'n_estimators': 100}
Best accuracy: 0.9666666666666668
