In [84]:
import pandas as pd
from striprtf.striprtf import rtf_to_text
import json
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, ElasticNet, Lasso, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.decomposition import PCA

from sklearn.metrics import make_scorer, f1_score
from sklearn.pipeline import Pipeline

In [70]:
# Reading JSON
with open('algoparams_from_ui.json.rtf', 'r') as f: 
    rtfText = f.read() 
plainText = rtf_to_text(rtfText)
mainDict = json.loads(plainText)

In [71]:
# Extracting the Dicts in mainDict
session_info = mainDict['design_state_data']['session_info']
target = mainDict['design_state_data']['target']
train = mainDict['design_state_data']['train']
metrics = mainDict['design_state_data']['metrics']
feature_handling = mainDict['design_state_data']['feature_handling']
feature_generation = mainDict['design_state_data']['feature_generation']
feature_reduction = mainDict['design_state_data']['feature_reduction']
hyperparameters = mainDict['design_state_data']['hyperparameters']
weighting_stratergy = mainDict['design_state_data']['weighting_stratergy']
probability_calibration = mainDict['design_state_data']['probability_calibration']
algorithms = mainDict['design_state_data']['algorithms']

In [72]:
# Modifying algorithms to test
algorithms['RandomForestClassifier']['is_selected'] = True
algorithms['GBTRegressor']['is_selected'] = True
algorithms['GBTRegressor']['use_deviance'] = False

In [73]:
# Get the target
def getTarget(targetDict):
    target = targetDict['target']
    regtype = targetDict['type']
    return target, regtype

In [74]:
# Helper for GBT
def lossGBT(algoDict, typ):
    if typ == 'clf':
        if algoDict['GBTClassifier']['use_deviance'] == True:
            return 'deviance'
        elif algoDict['GBTClassifier']['use_exponential'] == True:
            return 'exponential'
        else:
            return 'exponential'
    elif typ == 'reg':
        if algoDict['GBTRegressor']['use_deviance'] == True:
            return 'deviance'
        elif algoDict['GBTRegressor']['use_exponential'] == True:
            return 'exponential'
        else:
            return 'exponential' 

In [75]:
# Helper for DT
def critDT(algoDict, typ):
    if typ == 'clf':
        if algoDict['DecisionTreeClassifier']['use_gini'] == True:
            return 'gini'
        elif algoDict['DecisionTreeClassifier']['use_entropy'] == True:
            return 'entropy'
        else:
            return 'gini'
    elif typ == 'reg':
        if algoDict['DecisionTreeRegressor']['use_gini'] == True:
            return 'gini'
        elif algoDict['DecisionTreeRegressor']['use_entropy'] == True:
            return 'entropy'
        else:
            return 'gini'

In [76]:
# Helper for DT
def splitDT(algoDict, typ):
    if typ == 'clf':
        if algoDict['DecisionTreeClassifier']['use_best'] == True:
            return 'best'
        elif algoDict['DecisionTreeClassifier']['use_random'] == True:
            return 'random'
        else:
            return 'best'
    elif typ == 'reg':
        if algoDict['DecisionTreeRegressor']['use_best'] == True:
            return 'best'
        elif algoDict['DecisionTreeRegressor']['use_random'] == True:
            return 'random'
        else:
            return 'best'

In [77]:
# Helper for SVM
def kernelSVM(algoDict):
    if algoDict['SVM']['linear_kernel'] == True:
        return 'linear'
    elif algoDict['SVM']['polynomial_kernel'] == True:
        return 'poly'
    elif algoDict['SVM']['sigmoid_kernel'] == True:
        return 'sigmoid'
    else:
        return 'rbf'

In [78]:
# Helper for SVM
def gammaSVM(algoDict):
    if algoDict['SVM']['auto'] == True:
        return 'auto'
    elif algoDict['SVM']['scale'] == True:
        return 'scale'
    else:
        return 'scale'

In [79]:
# Helper for SGD
def lossSGD(algoDict):
    if algoDict['SGD']['use_logistics'] == True:
        return 'log'
    elif algoDict['SGD']['use_modified_huber_loss'] == True:
        return 'modified_huber'
    else:
        return 'hinge'

In [80]:
# Helper for SGD
def penaltySGD(algoDict):
    if algoDict['SGD']['use_elastic_net_regularization'] == True:
        return 'elasticnet'
    elif algoDict['SGD']['use_l1_regularization'] == 'on':
        return 'l1'
    else:
        return 'l2'

In [81]:
# Helper for kNN
def weightKNN(algoDict):
    if algoDict['KNN']['distance_weighting'] == True:
        return 'distance'
    else:
        return 'uniform'

In [82]:
# Take in modelStr to return ScikitLearn object
def transformStrToModelObj(modelStr, algoDict, regtype):
    if modelStr == 'RandomForestClassifier':
        return RandomForestClassifier(
            n_estimators = algoDict[modelStr]['max_trees'],
            max_depth = algoDict[modelStr]['max_depth'],
            min_samples_split = algoDict[modelStr]['min_samples_per_leaf_min_value']
        )
    elif modelStr == 'RandomForestRegressor':
        return RandomForestRegressor(
            n_estimators = algoDict[modelStr]['max_trees'],
            max_depth = algoDict[modelStr]['max_depth'],
            min_samples_split = algoDict[modelStr]['min_samples_per_leaf_min_value']
        )
    elif modelStr == 'GBTClassifier':
        return GradientBoostingClassifier(
            subsample = algoDict[modelStr]['min_subsample'],
            max_depth = algoDict[modelStr]['max_depth'],
            loss = lossGBT(algoDict, 'clf')
        )
    elif modelStr == 'GBTRegressor':
        return GradientBoostingRegressor(
            subsample = algoDict[modelStr]['min_subsample'],
            max_depth = algoDict[modelStr]['max_depth'],
            loss = lossGBT(algoDict, 'reg')
        )
    elif modelStr == 'LinearRegression':
        return LinearRegression()
    
    elif modelStr == 'LogisticRegression':
        return LogisticRegression(
            max_iter = algoDict[modelStr]['max_iter'],
            penalty='elasticnet',
            l1_ratio=algoDict[modelStr]['max_elasticnet']
        )
    elif modelStr == 'RidgeRegression':
        if type(algoDict[modelStr]['regularization_term']) in [int, float]:
            return Ridge(
                alpha = algoDict[modelStr]['regularization_term'],
                max_iter = algoDict[modelStr]['max_iter']
            )
        else:
            return Ridge(
                max_iter = algoDict[modelStr]['max_iter']
            )
    elif modelStr == 'LassoRegression':
        if type(algoDict[modelStr]['regularization_term']) in [int, float]:
            return Lasso(
                alpha = algoDict[modelStr]['regularization_term'],
                max_iter = algoDict[modelStr]['max_iter']
            )
        else:
            return Lasso(
                max_iter = algoDict[modelStr]['max_iter']
            )
    elif modelStr == 'ElasticNetRegression':
        if type(algoDict[modelStr]['regularization_term']) in [int, float]:
            return ElasticNet(
                alpha = algoDict[modelStr]['regularization_term'],
                max_iter = algoDict[modelStr]['max_iter']
            )
        else:
            return ElasticNet(
                max_iter = algoDict[modelStr]['max_iter']
            )
    elif modelStr == 'xgboost':
        if regtype == 'regression':
            if algoDict[modelStr]['dart'] == True:
                return XGBRegressor(
                    max_depth = algoDict[modelStr]['max_depth_of_tree'][0],
                    learning_rate = algoDict[modelStr]['learningRate'],
                    booster = 'dart',
                    gamma = algoDict[modelStr]['gamma'][0],
                    min_child_weight = algoDict[modelStr]['min_child_weight'][0],
                    reg_alpha = algoDict[modelStr]['l1_regularization'],
                    reg_lambda = algoDict[modelStr]['l2_regularization'],
                    random_state = algoDict[modelStr]['random_state'],
                    early_stopping_rounds = algoDict[modelStr]['early_stopping_rounds']
                    
                )
            else:
                return XGBRegressor(
                    max_depth = algoDict[modelStr]['max_depth_of_tree'][0],
                    learning_rate = algoDict[modelStr]['learningRate'],
                    gamma = algoDict[modelStr]['gamma'][0],
                    min_child_weight = algoDict[modelStr]['min_child_weight'][0],
                    reg_alpha = algoDict[modelStr]['l1_regularization'],
                    reg_lambda = algoDict[modelStr]['l2_regularization'],
                    random_state = algoDict[modelStr]['random_state'],
                    early_stopping_rounds = algoDict[modelStr]['early_stopping_rounds']
                    
                )
        elif regtype == 'classification':
            if algoDict[modelStr]['dart'] == True:
                return XGBClassifier(
                    max_depth = algoDict[modelStr]['max_depth_of_tree'][0],
                    learning_rate = algoDict[modelStr]['learningRate'],
                    booster = 'dart',
                    gamma = algoDict[modelStr]['gamma'][0],
                    min_child_weight = algoDict[modelStr]['min_child_weight'][0],
                    reg_alpha = algoDict[modelStr]['l1_regularization'],
                    reg_lambda = algoDict[modelStr]['l2_regularization'],
                    random_state = algoDict[modelStr]['random_state'],
                    early_stopping_rounds = algoDict[modelStr]['early_stopping_rounds']
                    
                )
            else:
                return XGBClassifier(
                    max_depth = algoDict[modelStr]['max_depth_of_tree'][0],
                    learning_rate = algoDict[modelStr]['learningRate'],
                    gamma = algoDict[modelStr]['gamma'][0],
                    min_child_weight = algoDict[modelStr]['min_child_weight'][0],
                    reg_alpha = algoDict[modelStr]['l1_regularization'],
                    reg_lambda = algoDict[modelStr]['l2_regularization'],
                    random_state = algoDict[modelStr]['random_state'],
                    early_stopping_rounds = algoDict[modelStr]['early_stopping_rounds']
                    
                )
    elif modelStr == 'DecisionTreeClassifier':
        return DecisionTreeClassifier(
            max_depth = algoDict[modelStr]['max_depth'],
            min_samples_leaf= algoDict[modelStr]['min_samples_per_leaf'][0],
            criterion = critDT(algoDict, 'clf'),
            splitter = splitDT(algoDict, 'clf')
        )
    elif modelStr == 'DecisionTreeRegressor':
        return DecisionTreeRegressor(
            max_depth = algoDict[modelStr]['max_depth'],
            min_samples_leaf= algoDict[modelStr]['min_samples_per_leaf'][0],
            splitter = splitDT(algoDict, 'reg')
        )
    elif modelStr == 'SVM':
        if regtype == 'regression':
            return SVR(
                kernel = kernelSVM(algoDict),
                C = algoDict[modelStr]['c_value'][0],
                tol = algoDict[modelStr]['tolerance'],
                max_iter= algoDict[modelStr]['max_iterations'],
                gamma = gammaSVM(algoDict)
            )
        elif regtype == 'classification':
            return SVC(
                kernel = kernelSVM(algoDict),
                C = algoDict[modelStr]['c_value'][0],
                tol = algoDict[modelStr]['tolerance'],
                max_iter= algoDict[modelStr]['max_iterations'],
                gamma = gammaSVM(algoDict)
            )
    elif modelStr == 'SGD':
        return SGDClassifier(
            tol = algoDict[modelStr]['tolerance'],
            alpha = algoDict[modelStr]['alpha_value'][0],
            loss = lossSGD(algoDict),
            penalty = penaltySGD(algoDict)

        )
    elif modelStr == 'KNN':
        if regtype == 'regression':
            return KNeighborsRegressor(
                n_neighbors = algoDict[modelStr]['k_value'][0],
                p = algoDict[modelStr]['p_value'],
                weights = weightKNN(algoDict)

            )
        elif regtype == 'classification':
            return KNeighborsClassifier(
                n_neighbors = algoDict[modelStr]['k_value'][0],
                p = algoDict[modelStr]['p_value'],
                weights = weightKNN(algoDict)
            )
    elif modelStr == 'extra_random_trees':
        if regtype == 'regression':
            return ExtraTreesRegressor(
                n_estimators = algoDict[modelStr]['num_of_trees'][0],
                max_depth = algoDict[modelStr]['max_depth'[0]],
                min_samples_leaf = algoDict[modelStr]['min_sample_per_leaf'][0]
            )
        elif regtype == 'classification':
            return ExtraTreesClassifier(
                n_estimators = algoDict[modelStr]['num_of_trees'][0],
                max_depth = algoDict[modelStr]['max_depth'[0]],
                min_samples_leaf = algoDict[modelStr]['min_sample_per_leaf'][0]
            )
    elif modelStr == 'neural_netwrok':
        if regtype == 'regression':
            return MLPRegressor(
                hidden_layer_sizes = algoDict[modelStr]['hidden_layer_sizes'],
                alpha = algoDict[modelStr]['alpha_value'],
                max_iter = algoDict[modelStr]['max_iterations'],
                tol = algoDict[modelStr]['convergence_tolerance'],
                early_stopping = algoDict[modelStr]['early_stopping'],
                shuffle = algoDict[modelStr]['shuffle_data'],
                learning_rate_init = algoDict[modelStr]['initial_learning_rate'],
                beta_1 = algoDict[modelStr]['beta_1'],
                beta_2 = algoDict[modelStr]['beta_2'],
                epsilon = algoDict[modelStr]['epsilon'],
                power_t = algoDict[modelStr]['power_t'],
                momentum = algoDict[modelStr]['momentum'],
            )
        elif regtype == 'classification':
            return MLPClassifier(
                hidden_layer_sizes = algoDict[modelStr]['hidden_layer_sizes'],
                alpha = algoDict[modelStr]['alpha_value'],
                max_iter = algoDict[modelStr]['max_iterations'],
                tol = algoDict[modelStr]['convergence_tolerance'],
                early_stopping = algoDict[modelStr]['early_stopping'],
                shuffle = algoDict[modelStr]['shuffle_data'],
                learning_rate_init = algoDict[modelStr]['initial_learning_rate'],
                beta_1 = algoDict[modelStr]['beta_1'],
                beta_2 = algoDict[modelStr]['beta_2'],
                epsilon = algoDict[modelStr]['epsilon'],
                power_t = algoDict[modelStr]['power_t'],
                momentum = algoDict[modelStr]['momentum'],
            )

In [42]:
# Use getTarget() and transformStrToModelObj() to return model object list
def getAlgorithm(algoDict, targetDict):
    _, regtype = getTarget(targetDict)
    if regtype.lower() == 'regression':
        possibleModels = ['RandomForestRegressor', 'GBTRegressor', 'LinearRegression',
                          'RidgeRegression', 'LassoRegression', 'ElasticNetRegression',
                          'xg_boost', 'DecisionTreeRegressor', 'neural_network',
                          'SVM', 'KNN', 'extra_random_trees']
        selectedModels = []
        for models in possibleModels:
            if algoDict[models]['is_selected'] == True:
                selectedModels.append(models)
        modelsObjDict = {}
        for models in selectedModels:
            modelsObjDict[models] = transformStrToModelObj(models, algorithms, regtype)
        return modelsObjDict
        

    elif regtype.lower() == 'classification':
        possibleModels = ['RandomForestClassifier', 'GBTClassifier', 'LogisticRegression',
                          'xg_boost', 'DecisionTreeClassifier', 'neural_network',
                          'SVM', 'SGD', 'KNN', 'extra_random_trees']
        selectedModels = []
        for models in possibleModels:
            if algoDict[models]['is_selected'] == True:
                selectedModels.append(models)
        for i in selectedModels:
            modelsObjDict[i] = transformStrToModelObj(models, algorithms, regtype)
        return modelsObjDict

In [43]:
getAlgorithm(algorithms, target)

{'RandomForestRegressor': RandomForestRegressor(max_depth=25, min_samples_split=5, n_estimators=20),
 'GBTRegressor': GradientBoostingRegressor(loss='exponential', max_depth=7, subsample=1)}