## Reading and getting the df ready etc

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("dataset1.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
str(df['Education'].unique())

'[2 1 4 3 5]'

## Helper functions

In [4]:
import pandas as pd
import random


# 1. Train-Test-Split, 
# test size is the percentage of the dataset that should be in the test set
def train_test_split(df, test_size):
    # make sure it is a float and get the number of instances in the test set
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))
    # get the indices for the test set
    indices = df.index.tolist()
    # choose them randomly
    test_indices = random.sample(population=indices, k=test_size)
    # separate into test and train
    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df


# 2. Distinguish categorical and continuous features
def determine_type_of_feature(df):
    # in order to properly split the nodes, we need to know if a feature is categorical or continuous
    feature_types = []
    # this threshold is used to determine if a feature is categorical, it can be changed
    n_unique_values_treshold = 15
    for feature in df.columns:
        # get all features except for the label
        if feature != "label":
            unique_values = df[feature].unique()
            example_value = unique_values[0]
            # we use the number of unique values and the type of the first value to determine the type of the feature
            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")
    
    return feature_types


# 3. Accuracy
def calculate_accuracy(predictions, labels):
    # calculate the accuracy of the predictions
    # we compare the predictions to the labels and count the number of correct predictions
    predictions_correct = predictions == labels
    accuracy = predictions_correct.mean()
    
    return accuracy

# another method to calculate accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

def precision(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fp = np.sum((1 - y_true) * y_pred)
    return tp / (tp + fp)

def sensitivity(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fn = np.sum(y_true * (1 - y_pred))
    return tp / (tp + fn)

def specificity(y_true, y_pred):
    tn = np.sum((1 - y_true) * (1 - y_pred))
    fp = np.sum((1 - y_true) * y_pred)
    return tn / (tn + fp)

def f_score(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fn = np.sum(y_true * (1 - y_pred))
    fp = np.sum((1 - y_true) * y_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

# confusion matrix containing true positives, false negatives, false positives and true negatives
def confusion_matrix(y_true, y_pred):
    tp = np.sum(y_true * y_pred)
    fn = np.sum(y_true * (1 - y_pred))
    fp = np.sum((1 - y_true) * y_pred)
    tn = np.sum((1 - y_true) * (1 - y_pred))
    return np.array([[tp, fn], [fp, tn]])


## Decision tree

In [5]:
# 1. Decision Tree helper functions 

# 1.1 Data pure
def check_purity(data):
    # check if labels are pure, i.e. all belong to the same class
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

    
# 1.2 Classify
def classify_data(data):
    # get the labels of the data and return the most common one (majority vote)
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    
    return classification


# 1.3 Potential splits?
def get_potential_splits(data, random_subspace):
    
    potential_splits = {}
    _, n_columns = data.shape
    column_indices = list(range(n_columns - 1))    # excluding the last column which is the label
    
    # if we decide to take a limited amount of features, we randomly choose a subset of them, this is only used for random forests
    if random_subspace and random_subspace <= len(column_indices):
        column_indices = random.sample(population=column_indices, k=random_subspace)
    
    # we iterate over all columns and find the unique values in each column, in order to determine the splits in our decision tree
    for column_index in column_indices:          
        values = data[:, column_index]
        unique_values = np.unique(values)
        
        potential_splits[column_index] = unique_values
    
    return potential_splits


# 1.4 Lowest Overall Entropy?
def calculate_entropy(data):
    # simply calculate the entropy of the data
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
     
    return entropy

# this is done to calculate the entropy of the split leaves
def calculate_overall_entropy(data_below, data_above):
    # data below is the left leaf and data above is the right leaf
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n
    
    #calculate both their entropy, sum them and return the overall entropy
    overall_entropy =  (p_data_below * calculate_entropy(data_below) 
                      + p_data_above * calculate_entropy(data_above))
    
    return overall_entropy


def determine_best_split(data, potential_splits):
    # we iterate over all potential splits and calculate the overall entropy of the split leaves
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            # remove empty lines
            if(len(data_below)==0 or len(data_above)==0):
                break
            
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)
            #compare last calculated overall entropy with the current one, if it is lower, we update the best split
            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    #once we have iterated over all potential splits and compared them, we return the best split
    return best_split_column, best_split_value


# 1.5 Split data
def split_data(data, split_column, split_value):
    
    split_column_values = data[:, split_column]
    #check the type of the feature whether it is continuous or categorical
    type_of_feature = FEATURE_TYPES[split_column]
    if type_of_feature == "continuous":
        #if continuous we compare in values larger or smaller
        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values >  split_value]
    
    # feature is categorical   
    else:
        #if feature is categorical we compare in values equal or not equal
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]
    # we return the leaves
    return data_below, data_above


# 2. Decision Tree Algorithm
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5, random_subspace=None):
    
    # data preparations
    if counter == 0:
        # we need to store the data globally, so we can use it in the helper functions
        global COLUMN_HEADERS, FEATURE_TYPES
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df           
    
    
    # base cases
    # since our loop is recursive, this will be our end case for a leaf node
    # if the node only has data of one class, we return the class and stop splitting, it will then be a terminal leaf node
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = get_potential_splits(data, random_subspace)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
        # check for empty data
        if len(data_below) == 0 or len(data_above) == 0:
            classification = classify_data(data)
            # the node cannod be split anymore, so we return the class, it is a terminal node
            return classification
        
        # determine question
        feature_name = COLUMN_HEADERS[split_column]
        type_of_feature = FEATURE_TYPES[split_column]
        # feature is continuous, we use <=
        if type_of_feature == "continuous":
            question = "{}__<=__{}".format(feature_name, split_value)
            
        # feature is categorical, we use =
        else:
            question = "{}__=__{}".format(feature_name, split_value)
        
        # instantiate sub-tree
        sub_tree = {question: []}
        
        # find answers (recursion)
        # yes is the left leaf, no is the right leaf
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth, random_subspace)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth, random_subspace)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base case).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        # we construct the tree and return it
        return sub_tree


# 3. Make predictions
# 3.1 One example
def predict_example(example, tree):
    # this is to predict only one instance
    # get the node questions in order to classify
    question = list(tree.keys())[0]
    #print(question + '\n')
    
    #print(example.dtype)
    feature_name, comparison_operator, value = question.split("__")

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            # we go to the left leaf
            answer = tree[question][0]
        else:
            # we go to the right leaf
            answer = tree[question][1]

    # base case
    # if the answer is not a sub-tree, that means it's a terminal node, we return the classification
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    # we go deeper into the tree using the sub-tree returned in the answer
    else:
        residual_tree = answer
        return predict_example(example, residual_tree)

    
# 3.2 All examples of the test data
def decision_tree_predictions(test_df, tree):
    # we predict all the instances in the test data using a loop
    predictions = test_df.apply(predict_example, args=(tree,), axis=1)
    return predictions

## Using the functions

In [6]:
from pprint import pprint


In [7]:
data = pd.read_csv("new_dataset.csv");
data['label'] = data.Attrition
data = data.drop("Attrition", axis=1)

#data.columns = data.columns.str.replace(' ', '_')

data.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,label
0,0.547619,1.0,0.71582,Sales,0.0,0.25,Life Sciences,0.333333,0.0,0.914286,...,0.0,0.0,0.0,0.0,0.0,0.15,0.222222,0.0,0.344828,1
1,0.814978,0.5,0.185735,Research & Development,0.275862,0.157895,Life Sciences,0.727273,1.0,0.606401,...,1.0,0.5,0.025,0.5,0.75,0.25,0.388889,0.066667,0.482759,0
2,0.613136,1.0,0.915934,Research & Development,0.068966,0.380435,Other,1.0,1.0,0.919512,...,0.5,0.0,0.025,0.5,0.75,0.0,0.0,0.0,0.0,1
3,0.545855,0.5,0.92861,Research & Development,0.103448,0.793478,Life Sciences,1.0,0.0,0.557316,...,0.75,0.0,0.025,0.5,0.75,0.2,0.388889,0.2,0.0,0
4,0.44495,1.0,0.394188,Research & Development,0.068966,0.173913,Medical,0.181818,1.0,0.396637,...,1.0,0.5,0.025,0.5,0.75,0.05,0.111111,0.133333,0.137931,0


In [8]:

train_df, test_df = train_test_split(data, test_size=0.2)

test_df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,label
375,0.815757,1.0,0.84122,Research & Development,0.241379,0.586957,Other,0.47619,1.0,0.307939,...,0.75,0.0,0.1,0.333333,0.75,0.138889,0.117647,0.0,0.0,0
1469,1.0,1.0,1.0,Research & Development,1.0,1.0,Medical,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
1146,0.564516,0.5,0.284917,Research & Development,0.344828,0.793478,Life Sciences,0.738095,1.0,0.418267,...,0.25,0.5,0.4,0.5,0.75,0.391304,0.411765,0.5,0.137931,0
934,0.413771,1.0,0.177533,Research & Development,0.034483,0.586957,Medical,1.0,0.0,0.398208,...,1.0,0.0,0.25,0.5,0.5,0.055556,0.117647,0.133333,0.068966,0
123,0.849172,1.0,0.456279,Research & Development,0.206897,0.586957,Life Sciences,0.214286,1.0,0.508536,...,0.75,0.0,0.025,0.833333,0.75,0.5,1.0,1.0,1.0,0


In [9]:

tree = decision_tree_algorithm(train_df, max_depth=15)

predictions = decision_tree_predictions(test_df, tree)

#test_df.head()

In [10]:
#test_df['label'] = data[data.index.isin(list(test_df.index))]
accuracy_ = calculate_accuracy(predictions, test_df.label)

print("Accuracy = {}".format(accuracy_))


Accuracy = 0.7244897959183674


In [11]:
f_score(predictions, test_df.label)

0.3305785123966943

In [12]:
pd.DataFrame(confusion_matrix(predictions, test_df.label))

Unnamed: 0,0,1
0,20,51
1,30,193


## Random Forest

In [13]:
# creating the dataset for each decision tree of the forest
def bootstrapping(train_df, n_bootstrap):
    bootstrap_indices = np.random.randint(low=0, high=len(train_df), size=n_bootstrap)
    df_bootstrapped = train_df.iloc[bootstrap_indices]
    
    return df_bootstrapped

# creating the forest
def random_forest_algorithm(train_df,dt_max_depth, n_trees=10, n_bootstrap=250, n_features=None):
    forest = []
    for i in range(n_trees):
        # we generate a sub dataset for each tree
        df_bootstrapped = bootstrapping(train_df, n_bootstrap)
        # we train each tree using the sub dataset
        tree = decision_tree_algorithm(df_bootstrapped, max_depth=dt_max_depth, random_subspace=n_features)
        # we add the tree to the forest
        forest.append(tree)
    
    return forest

def random_forest_predictions(test_df, forest):
    df_predictions = {}
    for i in range(len(forest)):
        # we organize every prediction in a dictionary
        column_name = "tree_{}".format(i)
        predictions = decision_tree_predictions(test_df, tree=forest[i])
        df_predictions[column_name] = predictions
    # we create a dataframe from the dictionary of predictions
    df_predictions = pd.DataFrame(df_predictions)
    # we make the final prediction by taking the mode of the predictions, aka majority vote
    random_forest_predictions = df_predictions.mode(axis=1)[0]
    
    return random_forest_predictions


In [14]:
forest = random_forest_algorithm(train_df, n_trees=10, n_bootstrap=250, n_features=None, dt_max_depth=20)

predictions = random_forest_predictions(test_df, forest)
accuracy_ = calculate_accuracy(predictions, test_df.label)

print("Accuracy = {}".format(accuracy_))


Accuracy = 0.8537414965986394


In [15]:
test_df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,label
375,0.815757,1.0,0.84122,Research & Development,0.241379,0.586957,Other,0.47619,1.0,0.307939,...,0.75,0.0,0.1,0.333333,0.75,0.138889,0.117647,0.0,0.0,0
1469,1.0,1.0,1.0,Research & Development,1.0,1.0,Medical,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
1146,0.564516,0.5,0.284917,Research & Development,0.344828,0.793478,Life Sciences,0.738095,1.0,0.418267,...,0.25,0.5,0.4,0.5,0.75,0.391304,0.411765,0.5,0.137931,0
934,0.413771,1.0,0.177533,Research & Development,0.034483,0.586957,Medical,1.0,0.0,0.398208,...,1.0,0.0,0.25,0.5,0.5,0.055556,0.117647,0.133333,0.068966,0
123,0.849172,1.0,0.456279,Research & Development,0.206897,0.586957,Life Sciences,0.214286,1.0,0.508536,...,0.75,0.0,0.025,0.833333,0.75,0.5,1.0,1.0,1.0,0


In [16]:
values = [0.6,1.0,0.8,'Sales',0.3,0.5,'Other',0,1,0.75,0.66,0.25,'Sales Executive',0.72,'Married',0.2,0.5,0.11,'Yes',0.0,1.0,0.5,0.025,0.5,0.75,0.25,0.3,0.0,0.25]
series = pd.Series(values, index = test_df.columns.delete(-1))
series= series.to_frame().T

In [17]:
prediction = random_forest_predictions(series, forest)
prediction[0]

0

In [18]:
accuracy(test_df.label, predictions)

0.8537414965986394

In [19]:
sensitivity(test_df.label, predictions)

0.16

In [20]:
specificity(test_df.label, predictions)

0.9959016393442623

In [21]:
f_score(test_df.label, predictions)

0.27118644067796616

In [22]:
confusion_matrix(test_df.label, predictions)

array([[  8.,  42.],
       [  1., 243.]])

## Grid search but manual

In [23]:
import itertools
import timeit

In [45]:
def gridsearch(algo_name, algorithm, parameters):
        # we create a list of all the possible combinations of parameters
        parameter_values = [v for v in parameters.values()]
        combinations = list(itertools.product(*parameter_values))
        # we create a dataframe to store the results
        columns = ["parameters"] + list(parameters.keys()) + ["score"] + ["accuracy"] + ["specificity"] + ["sensitivity"] + ["training_time"] + ["prediction_time"]
        df_results = pd.DataFrame(columns=columns)
        # we loop over all the combinations of parameters
        for c in combinations:
                # we create a dictionary of parameters
                params = dict(zip(parameters.keys(), c))
                # we train the model with the current combination of parameters
                start = timeit.default_timer()
                #print(params)
                try:
                    model = algorithm(train_df,**params)
                    stop = timeit.default_timer()
                    # we calculate the time it took to train the model
                    training_time = round(stop - start,3)
                    # we predict the test set
                    if(algo_name == "random_forest_algorithm"):
                        start=timeit.default_timer()
                        predictions = random_forest_predictions(test_df, model)
                        stop=timeit.default_timer()
                        # we calculate the time it took to predict the test set
                        prediction_time = round(stop - start,3)
                    elif(algo_name == "decision_tree_algorithm"):
                        start=timeit.default_timer()
                        predictions = decision_tree_predictions(test_df, model)
                        stop=timeit.default_timer()
                        # we calculate the time it took to predict the test set
                        prediction_time = round(stop - start,3)
                    # we calculate the accuracy
                    acc = accuracy(test_df.label,predictions )
                    score = f_score(test_df.label,predictions)
                    spec = specificity(test_df.label,predictions)
                    sens = sensitivity(test_df.label,predictions)

                    # we store the results in the dataframe
                    df_results.loc[len(df_results)] = [params] + list(params.values()) + [score] + [acc] + [spec] + [sens] + [training_time] + [prediction_time]
                except:
                    pass
        # we sort the dataframe by accuracy in descending order
        df_results.sort_values(by=["score"], ascending=False, inplace=True)
        df_results.reset_index(inplace=True, drop=True)
        
        return df_results

In [25]:
# scores = {}
# for i in range(2,50):
#         model = decision_tree_algorithm(train_df, max_depth=i)
#         score = f_score(test_df.label, decision_tree_predictions(test_df, model))
#         scores[i] = score

# max_value = max(scores, key=scores.get)
# max_value

In [26]:
gs = gridsearch('decision_tree_algorithm',decision_tree_algorithm, {'max_depth':range(5, 51, 5)})
gs

Unnamed: 0,parameters,max_depth,score,accuracy,specificity,sensitivity,training_time,prediction_time
0,{'max_depth': 10},10,0.333333,0.727891,0.795082,0.4,12.796,0.021
1,{'max_depth': 15},15,0.330579,0.72449,0.790984,0.4,11.461,0.018
2,{'max_depth': 20},20,0.330579,0.72449,0.790984,0.4,13.856,0.017
3,{'max_depth': 25},25,0.330579,0.72449,0.790984,0.4,13.147,0.019
4,{'max_depth': 30},30,0.330579,0.72449,0.790984,0.4,11.77,0.02
5,{'max_depth': 35},35,0.330579,0.72449,0.790984,0.4,11.41,0.015
6,{'max_depth': 40},40,0.330579,0.72449,0.790984,0.4,11.668,0.015
7,{'max_depth': 45},45,0.330579,0.72449,0.790984,0.4,10.594,0.017
8,{'max_depth': 50},50,0.330579,0.72449,0.790984,0.4,10.578,0.017
9,{'max_depth': 5},5,0.309859,0.833333,0.959016,0.22,10.925,0.01


In [27]:
# gs.to_excel('benchmark_tree.xlsx', index=False)

In [46]:
gsf =gridsearch('random_forest_algorithm',random_forest_algorithm, {'n_trees':range(2, 11, 2), 'n_bootstrap':range(100, 251, 50), 'n_features':range(4, 31, 4), 'dt_max_depth':range(5, 51, 5)})



In [47]:
gsf

Unnamed: 0,parameters,n_trees,n_bootstrap,n_features,dt_max_depth,score,accuracy,specificity,sensitivity,training_time,prediction_time
0,"{'n_trees': 8, 'n_bootstrap': 250, 'n_features...",8,250,20,50,0.369231,0.860544,0.987705,0.24,6.031,0.229
1,"{'n_trees': 2, 'n_bootstrap': 150, 'n_features...",2,150,28,20,0.368421,0.836735,0.950820,0.28,0.985,0.139
2,"{'n_trees': 2, 'n_bootstrap': 100, 'n_features...",2,100,24,50,0.368421,0.836735,0.950820,0.28,0.556,0.125
3,"{'n_trees': 4, 'n_bootstrap': 100, 'n_features...",4,100,16,20,0.368421,0.836735,0.950820,0.28,1.025,0.160
4,"{'n_trees': 4, 'n_bootstrap': 250, 'n_features...",4,250,24,35,0.366197,0.846939,0.967213,0.26,3.731,0.221
...,...,...,...,...,...,...,...,...,...,...,...
1363,"{'n_trees': 10, 'n_bootstrap': 100, 'n_feature...",10,100,8,50,,0.826531,0.995902,0.00,1.096,0.229
1364,"{'n_trees': 10, 'n_bootstrap': 100, 'n_feature...",10,100,20,15,,0.823129,0.991803,0.00,2.627,0.239
1365,"{'n_trees': 10, 'n_bootstrap': 100, 'n_feature...",10,100,28,45,,0.826531,0.995902,0.00,3.420,0.242
1366,"{'n_trees': 10, 'n_bootstrap': 150, 'n_feature...",10,150,4,5,,0.829932,1.000000,0.00,0.865,0.176


In [None]:
gsf.to_excel('benchmark_forest.xlsx', index=False)

In [None]:
example = [()]