## Random Forrest & Decision Forrest

In [None]:
## imports 
import numpy as np 
import pandas as pd 

## dictionary for the dataset name & paths 
DATASETS ={
    "small":"../Data/small/glass.data",
    "medium":"../Data/medium/drug_consumption.data",
    "large":"../Data/large/CTG.csv"
               }

In [None]:
def load_dataset(dataset_name):
    """
    Loads the dataset from the dictionary.
    """
    if dataset_name == 'small':
        ## exclude the first column ID (labeled 0 to 10, so use 1-10)
        df = pd.read_csv(DATASETS[dataset_name],header=None)
        df.drop(df.columns[0], axis=1, inplace=True)
        df.columns = list(range(0,10))
        return df
    if dataset_name == 'medium':
        df= pd.read_csv(DATASETS[dataset_name],header=None)
        tgt = 21
        cols_to_keep = list(range(0,13)) + [tgt]
        # df.select_dtypes(include='object')
        df = df[cols_to_keep]
        ## convert the categorical to numbers 
        txt2num = {x: i for i, x in enumerate(df[tgt].unique())}
        df[tgt] = df[tgt].map(txt2num)
        return df
    if dataset_name == 'large':
        df= pd.read_csv(DATASETS[dataset_name])
        ## fill nans with median value of the column 
        df.fillna(df.median(),inplace=True)
        return df
    else:
        print("Dataset not found")

In [53]:
## load the data
DATASET_NAME = 'large'
df = load_dataset(DATASET_NAME)
df.head()

Unnamed: 0,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,...,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
0,240.0,357.0,120.0,120.0,0.0,0.0,0.0,73.0,0.5,43.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,2.0
1,5.0,632.0,132.0,132.0,4.0,0.0,4.0,17.0,2.1,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,1.0
2,177.0,779.0,133.0,133.0,2.0,0.0,5.0,16.0,2.1,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,1.0
3,411.0,1192.0,134.0,134.0,2.0,0.0,6.0,16.0,2.4,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,1.0
4,533.0,1147.0,132.0,132.0,4.0,0.0,5.0,16.0,2.4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0


In [72]:
features = df.columns[:-1]
target = df.columns[-1]
## split the data into training and test sets
nb_train = int(0.8*len(df))
df = df.sample(frac=1, random_state=217)
X_train = df[features][:nb_train]
y_train = df[target][:nb_train].values
X_test = df[features][nb_train:]
y_test = df[target][nb_train:].values

In [73]:
NUM_TREES = [1,10,25,50,75,100]
F = [1,
     3,
     int(np.log2(len(X_train)+1)),
     int(np.sqrt(len(X_train)))
     ]
## make combinations of all the parameters 
import itertools 
combs = list(itertools.product(NUM_TREES, F))

In [74]:
from termcolor import colored
import time
import gc 
## iterate through the combinations and train the random forest
holders = []
for nt, f in combs:
    ## instantiate the random forest
    tic = time.time()
    model = random_forest(X_train, y_train,
                      n_estimators=nt,
                      max_features=f,
                      max_depth=10,
                      min_samples_split=2)
    toc = time.time()
    tic_toc = toc - tic
    preds = predict_rf(model, X_test)
    acc = sum(preds == y_test) / len(y_test)
    ## print the results: Trees | Features | Accuracy | Time , in green 
    print(colored(f"Trees {nt} | Number of Features {f} | Accuracy {acc*100:.3f} | Time {tic_toc:.3f}s", "green"))
    ## save the results to a csv file
    model_df = pd.DataFrame.from_records(model)
    model_df.drop(['left_split', 'right_split'], axis=1, inplace=True)
    ## add the columns for nt & f 
    model_df['Num_trees'] = nt
    model_df['Num_features'] = f
    model_df['Accuracy'] = acc
    holders.append(model_df)
    del preds
    del acc
    del model_df
    del model
    gc.collect()

[32mTrees 1 | Number of Features 1 | Accuracy 43.889 | Time 3.659s[0m
[32mTrees 1 | Number of Features 3 | Accuracy 53.889 | Time 11.015s[0m
[32mTrees 1 | Number of Features 9 | Accuracy 65.556 | Time 26.358s[0m
[32mTrees 1 | Number of Features 26 | Accuracy 28.333 | Time 26.616s[0m
[32mTrees 10 | Number of Features 1 | Accuracy 30.556 | Time 32.446s[0m
[32mTrees 10 | Number of Features 3 | Accuracy 57.778 | Time 115.887s[0m
[32mTrees 10 | Number of Features 9 | Accuracy 81.111 | Time 188.757s[0m
[32mTrees 10 | Number of Features 26 | Accuracy 76.111 | Time 314.100s[0m
[32mTrees 25 | Number of Features 1 | Accuracy 49.444 | Time 123.054s[0m
[32mTrees 25 | Number of Features 3 | Accuracy 63.889 | Time 224.715s[0m
[32mTrees 25 | Number of Features 9 | Accuracy 82.222 | Time 510.076s[0m
[32mTrees 25 | Number of Features 26 | Accuracy 63.333 | Time 1092.945s[0m
[32mTrees 50 | Number of Features 1 | Accuracy 30.556 | Time 202.547s[0m


In [None]:
## combine the list of dataframes 
combined_df = pd.concat(holders)
combined_df.to_csv(f"../Data/out/{DATASET_NAME}_df_results.csv", index=False)

In [52]:
## combine the list of dataframes 
combined_df = pd.concat(holders)
combined_df.to_csv(f"../Data/out/{DATASET_NAME}_df_results.csv", index=False)
metrics = "../Data/out/small_rf_results.csv"
#metrics = "../Data/out/medium_rf_results.csv"
#metrics = "../Data/out/large_rf_results.csv"
df= pd.read_csv(metrics)
## convert accuracy to percentage 
df['Accuracy'] = df['Accuracy']*100
df.groupby(['Num_trees','Num_features']).mean().round(2)

Unnamed: 0,information_gain,split_point,feature_idx,Num_trees,Num_features,Accuracy
0,0.165505,13.0,9,1,1,75.586854
1,0.524151,7.0,35,1,3,84.976526
2,0.054557,30.0,15,1,10,74.413146
3,0.07378,0.0,33,1,41,84.037559
4,0.06949,0.0,25,10,1,75.117371


Unnamed: 0_level_0,Unnamed: 1_level_0,information_gain,split_point,feature_idx,Accuracy
Num_trees,Num_features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,0.165505,13.0,9.0,0.755869
1,3,0.524151,7.0,35.0,0.849765
1,10,0.054557,30.0,15.0,0.744131
1,41,0.07378,0.0,33.0,0.840376
10,1,0.089626,23.5,22.5,0.751174
10,3,0.099766,0.45,22.5,0.859155
10,10,0.164879,35.2,19.5,0.746479
