In [1]:
# The following is from "big run"
import sys
sys.path.append('..')
from lib import *
import pandas as pd
import numpy as np
import time

# Setup
lsa_np = np.load('../data/parsed/lsa_output.npy')
metadata = pd.read_pickle('../data/parsed/pickles/pickled_data_test.pickle')
metadata = metadata.loc[metadata['Scenario'] == '401']
metadata = metadata.reset_index(drop=True)
lsa_df = pd.DataFrame(lsa_np)
df = pd.concat([metadata, lsa_df], axis=1, join_axes=[metadata.index])
df = df.loc[df['Label'] != '-1']
df = df.reset_index(drop=True)
cat_features = ['To','From']
features = list(range(100))
features.extend(cat_features + ['Date'])
# features.extend(cat_features + ['ID'])

df = df[features + ['Label'] + ['ID']]
# df = df[features + ['ID']]

In [4]:
def one_run(df, n_trees, tree_depth, random_seed, n_max_features, n_max_input, cat_features, folds, filename):
    '''Do n-fold crossvalidation for a forest built with some set of params'''
    df = df.sample(frac=1) #shuffling
    foldsize = df.shape[0] // folds
    fold_stats = []
    for fold in range(1, folds + 1):
        test_set = df[(fold - 1) * foldsize : fold * foldsize]
        test_set = test_set.reset_index(drop=True)
        
        train_set = df[:(fold - 1) * foldsize].append(df[fold * foldsize:])
        train_set = train_set.reset_index(drop=True)
        
        forest = RNF(train_set, n_trees, tree_depth, random_seed, n_max_features, n_max_input, cat_features)

        start = time.time()
        forest.fit_parallel()
        end = time.time()
        fit_time = end - start
        
        start = time.time()
        predictions = forest.predict_parallel(test_set)[1]
        end = time.time()

        prediction_time = end - start

        stats = list(evalStats(predictions, test_set))
        stats.append(fit_time)
        stats.append(prediction_time)
        fold_stats.append(stats)
    
    with open(filename, 'w') as f:
        for stat in fold_stats:
            f.write("\n" + str(stat))
        fold_stats = np.array(fold_stats)
        f.write("\naverage precision: {}\n".format(np.mean(fold_stats[0,:])))
        f.write("average recall: {}\n".format(np.mean(fold_stats[1,:])))
        f.write("average accuracy: {}\n".format(np.mean(fold_stats[2,:])))
        f.write("average f1: {}\n".format(np.mean(fold_stats[3,:])))
        f.write("average fit time: {}\n".format(np.mean(fold_stats[4,:])))
        f.write("average prediction time: {}\n".format(np.mean(fold_stats[5,:])))
        
        
    

In [None]:
n_trees = 2
tree_depth = 10
random_seed = 42
num_features = 11
max_input = 600
n_folds = 10
filename = 'results.txt'
one_run(df, n_trees, tree_depth, random_seed, num_features, num_features, ["To", "From"], n_folds, filename)

Recall:56.52173913043478%
Precision:72.22222222222221%
Accuracy:57.74647887323944%
F1:0.6341463414634146
Recall:50.0%
Precision:60.0%
Accuracy:57.74647887323944%
F1:0.5454545454545454
Recall:25.0%
Precision:62.5%
Accuracy:49.29577464788733%
F1:0.35714285714285715
Recall:43.24324324324324%
Precision:72.72727272727273%
Accuracy:61.97183098591549%
F1:0.5423728813559323
Recall:33.33333333333333%
Precision:82.35294117647058%
Accuracy:56.33802816901409%
F1:0.4745762711864407
Recall:39.39393939393939%
Precision:65.0%
Accuracy:61.97183098591549%
F1:0.490566037735849
Recall:36.11111111111111%
Precision:68.42105263157895%
Accuracy:59.154929577464785%
F1:0.4727272727272728
Recall:47.05882352941176%
Precision:76.19047619047619%
Accuracy:67.6056338028169%
F1:0.5818181818181817
