In [None]:
#Copyright April 1. 2018, Warren E. Agin
#Code released under the Creative Commons Attribution-NonCommercial-
#ShareAlike 4.0 International License. You may obtain a copy of the license at 
#https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode

In [None]:
#notebook configuration: data locations and the feature columns to exclude
DATA_URL = ''
FEATURE_NAMES = 'featureNames.csv'
TRAINING_FILE = 'trainingFile.csv'
TRAINING_URL = '%s/%s' % (DATA_URL, TRAINING_FILE)
EVAL_FILE1 = 'test1File.csv'
EVAL_FILE2 = 'test2File.csv'
EVAL_FILE3 = 'test3File.csv'
#evaluation file actually read by the notebook; point this at EVAL_FILE2 or
#EVAL_FILE3 to evaluate against one of the other test files
EVAL_FILE = EVAL_FILE1
EVAL_URL = '%s/%s' % (DATA_URL, EVAL_FILE)
#columns dropped from both data sets before training; comment an entry out
#to keep that feature, or uncomment one to drop it
DELETED_FEATURES = [
    'REALPROPVALUESQR',
    'REALPROPVALUELOG',
    'PERSPROPVALUESQR',
    'PERSPROPVALUELOG',
    'UNSECNPRVALUESQR',
    'UNSECNPRVALUELOG',
    'UNSECPRVALUESQR',
    'UNSECPRVALUELOG',
    'AVGMNTHIVALUESQR',
    'AVGMNTHIVALUELOG',
    'NTRDBT',
#    'JOINT',
#    'ORGD1FPRSE',
#    'PRFILE',
#    'DISTSUCCESS',
    'FEEP',
#    'FEEI',
    'FEEW',
    'REALPROPNULL',
#    'REALPROPNONE',
#    'REALPROPVALUE',
    'PERSPROPNULL',
#    'PERSPROPVALUE',
#    'UNSECNPRNULL',
#    'UNSECNPRVALUE',
    'UNSECEXCESS',
    'UNSECPRNULL',
#    'UNSECPRVALUE',
    'AVGMNTHINULL',
#    'AVGMNTHIVALUE',
#    'IEINDEX',
#    'IEGAP'
]

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import csv
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
#load the column names, then the training and evaluation sets, as pandas DataFrames
#(column names are taken from FEATURE_NAMES and applied to both data files)
features = pd.read_csv(FEATURE_NAMES)
featureNames = features.columns.tolist()
training, testing = (
    pd.read_csv(csvPath, names=featureNames)
    for csvPath in (TRAINING_FILE, EVAL_FILE)
)

In [None]:
#commands to explore data - optional to run
#a notebook cell only auto-displays its last expression, so the preview is
#printed explicitly instead of being silently discarded
print(training.head(5))
print('The shape of our features is:', training.shape)  #shape of training file
training.describe()  #basic statistics for different features

In [None]:
#convert the pandas DataFrames to numpy arrays for use with the learner
#rebinds training and testing as numpy arrays, creates trainingLabels and testingLabels
#as numpy arrays holding the SUCCESS column, and prunes featureNames to the features kept

def removeFeatures(file, features=None):
    """Return a copy of a DataFrame without the excluded feature columns.

    Args:
        file: pandas DataFrame to prune; it is not modified.
        features: iterable of column names to drop. Defaults to the
            notebook-level DELETED_FEATURES list.

    Returns:
        A new DataFrame with the named columns removed.

    Raises:
        KeyError: if a named column is not present in ``file``.
    """
    if features is None:
        features = DELETED_FEATURES
    #drop everything in one call rather than copying the frame once per column
    return file.drop(list(features), axis=1)

def convert2np(file):
    """Split a DataFrame into a feature matrix and a label vector.

    The 'SUCCESS' column is copied out as the labels; the remaining columns,
    minus those removed by removeFeatures, become the feature matrix.

    Returns:
        A (features, labels) tuple of numpy arrays.
    """
    target = np.array(file['SUCCESS'])
    predictors = removeFeatures(file.drop('SUCCESS', axis=1))
    return (np.array(predictors), target)
    
training, trainingLabels = convert2np(training)
testing, testingLabels = convert2np(testing)
#keep featureNames in step with the array columns: the label column and the
#excluded features are no longer present
excludedNames = set(DELETED_FEATURES) | {'SUCCESS'}
featureNames = [name for name in featureNames if name not in excludedNames]

In [None]:
#to look at the shapes of the numpy arrays created - optional to run

for caption, array in (
    ('Training Features Shape:', training),
    ('Training Labels Shape:', trainingLabels),
    ('Testing Features Shape:', testing),
    ('Testing Labels Shape:', testingLabels),
):
    print(caption, array.shape)
#number of features kept, followed by their names
print(len(featureNames))
print(featureNames)

In [None]:
#create, run and evaluate the model

#start the run log (accumulated as one string) with the list of features in use
log = 'Features used: %s\r\n' % (featureNames,)

#function to calculate metrics
def calcResults(set, labels, model=None):
    """Score a fitted classifier against one labelled data set.

    Args:
        set: feature matrix to predict on (the name shadows the built-in
            ``set``; it is kept so existing callers continue to work).
        labels: true labels, one per row of ``set``.
            NOTE(review): the AUC column choice assumes a binary problem
            where the positive class sorts last (e.g. 0/1) - confirm.
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
            Defaults to the notebook-level ``rf``.

    Returns:
        (accuracy, aucResult, cMatrix): accuracy as a percentage, ROC AUC,
        and the confusion matrix of predicted classes against ``labels``.
    """
    if model is None:
        model = rf

    # Predicted classes drive accuracy and the confusion matrix
    predictions = model.predict(set)
    # Count mismatches directly so the tally does not rely on 0/1 labels
    numberErrors = np.count_nonzero(predictions != labels)
    accuracy = (1-(numberErrors/len(set)))*100

    # ROC AUC needs a ranking score, not hard class labels: feeding it the
    # predicted classes collapses the curve to a single operating point
    positiveScores = model.predict_proba(set)[:, 1]
    aucResult = metrics.roc_auc_score(labels, positiveScores)
    cMatrix = metrics.confusion_matrix(labels, predictions)

    return(accuracy, aucResult, cMatrix)


#define characteristics for the learner
n_estimators = 1000
#'sqrt' considers sqrt(n_features) at each split; it replaces the former
#'auto' alias, which newer scikit-learn releases no longer accept.
#alternatives are an int (number of features) or a float (fraction of features)
max_features = 'sqrt'
max_depth = 20    #default is 'None'
min_samples_split = 150 #default is 2
min_samples_leaf = 1  #default is 1
random_state = 26 #default is 'None' but use a number for testing variations

log += ' n_estimators: %s \r\n max_features: %s \r\n max_depth: %s \r\n min_samples_split: %s \r\n min_samples_leaf: %s \r\n' % (n_estimators,max_features,max_depth,min_samples_split,min_samples_leaf)

#define parameter being tested and the variations on the parameter being tested
charBeingVaried = 'min_samples_leaf'
variations = [2,3,4,5,10,15]

#baseline keyword arguments for the classifier; the entry named by
#charBeingVaried is overridden on each pass of the loop below
baseParams = {
    'criterion': 'gini',    # 'gini' or 'entropy'
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'random_state': random_state,
}
if charBeingVaried not in baseParams:
    raise ValueError('Unknown parameter to vary: %s' % charBeingVaried)

for variation in variations:

    log += 'Running model with %s set to %s.\r\n' % (charBeingVaried,variation)

    #copy the baseline so each run differs only in the parameter under test
    runParams = dict(baseParams)
    runParams[charBeingVaried] = variation

    # Instantiate and train the model; rf stays a notebook-level name because
    # calcResults reads it
    rf = RandomForestClassifier(**runParams)
    rf.fit(training, trainingLabels)

    #run predictions on the training set and the test set and calculate metrics
    accuracy, aucResult, cMatrix = calcResults(training, trainingLabels)

    log += 'Train Set Accuracy: %s \r\n' %  round(accuracy, 2)
    log += 'Train set AUC: %s  \r\n' % round(aucResult, 4)

    accuracy, aucResult, cMatrix = calcResults(testing, testingLabels)

    log += 'Test Set Accuracy: %s \r\n' %  round(accuracy, 2)
    log += 'Test set AUC: %s  \r\n' % round(aucResult, 4)
    log += '\r\n'

    # Print the test-set accuracy and confusion matrix for this variation
    print(variation)
    print('accuracy: %s' % round(accuracy, 2))
    print(cMatrix)

# Feature importances come from rf as left by the loop, i.e. the model
# trained with the last entry of variations
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 6)) for feature, importance in zip(featureNames, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Append the features and importances to the log
for pair in feature_importances:
    log += 'Variable: {:20} Importance: {}\r\n'.format(*pair)

print(log)

In [None]:
#write the log file to LOG.txt
#newline='' writes the '\r\n' sequences already in the log exactly as they are

with open('LOG.txt', 'w', newline='') as logFile:
    logFile.write(log)
    