In [1]:
# Welcome to the nested cross-validation program.
# This program goes through our interpretation of
# nested cross-validation using random forest models.
# It also goes through how we evaluated
# each model and how we filtered through them.
import pandas as pd
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split 

np.set_printoptions(threshold=sys.maxsize)
# This is used for the nested cross-validation method:
# read the Koff and Kon datasets and remove every row with an empty or NA cell.
# dropna() returns a new frame rather than changing the frame in place, so its
# result has to be kept; calling it without assigning removes nothing.
koff = pd.read_csv("Dataset_S2.csv").dropna() # Koff dataset
kon = pd.read_csv("Dataset_S1.csv").dropna() # Kon dataset

# Column 1 is used as the label, columns 2 onwards as the features
xf = koff.iloc[:,2:] # X Koff data
yf = koff.iloc[:,1] # Label Koff data
xn = kon.iloc[:,2:] # X Kon data
yn = kon.iloc[:,1] # Label Kon data

In [None]:
##Regressor Setup

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error 
# Split each dataset into its own train and test sets.
# The two splits must not share variable names: previously the Kon split
# overwrote xTrain/xTest, so the Koff regressor was fitted and scored on Kon features.
xTrainOff, xTestOff, yTrainOff, yTestOff = train_test_split(xf, yf, test_size = 1/9, random_state = 4)
xTrainOn, xTestOn, yTrainOn, yTestOn = train_test_split(xn, yn, test_size = 1/9, random_state = 4)
# Later cells still read the shared names xTrain/xTest; keep them bound to the
# Kon split, which is what they ended up holding before.
xTrain, xTest = xTrainOn, xTestOn

# Creating Koff Regressor
regressorf = RandomForestRegressor(n_estimators = 100, random_state = 0)
# Fitting Regressor on the Koff features
regressorf.fit(xTrainOff, yTrainOff)
# Creating Kon Regressor
regressorn = RandomForestRegressor(n_estimators = 100, random_state = 0)
# Fitting Regressor on the Kon features
regressorn.fit(xTrainOn, yTrainOn)

# Making a prediction on each dataset's own held-out rows
yPredOff = regressorf.predict(xTestOff)
yPredOn = regressorn.predict(xTestOn)

# Creating variable for standard (all-feature) MSE for Koff and Kon
standardOff = mean_squared_error(yTestOff, yPredOff)
standardOn = mean_squared_error(yTestOn, yPredOn)
# Note the print order: Kon first, then Koff
print(standardOn)
print(standardOff)
# Output recorded from an earlier run (re-run the cell to refresh these values):
# StandardOff = 0.1043105671301628
# StandardOn = 0.41629041684426005

In [None]:
# Conversion from feature to number
def featToNum(columns):
  """Map each entry of columns to its 1-based position.

  If an entry occurs more than once, the position of its last occurrence wins.
  """
  return {feature: position for position, feature in enumerate(columns, start=1)}
# Conversion from number to feature
def numToFeat(columns):
  """Map each 1-based position to the entry of columns found at that position."""
  return dict(enumerate(columns, start=1))

In [None]:
# Build the feature -> number dictionary and its inverse (number -> feature)
# from the Koff feature columns, then show both.

colToNum, numToCol = featToNum(xf.columns), numToFeat(xf.columns)

print(colToNum, numToCol, sep="\n")

In [None]:
# Feature selection via bias draw
import random
def importanceDraw(importance,index):
  """Draw one feature at random, favouring features with low importance.

  Every feature gets the weight (1 - importance). The weights are normalised
  to sum to 1 and a uniform draw from [0, 1) is walked along them; the feature
  whose weight pushes the draw below zero is selected.

  Args:
    importance: sequence of importance values, in the same order as the
      positions used as keys of index.
    index: dict mapping 1-based position -> feature (as built by numToFeat).

  Returns:
    The value stored in index at the drawn position.

  Raises:
    ValueError: if importance or index is empty.
  """
  if not index or len(importance) == 0:
    raise ValueError("importanceDraw needs at least one feature to draw from")

  # Inverse importance values; these do not change between retries
  weights = [1 - value for value in importance]
  total = sum(weights)
  if total <= 0:
    # e.g. a single feature holding all the importance: draw uniformly
    # instead of dividing by zero
    weights = [1] * len(weights)
    total = len(weights)

  while True:
    # Drawing a number between 0 and 1
    draw = random.random()
    # Positions are 1-based, so the first weight must map to position 1.
    # (Starting at 1 and incrementing before the check shifted every weight
    # onto the next feature and made the first feature impossible to draw.)
    position = 0
    for weight in weights:
      position += 1
      draw -= weight / total
      if draw < 0:
        break
    # If rounding left the draw non-negative, position is the last feature.
    # Redraw only when importance is longer than index and the position
    # therefore has no matching feature.
    if position in index:
      return index[position]

In [None]:
#random Forest Regressor and Evaluation
def RFRegressor(xTrain, xTest,yTrain, yTest, lookup, importance):
  """Drop one drawn feature, then fit and score a model unless it is cached.

  A feature is drawn with importanceDraw and removed from both frames. The
  remaining feature set is turned into two keys (sum of the feature numbers,
  and the feature numbers concatenated into one integer). If lookup has no
  entry for that key pair, a random forest is fitted and
  [MSE, number of features, columns] is stored under lookup[key][key1].

  Args:
    xTrain, xTest: feature frames; must share column labels that are keys of
      the module-level colToNum.
    yTrain, yTest: labels for the two frames.
    lookup: two-level dict {key: {key1: data}}; updated in place.
    importance: importance values for the columns of xTrain, in column order.

  Returns:
    [xTrain, xTest, importance] for the reduced feature set.

  Raises:
    ValueError: if xTrain has fewer than two columns.
  """
  columnsBefore = list(xTrain.columns)
  if len(columnsBefore) < 2:
    raise ValueError("RFRegressor needs at least two features so that one can be removed")
  index = numToFeat(xTrain.columns)

  # Call Importance Draw
  draw = importanceDraw(importance, index)
  drawnPosition = columnsBefore.index(draw)
  # Remove drawn label from the dataset
  xTrain = xTrain.drop(labels=draw, axis=1)
  xTest = xTest.drop(labels=draw, axis=1)

  # Creating two keys for lookup table.
  # key1 is every feature number concatenated into one integer, because
  # integers are faster to compare than strings.
  # NOTE(review): concatenation is not unique on its own ("1"+"23" == "12"+"3");
  # it is only used together with the sum key - confirm that is sufficient.
  codes = [colToNum[name] for name in xTrain.columns]
  features = len(codes)
  key = sum(codes)
  key1 = int("".join(str(code) for code in codes))

  # check the lookup to see if the model has been done
  if key1 not in lookup.get(key, {}):
    #create and fit model
    regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
    regressor.fit(xTrain, yTrain)
    yPred = regressor.predict(xTest)
    # gather importance
    importance = regressor.feature_importances_
    # gather data then add to lookup table; setdefault keeps the models that
    # are already stored under the same sum key instead of wiping them
    data = [ mean_squared_error(yTest, yPred) , features , xTrain.columns]
    lookup.setdefault(key, {})[key1] = data
  elif len(importance) == len(columnsBefore):
    # Cached model: nothing was refitted, so remove the dropped feature's
    # entry to keep importance aligned with the columns that are returned
    importance = np.delete(importance, drawnPosition)
  return [xTrain, xTest, importance]

In [None]:
# Calculate J score for each model in the lookup table
def calculateRange(dictJ, lookup, std, totalFeatures=201, maxFeatures=2):
  """Score every model in lookup and return the small models that beat std.

  J = (std - MSE)/std * (totalFeatures - features)/totalFeatures

  Args:
    dictJ: two-level dict {key: {key1: [J, features, columns]}}; filled in place.
    lookup: two-level dict {key: {key1: [MSE, features, columns]}}.
    std: MSE of the reference model.
    totalFeatures: constant used in the feature-count penalty (default 201).
    maxFeatures: largest feature count a model may have to be returned (default 2).

  Returns:
    List of [J, features, columns] for every model with J >= 0, at most
    maxFeatures features and an MSE below std.
  """
  bestJ = []
  for key in lookup:
    for key1 in lookup[key]:
      data = lookup[key][key1]
      J = (std - data[0])/std * (totalFeatures - data[1])/totalFeatures
      temp = [J, data[1], data[2]]
      # setdefault keeps the scores already stored under this key;
      # assigning a fresh dict here dropped all but the last key1
      dictJ.setdefault(key, {})[key1] = temp
      # Keep models that are not worse than the reference and are small enough
      if J >= 0.0 and data[1] <= maxFeatures and data[0] < std:
        bestJ.append(temp)
  return bestJ

In [None]:
# Find best J score
def calculateJ(dictJ, lookup, std, totalFeatures=201):
  """Score every model in lookup and return the one with the highest J.

  J = (std - MSE)/std * (totalFeatures - features)/totalFeatures

  Args:
    dictJ: two-level dict {key: {key1: [J, features, columns]}}; filled in place.
    lookup: two-level dict {key: {key1: [MSE, features, columns]}}.
    std: MSE of the reference model.
    totalFeatures: constant used in the feature-count penalty (default 201).

  Returns:
    [J, features, columns] of the best model. When no model has J > 0 the
    result is [0, 0, None], so callers can always index all three positions.
  """
  # Same three-slot shape as a real entry, and no dependency on a global frame
  bestJ = [0, 0, None]
  for key in lookup:
    for key1 in lookup[key]:
      data = lookup[key][key1]
      J = (std - data[0])/std * (totalFeatures - data[1])/totalFeatures
      temp = [J, data[1], data[2]]
      # setdefault keeps the scores already stored under this key;
      # assigning a fresh dict here dropped all but the last key1
      dictJ.setdefault(key, {})[key1] = temp
      if bestJ[0] < J:
        bestJ = temp
  return bestJ

In [None]:
import csv
# Write dataset with J scores and model results in to csv
def writeCSV(name, mydict):
  """Write a two-level dict to name + ".csv", one row per inner entry.

  Each row is: key, key1, then the first three items of mydict[key][key1].
  An existing file is overwritten.
  """
  name += ".csv"
  # newline='' is what the csv module asks for; without it, platforms that
  # translate line endings get a blank line after every row.
  # Mode 'w' already empties the file, so no explicit truncate is needed.
  with open(name, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, inner in mydict.items():
      for key1, data in inner.items():
        writer.writerow([key, key1, data[0], data[1], data[2]])

# Read the dataset with J scores and model results back from a csv
def readCSV(name):
  """Load name + ".csv" into a two-level dict {key: {key1: data}}.

  Each row is expected to hold: key, key1, a float, an int and a text field;
  data is [float, int, text]. Blank rows are skipped.
  """
  mydict ={}
  name += ".csv"
  with open(name, 'r', newline='') as csv_file:
    for row in csv.reader(csv_file):
      if not row:
        # blank line, e.g. from a file written without newline=''
        continue
      key = _parseKey(row[0])
      key1 = _parseKey(row[1])
      data = [float(row[2]), int(row[3]),row[4]]
      # setdefault keeps earlier rows that share the same key;
      # assigning a fresh dict here kept only the last row per key
      mydict.setdefault(key, {})[key1] = data
  return mydict

def _parseKey(text):
  """Turn a stored key back into an int, exactly, whenever that is possible.

  Keys are whole numbers that can be far longer than a float can represent,
  so float() would round them or turn them into inf. Text that is not a plain
  integer (e.g. "12.0" or "inf") is parsed as a float and converted to int
  only when it is a whole number.
  """
  try:
    return int(text)
  except ValueError:
    value = float(text)
    return int(value) if value.is_integer() else value
        

In [None]:
# Actual script: load the lookup tables and the J-score tables.
# The tables are read from the CSV files written by the sweep cell; when a
# file does not exist yet (first run), start from an empty table instead of failing.
def _loadOrEmpty(name):
  """Return readCSV(name), or an empty dict when name + ".csv" does not exist."""
  return readCSV(name) if os.path.exists(name + ".csv") else {}

lookupOff = _loadOrEmpty("dataOff")
lookupOn = _loadOrEmpty("dataOn")
dictOffJ = _loadOrEmpty("dataOffJ")
dictOnJ = _loadOrEmpty("dataOnJ")

In [None]:
# Run the nests and layers: range(1,200) gives 199 outer passes, each with
# 199 inner passes.
# The split uses a fixed random_state, so it is the same on every pass;
# compute it once and start each inner pass from the full feature frames.
xTrainOffFull, xTestOffFull, yTrainOff, yTestOff = train_test_split(xf, yf, test_size = 1/9, random_state = 4)
xTrainOnFull, xTestOnFull, yTrainOn, yTestOn = train_test_split(xn, yn, test_size = 1/9, random_state = 4)

for y in range(1,200):
  for x in range(1,200):
    # (the comma before x was missing, which made this cell a SyntaxError)
    print("y " ,y," x ", x)
    # start again from the full training data
    xTrainOff, xTestOff = xTrainOffFull, xTestOffFull
    xTrainOn, xTestOn = xTrainOnFull, xTestOnFull
    # Gather label importances from the all-feature regressors
    importanceOff = regressorf.feature_importances_
    importanceOn = regressorn.feature_importances_

    # while number of labels is greater than 1 (each call removes one feature
    # from each frame, so both frames need a feature to spare)
    while len(xTrainOff.columns) > 1 and len(xTrainOn.columns) > 1:
      # Call Random Forest Regressor
      dataOff = RFRegressor(xTrainOff, xTestOff, yTrainOff, yTestOff,lookupOff, importanceOff)
      dataOn = RFRegressor(xTrainOn, xTestOn, yTrainOn, yTestOn,lookupOn, importanceOn)
      # Set variables so they can be recalled in the next loop
      xTrainOff, xTestOff, importanceOff = dataOff
      xTrainOn, xTestOn, importanceOn = dataOn
  # write to csv after every outer pass so finished work is saved
  writeCSV("dataOff", lookupOff)
  writeCSV("dataOn", lookupOn)
  writeCSV("dataOffJ", dictOffJ)
  writeCSV("dataOnJ", dictOnJ)

In [None]:
# Pick the best-scoring model for each dataset
jOff = calculateJ(dictOffJ, lookupOff, standardOff)
jOn = calculateJ(dictOnJ, lookupOn, standardOn)

# Invert the J formula to get each winner's MSE back from its J score:
# divide J by the feature-count factor, scale by the standard MSE and
# subtract the result from the standard MSE.
offScale = (201 - jOff[1])/201
mseOff = standardOff - (jOff[0] / offScale * standardOff)
onScale = (201 - jOn[1])/201
mseOn = standardOn - (jOn[0] / onScale * standardOn)

print("JOff Value ", jOff[0],  "MSE", mseOff, "JOff Param", jOff[1], jOff[2])
print("JOn Value " ,jOn[0] , "MSE", mseOn, "JOn Param" ,jOn[1], jOn[2])

In [None]:
# Find the models that pass the calculateRange filter, then report each one
jOffRange = calculateRange(dictOffJ, lookupOff, standardOff)
jOnRange = calculateRange(dictOnJ, lookupOn, standardOn)

# Kon models first; the MSE is recovered from J by inverting the J formula
for jOn in jOnRange:
  onScale = (201 - jOn[1])/201
  mseOn = standardOn - (jOn[0] / onScale * standardOn)
  print("JOn Value " ,jOn[0] , "MSE", mseOn, "JOn Param" ,jOn[1], jOn[2])

# then the Koff models
for jOff in jOffRange:
  offScale = (201 - jOff[1])/201
  mseOff = standardOff - (jOff[0] / offScale * standardOff)
  print("JOff Value ", jOff[0],  "MSE", mseOff, "JOff Param", jOff[1], jOff[2])

In [None]:
# Show the standard MSE values, Koff first and then Kon, one per line
print(standardOff, standardOn, sep="\n")