In [163]:
import pandas as pd
import numpy as np

data = {
        "Outlook": ["Sunny", "Sunny", "Overcast", "Rainy", "Rainy", "Rainy", "Overcast", "Sunny", "Sunny", "Rainy", "Sunny", "Overcast", "Overcast", "Rainy"],
        "Temperature": ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool", "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"],
        "Humidity": ["High", "High", "High", "High", "Normal", "Normal", "Normal", "High", "Normal", "Normal", "Normal", "High", "Normal", "High"],
        "Wind": [False, True, False, False, False, True, True, False, False, False, True, True, False, True],
        "Class": ["N", "N", "P", "P", "P", "N", "P", "N", "P", "P", "P", "P", "P", "N"]
    }

# Create the DataFrame
######
dataSet = pd.DataFrame(data)

# define the features to be tuned
#####
featuresMap = {'Outlook': 'Cat',
               'Temperature': 'Cat',
               'Humidity': 'Cat',
               'Wind': 'Cat'
               }



#find the columns that we need to focus on 
currentFeatures = list(featuresMap.keys())
currentColumns = currentFeatures.copy()
currentColumns.append('Class')

# subset our data set by the features that we need
dataSet = dataSet[currentColumns]

# fine unique class values
uniqueClasses = dataSet['Class'].unique()

# create a table to show all possible outcomes
entropyOutput = pd.DataFrame({'Class': [], 'Variable': [], 'Value': []})

for currentClass in uniqueClasses:
    for currentFeature, featureType in featuresMap.items():
        for currentFeatureValue in dataSet[currentFeature].unique():
            newRow = pd.DataFrame({'Class': currentClass, 'Variable': currentFeature, 'Value': currentFeatureValue}, index=[len(entropyOutput)])
            entropyOutput = pd.concat([entropyOutput, newRow], ignore_index = False)

# melt our dataset to show all the records in question
meltedDataSet = pd.melt(dataSet, id_vars=['Class'], value_vars=currentFeatures, var_name='Variable', value_name='Value')
meltedDataSet['Count'] = 1

# find the count of each specific record
bottomLevelStats = meltedDataSet.groupby(['Class','Variable','Value'], as_index=False).sum('Count')
bottomLevelStats.rename(columns = {"Count": "bottomCount"}, inplace = True) 

# roll up the bottom level not accounting for class outcome
middleLevelStats = meltedDataSet.groupby(['Variable','Value'], as_index=False).sum('Count')
middleLevelStats.rename(columns = {"Count": "middleCount"}, inplace = True) 

# find the count by class outcome
topLevelStats = meltedDataSet.groupby(['Variable'], as_index=False).sum('Count')
topLevelStats.rename(columns = {"Count": "topCount"}, inplace = True) 

# find the top level entropy as starting point for gain ratio
classLevelStats = meltedDataSet.groupby(['Class','Variable'], as_index=False).sum('Count')
classLevelStats.rename(columns = {"Count": "classCount"}, inplace = True)
classLevelStats['topEntropy'] = -1 * classLevelStats['classCount']/len(dataSet) * np.log2(classLevelStats['classCount']/len(dataSet))
classLevelStats = classLevelStats.groupby(['Variable'], as_index=False).sum('topEntropy')

# merge in all the levels of data points
entropyOutput = entropyOutput.merge(bottomLevelStats, how = 'left', on = ['Class', 'Variable', 'Value'])
entropyOutput = entropyOutput.merge(middleLevelStats, how = 'left', on = ['Variable', 'Value'])
entropyOutput = entropyOutput.merge(topLevelStats, how = 'left', on = ['Variable'])
         
# fill nas with very small number to not break log calc
entropyOutput = entropyOutput.fillna(.000001)

# calculate entropy at feature type level
entropyOutput['bottomEntropy'] = -1 * (entropyOutput['bottomCount']/entropyOutput['middleCount']) * np.log2(entropyOutput['bottomCount']/entropyOutput['middleCount'])

# calculate the entropy at the feature level 
featureSpecific = entropyOutput.groupby(['Variable', 'Value', 'middleCount', 'topCount'], as_index=False).sum('bottomEntropy')
featureSpecific['middleEntropy'] = featureSpecific['middleCount']/featureSpecific['topCount'] * featureSpecific['bottomEntropy']

# calculate our IV function to correct the gain ratio for features with a lot of possibilities
featureSpecific['IV'] = -1 * featureSpecific['middleCount'] / featureSpecific['topCount'] * np.log2(featureSpecific['middleCount'] / featureSpecific['topCount'])

# roll up the middle entropy by the feature
featureSpecific = featureSpecific[['Variable', 'middleEntropy', 'IV']].groupby(['Variable'], as_index=False).sum(['middleEntropy', 'IV'])

# merge in the class level entropy
featureSpecific = featureSpecific.merge(classLevelStats, how = 'left', on = ['Variable'])

# calc the numerator of ratio and ratio to pick the best next feature
featureSpecific['gain'] = featureSpecific['topEntropy'] - featureSpecific['middleEntropy']
featureSpecific['gainRatio'] = featureSpecific['gain'] / featureSpecific['IV']

display(featureSpecific)
display(featureSpecific.nlargest(1, 'gainRatio'))


Unnamed: 0,Variable,middleEntropy,IV,classCount,topEntropy,gain,gainRatio
0,Humidity,0.78845,1.0,14,0.940286,0.151836,0.151836
1,Outlook,0.693538,1.577406,14,0.940286,0.246748,0.156427
2,Temperature,0.911063,1.556657,14,0.940286,0.029223,0.018773
3,Wind,0.892159,0.985228,14,0.940286,0.048127,0.048849


Unnamed: 0,Variable,middleEntropy,IV,classCount,topEntropy,gain,gainRatio
1,Outlook,0.693538,1.577406,14,0.940286,0.246748,0.156427


In [170]:
import pandas as pd
import numpy as np

data = {
        "Outlook": ["Sunny", "Sunny", "Overcast", "Rainy", "Rainy", "Rainy", "Overcast", "Sunny", "Sunny", "Rainy", "Sunny", "Overcast", "Overcast", "Rainy"],
        "Temperature": ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool", "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"],
        "Humidity": ["High", "High", "High", "High", "Normal", "Normal", "Normal", "High", "Normal", "Normal", "Normal", "High", "Normal", "High"],
        "Wind": [False, True, False, False, False, True, True, False, False, False, True, True, False, True],
        "Class": ["N", "N", "P", "P", "P", "N", "P", "N", "P", "P", "P", "P", "P", "N"]
    }

# Create the DataFrame
######
dataSet = pd.DataFrame(data)

# define the features to be tuned
#####
featuresMap = {'Outlook': 'Cat',
               'Temperature': 'Cat',
               'Humidity': 'Cat',
               'Wind': 'Cat'
               }




#find the columns that we need to focus on 
currentFeatures = list(featuresMap.keys())
currentColumns = currentFeatures.copy()
currentColumns.append('Class')

# subset our data set by the features that we need
dataSet = dataSet[currentColumns]

# melt our dataset to show all the records in question
meltedDataSet = pd.melt(dataSet, id_vars=['Class'], value_vars=currentFeatures, var_name='Variable', value_name='Value')

# find the count of each specific record
regressionEstimateByClass = meltedDataSet.groupby(['Variable','Value'], as_index=False).mean('Class')
regressionEstimateByClass.rename(columns = {"Class": "estimatedOutput"}, inplace = True) 

mseCalc = meltedDataSet.merge(regressionEstimateByClass, how = 'left', on = ['Class', 'Variable', 'Value'])
mseCalc['SE'] = (mseCalc['estimatedOutput'] - mseCalc['Class'])**2
mseOutput = mseCalc.groupby(['Variable'], as_index=False).mean('SE')

featureSpecific.nlargest(1, 'SE')


SyntaxError: invalid syntax (229994795.py, line 61)

In [None]:
# melt our dataset to show all the records in question
meltedDataSet = pd.melt(dataSet, id_vars=['Class'], value_vars=currentFeatures, var_name='Variable', value_name='Value')
meltedDataSet['Count'] = 1

# find the count of each specific record
bottomLevelStats = meltedDataSet.groupby(['Class','Variable','Value'], as_index=False).sum('Count')
bottomLevelStats.rename(columns = {"Count": "bottomCount"}, inplace = True) 

# roll up the bottom level not accounting for class outcome
middleLevelStats = meltedDataSet.groupby(['Variable','Value'], as_index=False).sum('Count')
middleLevelStats.rename(columns = {"Count": "middleCount"}, inplace = True) 

# find the count by class outcome
topLevelStats = meltedDataSet.groupby(['Variable'], as_index=False).sum('Count')
topLevelStats.rename(columns = {"Count": "topCount"}, inplace = True) 

# find the top level entropy as starting point for gain ratio
classLevelStats = meltedDataSet.groupby(['Class','Variable'], as_index=False).sum('Count')
classLevelStats.rename(columns = {"Count": "classCount"}, inplace = True)
classLevelStats['topEntropy'] = -1 * classLevelStats['classCount']/len(dataSet) * np.log2(classLevelStats['classCount']/len(dataSet))
classLevelStats = classLevelStats.groupby(['Variable'], as_index=False).sum('topEntropy')

# merge in all the levels of data points
entropyOutput = entropyOutput.merge(bottomLevelStats, how = 'left', on = ['Class', 'Variable', 'Value'])
entropyOutput = entropyOutput.merge(middleLevelStats, how = 'left', on = ['Variable', 'Value'])
entropyOutput = entropyOutput.merge(topLevelStats, how = 'left', on = ['Variable'])
         
# fill nas with very small number to not break log calc
entropyOutput = entropyOutput.fillna(.000001)

# calculate entropy at feature type level
entropyOutput['bottomEntropy'] = -1 * (entropyOutput['bottomCount']/entropyOutput['middleCount']) * np.log2(entropyOutput['bottomCount']/entropyOutput['middleCount'])

# calculate the entropy at the feature level 
featureSpecific = entropyOutput.groupby(['Variable', 'Value', 'middleCount', 'topCount'], as_index=False).sum('bottomEntropy')
featureSpecific['middleEntropy'] = featureSpecific['middleCount']/featureSpecific['topCount'] * featureSpecific['bottomEntropy']

# calculate our IV function to correct the gain ratio for features with a lot of possibilities
featureSpecific['IV'] = -1 * featureSpecific['middleCount'] / featureSpecific['topCount'] * np.log2(featureSpecific['middleCount'] / featureSpecific['topCount'])

# roll up the middle entropy by the feature
featureSpecific = featureSpecific[['Variable', 'middleEntropy', 'IV']].groupby(['Variable'], as_index=False).sum(['middleEntropy', 'IV'])

# merge in the class level entropy
featureSpecific = featureSpecific.merge(classLevelStats, how = 'left', on = ['Variable'])

# calc the numerator of ratio and ratio to pick the best next feature
featureSpecific['gain'] = featureSpecific['topEntropy'] - featureSpecific['middleEntropy']
featureSpecific['gainRatio'] = featureSpecific['gain'] / featureSpecific['IV']

display(featureSpecific)
display(featureSpecific.nlargest(1, 'gainRatio'))

In [142]:
display(dataSet)

subsetDict = {"Outlook": "Sunny"}

for currentFeature, featureType in subsetDict.items():
    dataSet = dataSet[dataSet[currentFeature] == featureType]
                      
display(dataSet)

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Class
0,Sunny,Hot,High,False,N
1,Sunny,Hot,High,True,N
2,Overcast,Hot,High,False,P
3,Rainy,Mild,High,False,P
4,Rainy,Cool,Normal,False,P
5,Rainy,Cool,Normal,True,N
6,Overcast,Cool,Normal,True,P
7,Sunny,Mild,High,False,N
8,Sunny,Cool,Normal,False,P
9,Rainy,Mild,Normal,False,P


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Class
0,Sunny,Hot,High,False,N
1,Sunny,Hot,High,True,N
7,Sunny,Mild,High,False,N
8,Sunny,Cool,Normal,False,P
10,Sunny,Mild,Normal,True,P


In [178]:
def find_smallest_difference(original_dict, value_to_subtract):
    # Initialize a new dictionary to store the differences
    differences_dict = {}

    # Iterate through the original dictionary
    for key, val in original_dict.items():
        # Calculate the difference
        difference = abs(key - value_to_subtract)
        # Store the key and the difference in the new dictionary
        differences_dict[key] = difference

    # Find the key with the smallest difference
    smallest_key = min(differences_dict, key=differences_dict.get)
    smallest_value = differences_dict[smallest_key]

    return smallest_key, smallest_value

# Example usage
original_dict = {10: 'a', 0: 'b', 30: 'c', 40: 'd'}
value_to_subtract = 5

smallest_key, smallest_value = find_smallest_difference(original_dict, value_to_subtract)
print(f"The key with the smallest difference is {smallest_key} with a difference of {smallest_value}.")


The key with the smallest difference is 10 with a difference of 5.


In [197]:
for tree_id in range(1, 11):
    print(tree_id)

1
2
3
4
5
6
7
8
9
10
