In [2]:
import pandas as pd
import numpy as np
import os

categoricalDataSets = ['BreastCancer', 'CarEval', 'CongressVoting']
regressionSets = ['Abalone', 'ComputerHardware', 'ForestFires']
dataSets = categoricalDataSets + regressionSets

catOutputPre = []
catOutputPost = []
catOutputExtra = []
regOutputPre = []
regOutputPost = []
regOutputExtra = []

for currentDataSet in dataSets:
    # PRE PRUNED
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/prePrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summary = rawData['success'].mean()
    
    if currentDataSet in categoricalDataSets:
        catOutputPre.append(summary)
    else:
        regOutputPre.append(summary)
        
    # POST PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summary = rawData['success'].mean()
    
    if currentDataSet in categoricalDataSets:
        catOutputPost.append(summary)
    else:
        regOutputPost.append(summary)
        
    # EXTRA PRUNING
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summary = rawData['success'].mean()
    
    if currentDataSet in categoricalDataSets:
        catOutputExtra.append(summary)
    else:
        regOutputExtra.append(summary)
    

catTable = pd.DataFrame({'Pre-Pruned': catOutputPre, 'Post-Pruned': catOutputPost, 'Extra-Pruned': catOutputExtra}, index = categoricalDataSets)
regTable = pd.DataFrame({'Pre-Pruned': regOutputPre, 'Post-Pruned': regOutputPost, 'Extra-Pruned': regOutputExtra}, index = regressionSets)

catTable['Prune Improve %'] = (catTable['Extra-Pruned'] - catTable['Pre-Pruned']) / catTable['Pre-Pruned'] * 100
regTable['Prune Improve %'] = ( regTable['Pre-Pruned'] - regTable['Extra-Pruned']) / regTable['Pre-Pruned'] * 100

display(catTable)
print(catTable.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

display(regTable)
print(regTable.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))


Unnamed: 0,Pre-Pruned,Post-Pruned,Extra-Pruned,Prune Improve %
BreastCancer,0.405128,0.918315,0.918315,126.672694
CarEval,0.899276,0.894067,0.834443,-7.209527
CongressVoting,0.931034,0.943678,0.946552,1.666667


\begin{tabular}{lrrrr}
\toprule
{} &  Pre-Pruned &  Post-Pruned &  Extra-Pruned &  Prune Improve \% \\
\midrule
BreastCancer   &       0.405 &        0.918 &         0.918 &          126.673 \\
CarEval        &       0.899 &        0.894 &         0.834 &           -7.210 \\
CongressVoting &       0.931 &        0.944 &         0.947 &            1.667 \\
\bottomrule
\end{tabular}



  print(catTable.to_latex(index=True,


Unnamed: 0,Pre-Pruned,Post-Pruned,Extra-Pruned,Prune Improve %
Abalone,8.643106,8.659243,7.818574,9.53977
ComputerHardware,43138.174886,30817.428564,31273.954218,27.502834
ForestFires,4.252277,4.165376,3.171203,25.423418


\begin{tabular}{lrrrr}
\toprule
{} &  Pre-Pruned &  Post-Pruned &  Extra-Pruned &  Prune Improve \% \\
\midrule
Abalone          &       8.643 &        8.659 &         7.819 &            9.540 \\
ComputerHardware &   43138.175 &    30817.429 &     31273.954 &           27.503 \\
ForestFires      &       4.252 &        4.165 &         3.171 &           25.423 \\
\bottomrule
\end{tabular}



  print(regTable.to_latex(index=True,


In [3]:
categoricalDataSets = ['BreastCancer', 'CarEval', 'CongressVoting']

for currentDataSet in categoricalDataSets:
    # PRE PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/prePrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summaryCount = list(rawData['Class'].value_counts())
    summaryPre = rawData[['Class', 'success']].groupby('Class', as_index=False).mean(['success'])
    summaryPre['Percent of Sample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # POST PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summaryCount = list(rawData['Class'].value_counts())
    summaryPost = rawData[['Class', 'success']].groupby('Class', as_index=False).mean(['success'])
    summaryPost['Percent of Sample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # EXTRA PRUNED
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summaryCount = list(rawData['Class'].value_counts())
    summaryExtra = rawData[['Class', 'success']].groupby('Class', as_index=False).mean(['success'])
    summaryExtra['Percent of Sample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    successOverview = pd.DataFrame({currentDataSet + ' Class': summaryPre['Class'], 
                                    'Percent of Sample': summaryPre['Percent of Sample'],
                                    'Pre-Prune': summaryPre['success'],
                                    'Post-Prune': summaryPost['success'],
                                    'Extra-Prune': summaryExtra['success'],
                                    'Prune Improve %': (summaryExtra['success'] - summaryPre['success']) / summaryPre['success'] * 100
                                   })
    display(successOverview)
    print(successOverview.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))
    
    
                                    
    
    

Unnamed: 0,BreastCancer Class,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,2,0.650183,0.096901,0.95662,0.95662,887.209302
1,4,0.349817,0.97801,0.84712,0.84712,-13.383298


\begin{tabular}{lrrrrrr}
\toprule
{} &  BreastCancer Class &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
0 &                   2 &              0.650 &      0.097 &       0.957 &        0.957 &          887.209 \\
1 &                   4 &              0.350 &      0.978 &       0.847 &        0.847 &          -13.383 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=True,


Unnamed: 0,CarEval Class,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,acc,0.700434,0.818241,0.833225,0.889902,8.757962
1,good,0.222142,0.512727,0.48,0.214545,-58.156028
2,unacc,0.039797,0.955785,0.948967,0.878099,-8.127972
3,vgood,0.037627,0.734615,0.669231,0.35,-52.356021


\begin{tabular}{llrrrrr}
\toprule
{} & CarEval Class &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
0 &           acc &              0.700 &      0.818 &       0.833 &        0.890 &            8.758 \\
1 &          good &              0.222 &      0.513 &       0.480 &        0.215 &          -58.156 \\
2 &         unacc &              0.040 &      0.956 &       0.949 &        0.878 &           -8.128 \\
3 &         vgood &              0.038 &      0.735 &       0.669 &        0.350 &          -52.356 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=True,


Unnamed: 0,CongressVoting Class,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,democrat,0.614943,0.942056,0.941121,0.942056,0.0
1,republican,0.385057,0.90597,0.947761,0.953731,5.271829


\begin{tabular}{llrrrrr}
\toprule
{} & CongressVoting Class &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
0 &             democrat &              0.615 &      0.942 &       0.941 &        0.942 &            0.000 \\
1 &           republican &              0.385 &      0.906 &       0.948 &        0.954 &            5.272 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=True,


In [186]:
regressionSets = ['Abalone', 'ComputerHardware', 'ForestFires']

for currentDataSet in regressionSets:
    # PRE PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/prePrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)
    
    rawData['ClassQuartile'] = pd.qcut(rawData['Class'], q = 4, duplicates = 'drop')

    dataCopy = rawData.copy()
    summaryCount = dataCopy['ClassQuartile'].value_counts()
    summaryPre = dataCopy[['ClassQuartile', 'success']].groupby('ClassQuartile', as_index=False).mean(['success'])
    summaryPre['PercentOfSample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # POST PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)
    
    rawData['ClassQuartile'] = pd.qcut(rawData['Class'], q = 4, duplicates = 'drop')
    rawData['ClassQuartile'] = rawData['ClassQuartile'].astype(str)
    
    dataCopy = rawData.copy()
    summaryCount = list(dataCopy['ClassQuartile'].value_counts())
    summaryPost = dataCopy[['ClassQuartile', 'success']].groupby('ClassQuartile', as_index=False).mean(['success'])
    summaryPost['PercentOfSample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # EXTRA PRUNED
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    rawData['ClassQuartile'] = pd.qcut(rawData['Class'], q = 4, duplicates = 'drop')
    rawData['ClassQuartile'] = rawData['ClassQuartile'].astype(str)
    
    dataCopy = rawData.copy()
    summaryCount = list(dataCopy['ClassQuartile'].value_counts())
    summaryExtra = dataCopy[['ClassQuartile', 'success']].groupby('ClassQuartile', as_index=False).mean(['success'])
    summaryExtra['PercentOfSample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    successOverview = pd.DataFrame({currentDataSet + 'Class Quartile': summaryPre['ClassQuartile'], 
                                    'Percent of Sample': summaryPre['PercentOfSample'],
                                    'Pre-Prune': summaryPre['success'],
                                    'Post-Prune': summaryPost['success'],
                                    'Extra-Prune': summaryExtra['success'],
                                    'Prune Improve %': (summaryPre['success'] - summaryExtra['success']) / summaryPre['success'] * 100
                                   })
    
    display(successOverview.sort_values(by=currentDataSet + ' Class Quartile'))

Unnamed: 0,Abalone Class Quartile,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,"(0.999, 8.0]",0.337522,8.828934,8.878092,5.061432,42.672218
1,"(8.0, 9.0]",0.270138,3.475069,20.183661,21.605371,-521.72497
2,"(9.0, 11.0]",0.229204,1.763337,3.462779,3.393441,-92.444217
3,"(11.0, 29.0]",0.163136,20.156232,1.745804,2.238103,88.896225


Unnamed: 0,ComputerHardware Class Quartile,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,"(5.999, 26.0]",0.25509,31709.551794,67340.363678,69773.214005,-120.038474
1,"(26.0, 49.0]",0.249102,41036.28004,22124.755037,21986.912382,46.420796
2,"(49.0, 111.0]",0.247904,36706.930521,25939.155318,25602.575764,30.251385
3,"(111.0, 1150.0]",0.247904,63333.638996,8340.505925,8215.489261,87.028237


Unnamed: 0,ForestFires Class Quartile,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,"(-0.001, 0.435]",0.5,3.87419,3.809486,2.726819,29.615743
1,"(0.435, 2.026]",0.251691,2.876566,2.761945,1.643372,42.870361
2,"(2.026, 6.996]",0.248309,6.408047,6.304549,5.614659,12.381118


In [6]:
categoricalDataSets = ['BreastCancer', 'CarEval', 'CongressVoting']
regressionSets = ['Abalone', 'ComputerHardware', 'ForestFires']
dataSets = categoricalDataSets + regressionSets

for currentDataSet in regressionSets:
    currentTreeOutput = pd.DataFrame()
    for i in range(1,11):

        filePath = currentDataSet + "/UsedTestCases_COPY/Tree" + str(i) + "/postPruneTree.csv"
        rawData = pd.read_csv(filePath, index_col=0)
        
        currentTreeOutput = pd.concat([currentTreeOutput, rawData])

    currentTreeOutput['originalLeaf'] = ~currentTreeOutput['pruned'] & currentTreeOutput['isLeaf']
    leaves = rawData[['originalLeaf', 'pruned']].value_counts()

    display(currentTreeOutput)



Unnamed: 0,nodeID,parentNode,dataSetFilters,evaluated,nodeFeature,nodePrediction,childrenNodes,gainOrMSE,treeLevel,pruned,isLeaf
0,0,-1,{},True,Shell_weight,10.014961,"{'belowMean': 1, 'aboveMean': 2}",8.062380,0,False,False
1,1,0,{'Shell_weight': 'belowMean'},True,Sex,8.380070,"{'I': 3, 'F': 4, 'M': 5}",5.175663,1,True,True
2,2,0,{'Shell_weight': 'aboveMean'},True,Shucked_weight,11.761139,"{'aboveMean': 6, 'belowMean': 7}",9.742160,1,False,False
3,3,1,"{'Shell_weight': 'belowMean', 'Sex': 'I'}",True,Height,7.476891,"{'belowMean': 8, 'aboveMean': 9}",3.504458,2,False,False
4,4,1,"{'Shell_weight': 'belowMean', 'Sex': 'F'}",True,Height,10.136095,"{'aboveMean': 10, 'belowMean': 11}",7.311354,2,False,False
...,...,...,...,...,...,...,...,...,...,...,...
216,216,134,"{'Shell_weight': 'aboveMean', 'Shucked_weight'...",True,,13.666667,{},0.000000,7,False,True
217,217,134,"{'Shell_weight': 'aboveMean', 'Shucked_weight'...",True,,10.000000,{},0.000000,7,False,True
218,218,135,"{'Shell_weight': 'aboveMean', 'Shucked_weight'...",True,,16.000000,{},0.000000,7,False,True
219,219,135,"{'Shell_weight': 'aboveMean', 'Shucked_weight'...",True,,18.000000,{},0.000000,7,False,True


Unnamed: 0,nodeID,parentNode,dataSetFilters,evaluated,nodeFeature,nodePrediction,childrenNodes,gainOrMSE,treeLevel,pruned,isLeaf
0,0,-1,{},True,CHMIN,116.797619,"{'belowMean': 1, 'aboveMean': 2}",18084.623689,0,False,False
1,1,0,{'CHMIN': 'belowMean'},True,CACH,53.315789,"{'belowMean': 3, 'aboveMean': 4}",1011.543860,1,True,True
2,2,0,{'CHMIN': 'aboveMean'},True,MMAX,250.814815,"{'belowMean': 5, 'aboveMean': 6}",43433.069959,1,False,False
3,3,1,"{'CHMIN': 'belowMean', 'CACH': 'belowMean'}",True,MMAX,36.333333,"{'belowMean': 7, 'aboveMean': 8}",353.412466,2,False,False
4,4,1,"{'CHMIN': 'belowMean', 'CACH': 'aboveMean'}",True,MMAX,117.000000,"{'belowMean': 9, 'aboveMean': 10}",1867.552381,2,True,True
...,...,...,...,...,...,...,...,...,...,...,...
38,38,24,"{'MMIN': 'belowMean', 'CACH': 'belowMean', 'MM...",True,,84.000000,{},0.000000,5,False,True
39,39,25,"{'MMIN': 'belowMean', 'CACH': 'belowMean', 'MM...",True,,34.500000,{},0.000000,5,False,True
40,40,25,"{'MMIN': 'belowMean', 'CACH': 'belowMean', 'MM...",True,,105.000000,{},0.000000,5,False,True
41,41,26,"{'MMIN': 'belowMean', 'CACH': 'belowMean', 'MM...",True,,26.250000,{},0.000000,5,False,True


Unnamed: 0,nodeID,parentNode,dataSetFilters,evaluated,nodeFeature,nodePrediction,childrenNodes,gainOrMSE,treeLevel,pruned,isLeaf
0,0,-1,{},True,month,1.143290,"{7: 1, 8: 2, 9: 3, 3: 4, 4: 5, 2: 6, 6: 7, 10:...",1.708150,0,False,False
1,1,0,{'month': 7},True,day,1.203184,"{7: 11, 5: 12, 1: 13, 6: 14}",0.444833,1,False,False
2,2,0,{'month': 8},True,day,1.077444,"{6: 15, 7: 16, 1: 17, 5: 18, 2: 19, 4: 20, 3: 21}",1.269460,1,True,True
3,3,0,{'month': 9},True,day,1.277972,"{1: 22, 7: 23, 5: 24, 2: 25, 4: 26, 6: 27, 3: 28}",1.992363,1,True,True
4,4,0,{'month': 3},True,day,0.903586,"{5: 29, 2: 30, 7: 31, 6: 32, 1: 33, 4: 34, 3: 35}",1.309460,1,True,True
...,...,...,...,...,...,...,...,...,...,...,...
331,331,319,"{'month': 9, 'wind': 'aboveMean', 'day': 5, 'X...",True,,2.100401,{},0.000000,11,False,True
332,332,320,"{'month': 9, 'wind': 'belowMean', 'Y': 'belowM...",True,,2.021876,{},0.000000,11,False,True
333,333,321,"{'month': 9, 'wind': 'belowMean', 'Y': 'belowM...",True,,2.861545,{},0.000000,11,False,True
334,334,322,"{'month': 9, 'wind': 'belowMean', 'Y': 'aboveM...",True,,4.371073,{},0.000000,11,False,True


In [122]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Values': [1.5, 2.3, 3.7, 4.1, 5.8, 6.2, 7.5, 8.0, 9.3, 10.5]
}
df = pd.DataFrame(data)

# Define bins and labels
bins = [0, 2, 4, 6, 8, 10]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Create the bucketing column
df['Bucket'] = pd.cut(df['Values'], bins=bins, labels=labels, right=False)

print(df)


   Values     Bucket
0     1.5   Very Low
1     2.3        Low
2     3.7        Low
3     4.1     Medium
4     5.8     Medium
5     6.2       High
6     7.5       High
7     8.0  Very High
8     9.3  Very High
9    10.5        NaN
