In [1]:
import pandas as pd
import numpy as np
import os

categoricalDataSets = ['BreastCancer', 'CarEval', 'CongressVoting']
regressionSets = ['Abalone', 'ComputerHardware', 'ForestFires']
dataSets = categoricalDataSets + regressionSets

catOutputPre = []
catOutputPost = []
catOutputExtra = []
regOutputPre = []
regOutputPost = []
regOutputExtra = []

for currentDataSet in dataSets:
    # PRE PRUNED
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/prePrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summary = rawData['success'].mean()
    
    if currentDataSet in categoricalDataSets:
        catOutputPre.append(summary)
    else:
        regOutputPre.append(summary)
        
    # POST PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summary = rawData['success'].mean()
    
    if currentDataSet in categoricalDataSets:
        catOutputPost.append(summary)
    else:
        regOutputPost.append(summary)
        
    # EXTRA PRUNING
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summary = rawData['success'].mean()
    
    if currentDataSet in categoricalDataSets:
        catOutputExtra.append(summary)
    else:
        regOutputExtra.append(summary)
    

catTable = pd.DataFrame({'Pre-Pruned': catOutputPre, 'Post-Pruned': catOutputPost, 'Extra-Pruned': catOutputExtra}, index = categoricalDataSets)
regTable = pd.DataFrame({'Pre-Pruned': regOutputPre, 'Post-Pruned': regOutputPost, 'Extra-Pruned': regOutputExtra}, index = regressionSets)

catTable['Prune Improve %'] = (catTable['Extra-Pruned'] - catTable['Pre-Pruned']) / catTable['Pre-Pruned'] * 100
regTable['Prune Improve %'] = ( regTable['Pre-Pruned'] - regTable['Extra-Pruned']) / regTable['Pre-Pruned'] * 100

display(catTable)
print(catTable.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

display(regTable)
print(regTable.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))


Unnamed: 0,Pre-Pruned,Post-Pruned,Extra-Pruned,Prune Improve %
BreastCancer,0.812821,0.871062,0.934066,14.916629
CarEval,0.899276,0.894067,0.834443,-7.209527
CongressVoting,0.931034,0.943678,0.946552,1.666667


\begin{tabular}{lrrrr}
\toprule
{} &  Pre-Pruned &  Post-Pruned &  Extra-Pruned &  Prune Improve \% \\
\midrule
BreastCancer   &       0.813 &        0.871 &         0.934 &           14.917 \\
CarEval        &       0.899 &        0.894 &         0.834 &           -7.210 \\
CongressVoting &       0.931 &        0.944 &         0.947 &            1.667 \\
\bottomrule
\end{tabular}



  print(catTable.to_latex(index=True,


Unnamed: 0,Pre-Pruned,Post-Pruned,Extra-Pruned,Prune Improve %
Abalone,12.402108,10.8821,7.980036,35.655811
ComputerHardware,14009.979158,14107.12744,19515.064441,-39.294029
ForestFires,4.23686,3.692988,2.726445,35.649392


\begin{tabular}{lrrrr}
\toprule
{} &  Pre-Pruned &  Post-Pruned &  Extra-Pruned &  Prune Improve \% \\
\midrule
Abalone          &      12.402 &       10.882 &         7.980 &           35.656 \\
ComputerHardware &   14009.979 &    14107.127 &     19515.064 &          -39.294 \\
ForestFires      &       4.237 &        3.693 &         2.726 &           35.649 \\
\bottomrule
\end{tabular}



  print(regTable.to_latex(index=True,


In [50]:
categoricalDataSets = ['BreastCancer', 'CarEval', 'CongressVoting']

for currentDataSet in categoricalDataSets:
    # PRE PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/prePrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summaryCount = list(rawData['Class'].value_counts())
    summaryPre = rawData[['Class', 'success']].groupby('Class', as_index=False).mean(['success'])
    summaryPre['Percent of Sample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # POST PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summaryCount = list(rawData['Class'].value_counts())
    summaryPost = rawData[['Class', 'success']].groupby('Class', as_index=False).mean(['success'])
    summaryPost['Percent of Sample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # EXTRA PRUNED
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    summaryCount = list(rawData['Class'].value_counts())
    summaryExtra = rawData[['Class', 'success']].groupby('Class', as_index=False).mean(['success'])
    summaryExtra['Percent of Sample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    successOverview = pd.DataFrame({currentDataSet + ' Class': summaryPre['Class'], 
                                    'Percent of Sample': summaryPre['Percent of Sample'],
                                    'Pre-Prune': summaryPre['success'],
                                    'Post-Prune': summaryPost['success'],
                                    'Extra-Prune': summaryExtra['success'],
                                    'Prune Improve %': (summaryExtra['success'] - summaryPre['success']) / summaryPre['success'] * 100
                                   })
    display(successOverview)
    print(successOverview.to_latex(index=False,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))
    
    
                                    
    
    

Unnamed: 0,BreastCancer Class,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,2,0.650183,0.799437,0.865352,0.957183,19.732206
1,4,0.349817,0.837696,0.881675,0.891099,6.375


\begin{tabular}{rrrrrr}
\toprule
 BreastCancer Class &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
                  2 &              0.650 &      0.799 &       0.865 &        0.957 &           19.732 \\
                  4 &              0.350 &      0.838 &       0.882 &        0.891 &            6.375 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=False,


Unnamed: 0,CarEval Class,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,acc,0.700434,0.818241,0.833225,0.889902,8.757962
1,good,0.222142,0.512727,0.48,0.214545,-58.156028
2,unacc,0.039797,0.955785,0.948967,0.878099,-8.127972
3,vgood,0.037627,0.734615,0.669231,0.35,-52.356021


\begin{tabular}{lrrrrr}
\toprule
CarEval Class &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
          acc &              0.700 &      0.818 &       0.833 &        0.890 &            8.758 \\
         good &              0.222 &      0.513 &       0.480 &        0.215 &          -58.156 \\
        unacc &              0.040 &      0.956 &       0.949 &        0.878 &           -8.128 \\
        vgood &              0.038 &      0.735 &       0.669 &        0.350 &          -52.356 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=False,


Unnamed: 0,CongressVoting Class,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,democrat,0.614943,0.942056,0.941121,0.942056,0.0
1,republican,0.385057,0.90597,0.947761,0.953731,5.271829


\begin{tabular}{lrrrrr}
\toprule
CongressVoting Class &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
            democrat &              0.615 &      0.942 &       0.941 &        0.942 &            0.000 \\
          republican &              0.385 &      0.906 &       0.948 &        0.954 &            5.272 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=False,


In [52]:
regressionSets = ['Abalone', 'ComputerHardware', 'ForestFires']

for currentDataSet in regressionSets:
    # PRE PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/prePrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)
    
    rawData['ClassQuartile'] = pd.qcut(rawData['Class'], q = 4, duplicates = 'drop')

    dataCopy = rawData.copy()
    summaryCount = dataCopy['ClassQuartile'].value_counts()
    summaryPre = dataCopy[['ClassQuartile', 'success']].groupby('ClassQuartile', as_index=False).mean(['success'])
    summaryPre['PercentOfSample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # POST PRUNED
    filePath = currentDataSet + "/UsedTestCases/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)
    
    rawData['ClassQuartile'] = pd.qcut(rawData['Class'], q = 4, duplicates = 'drop')
    rawData['ClassQuartile'] = rawData['ClassQuartile'].astype(str)
    
    dataCopy = rawData.copy()
    summaryCount = list(dataCopy['ClassQuartile'].value_counts())
    summaryPost = dataCopy[['ClassQuartile', 'success']].groupby('ClassQuartile', as_index=False).mean(['success'])
    summaryPost['PercentOfSample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    # EXTRA PRUNED
    filePath = currentDataSet + "/UsedTestCases_COPY/TestResults/postPrunedTreeResults.csv"
    rawData = pd.read_csv(filePath, index_col=0)

    rawData['ClassQuartile'] = pd.qcut(rawData['Class'], q = 4, duplicates = 'drop')
    rawData['ClassQuartile'] = rawData['ClassQuartile'].astype(str)
    
    dataCopy = rawData.copy()
    summaryCount = list(dataCopy['ClassQuartile'].value_counts())
    summaryExtra = dataCopy[['ClassQuartile', 'success']].groupby('ClassQuartile', as_index=False).mean(['success'])
    summaryExtra['PercentOfSample'] = [currentCount / sum(summaryCount) for currentCount in summaryCount]
    
    successOverview = pd.DataFrame({currentDataSet + ' Class Quartile': summaryPre['ClassQuartile'], 
                                    'Percent of Sample': summaryPre['PercentOfSample'],
                                    'Pre-Prune': summaryPre['success'],
                                    'Post-Prune': summaryPost['success'],
                                    'Extra-Prune': summaryExtra['success'],
                                    'Prune Improve %': (summaryPre['success'] - summaryExtra['success']) / summaryPre['success'] * 100
                                   })
    
    successOverview = successOverview.sort_values(by=currentDataSet + ' Class Quartile')
    
    display(successOverview)
    
    print(successOverview.to_latex(index=False,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

Unnamed: 0,Abalone Class Quartile,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,"(0.999, 8.0]",0.338899,9.739051,9.253119,7.817857,19.726704
1,"(8.0, 9.0]",0.268881,10.558821,19.567833,16.567332,-56.905124
2,"(9.0, 11.0]",0.229384,9.128901,8.599053,4.974484,45.508406
3,"(11.0, 29.0]",0.162837,21.558801,6.908067,2.678771,87.574581


\begin{tabular}{lrrrrr}
\toprule
Abalone Class Quartile &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
          (0.999, 8.0] &              0.339 &      9.739 &       9.253 &        7.818 &           19.727 \\
            (8.0, 9.0] &              0.269 &     10.559 &      19.568 &       16.567 &          -56.905 \\
           (9.0, 11.0] &              0.229 &      9.129 &       8.599 &        4.974 &           45.508 \\
          (11.0, 29.0] &              0.163 &     21.559 &       6.908 &        2.679 &           87.575 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=False,


Unnamed: 0,ComputerHardware Class Quartile,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,"(5.999, 27.0]",0.261078,717.661709,49921.397897,58446.824548,-8044.063397
1,"(27.0, 50.0]",0.252695,1676.650385,1795.397805,5692.969408,-239.544216
2,"(50.0, 114.0]",0.247904,3087.13342,814.018048,814.018048,73.631912
3,"(114.0, 1150.0]",0.238323,51048.582809,4435.012655,13988.748682,72.597185


\begin{tabular}{lrrrrr}
\toprule
ComputerHardware Class Quartile &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
                  (5.999, 27.0] &              0.261 &    717.662 &   49921.398 &    58446.825 &        -8044.063 \\
                   (27.0, 50.0] &              0.253 &   1676.650 &    1795.398 &     5692.969 &         -239.544 \\
                  (50.0, 114.0] &              0.248 &   3087.133 &     814.018 &      814.018 &           73.632 \\
                (114.0, 1150.0] &              0.238 &  51048.583 &    4435.013 &    13988.749 &           72.597 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=False,


Unnamed: 0,ForestFires Class Quartile,Percent of Sample,Pre-Prune,Post-Prune,Extra-Prune,Prune Improve %
0,"(-0.001, 0.385]",0.500483,3.514081,3.022144,1.94241,44.724955
1,"(0.385, 2.029]",0.250725,2.84586,1.843477,0.861001,69.745501
2,"(2.029, 6.996]",0.248792,7.092642,6.906366,6.183583,12.816928


\begin{tabular}{lrrrrr}
\toprule
ForestFires Class Quartile &  Percent of Sample &  Pre-Prune &  Post-Prune &  Extra-Prune &  Prune Improve \% \\
\midrule
           (-0.001, 0.385] &              0.500 &      3.514 &       3.022 &        1.942 &           44.725 \\
            (0.385, 2.029] &              0.251 &      2.846 &       1.843 &        0.861 &           69.746 \\
            (2.029, 6.996] &              0.249 &      7.093 &       6.906 &        6.184 &           12.817 \\
\bottomrule
\end{tabular}



  print(successOverview.to_latex(index=False,


In [54]:
categoricalDataSets = ['BreastCancer', 'CarEval', 'CongressVoting']
regressionSets = ['Abalone', 'ComputerHardware', 'ForestFires']
dataSets = categoricalDataSets + regressionSets

for currentDataSet in regressionSets:
    currentTreeOutput = pd.DataFrame()
    for i in range(1,11):

        filePath = currentDataSet + "/UsedTestCases_COPY/Tree" + str(i) + "/postPruneTree.csv"
        rawData = pd.read_csv(filePath, index_col=0)
        
        currentTreeOutput = pd.concat([currentTreeOutput, rawData])

    currentTreeOutput['originalLeaf'] = ~currentTreeOutput['pruned'] & currentTreeOutput['isLeaf']
    leaves = pd.DataFrame(currentTreeOutput[['originalLeaf', 'pruned']].value_counts().reset_index())/10
    leaves['NodeType'] = ['Non-Leaf', 'Original Leaf', 'Pruned Leaf']
    
    output = pd.DataFrame({'Node Type': leaves['NodeType'],
                          'Average Num Nodes': leaves[0]})
    
    display(output)
    
    print(output.to_latex(index=False,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))



Unnamed: 0,Node Type,Average Num Nodes
0,Non-Leaf,113.1
1,Original Leaf,105.8
2,Pruned Leaf,30.7


\begin{tabular}{lr}
\toprule
    Node Type &  Average Num Nodes \\
\midrule
     Non-Leaf &            113.100 \\
Original Leaf &            105.800 \\
  Pruned Leaf &             30.700 \\
\bottomrule
\end{tabular}



  print(output.to_latex(index=False,


Unnamed: 0,Node Type,Average Num Nodes
0,Non-Leaf,21.5
1,Original Leaf,20.3
2,Pruned Leaf,4.0


\begin{tabular}{lr}
\toprule
    Node Type &  Average Num Nodes \\
\midrule
     Non-Leaf &             21.500 \\
Original Leaf &             20.300 \\
  Pruned Leaf &              4.000 \\
\bottomrule
\end{tabular}



  print(output.to_latex(index=False,


Unnamed: 0,Node Type,Average Num Nodes
0,Non-Leaf,168.8
1,Original Leaf,157.5
2,Pruned Leaf,26.8


\begin{tabular}{lr}
\toprule
    Node Type &  Average Num Nodes \\
\midrule
     Non-Leaf &            168.800 \\
Original Leaf &            157.500 \\
  Pruned Leaf &             26.800 \\
\bottomrule
\end{tabular}



  print(output.to_latex(index=False,


In [122]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Values': [1.5, 2.3, 3.7, 4.1, 5.8, 6.2, 7.5, 8.0, 9.3, 10.5]
}
df = pd.DataFrame(data)

# Define bins and labels
bins = [0, 2, 4, 6, 8, 10]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Create the bucketing column
df['Bucket'] = pd.cut(df['Values'], bins=bins, labels=labels, right=False)

print(df)


   Values     Bucket
0     1.5   Very Low
1     2.3        Low
2     3.7        Low
3     4.1     Medium
4     5.8     Medium
5     6.2       High
6     7.5       High
7     8.0  Very High
8     9.3  Very High
9    10.5        NaN
