In [18]:
import os
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: C:\Users\thoma\PycharmProjects\IntroMachineLearning\Project1


In [52]:
import numpy as np
from ucimlrepo import fetch_ucirepo
import pandas as pd
import os

def dataSourcing(dataName):
    """

    @param dataName:
    @return:
    """

    featurePath = dataName + "/featureData.csv"
    targetPath = dataName + "/targetData.csv"

    if dataName == 'BreastCancer':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + "data does not exist, reading from source")

            # fetch dataset
            breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

            # data (as pandas dataframes)
            dataFeatures = pd.DataFrame(breast_cancer_wisconsin_original.data.features)
            dataTargets = pd.DataFrame(breast_cancer_wisconsin_original.data.targets)

            # write to local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # remove NAs
        dataFeatures = dataFeatures.dropna()

        # assign types to all columns
        dataFeatures.loc[:, 'Clump_thickness'] = dataFeatures['Clump_thickness'].astype(float)
        dataFeatures.loc[:, 'Uniformity_of_cell_size'] = dataFeatures['Uniformity_of_cell_size'].astype(float)
        dataFeatures.loc[:, 'Uniformity_of_cell_shape'] = dataFeatures['Uniformity_of_cell_shape'].astype(float)
        dataFeatures.loc[:, 'Marginal_adhesion'] = dataFeatures['Marginal_adhesion'].astype(float)
        dataFeatures.loc[:, 'Single_epithelial_cell_size'] = dataFeatures['Single_epithelial_cell_size'].astype(float)
        dataFeatures.loc[:, 'Bare_nuclei'] = dataFeatures['Bare_nuclei'].astype(float)
        dataFeatures.loc[:, 'Bland_chromatin'] = dataFeatures['Bland_chromatin'].astype(float)
        dataFeatures.loc[:, 'Normal_nucleoli'] = dataFeatures['Normal_nucleoli'].astype(float)
        dataFeatures.loc[:, 'Mitoses'] = dataFeatures['Mitoses'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'CarEval':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + " data does not exist, reading from source")

            # fetch dataset
            car_evaluation = fetch_ucirepo(id=19)

            # store relevant data (as pandas dataframes)
            dataFeatures = pd.DataFrame(car_evaluation.data.features)
            dataTargets = pd.DataFrame(car_evaluation.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # assign types to all columns
        dataFeatures = pd.get_dummies(dataFeatures).astype(bool)

        # set the name of the target column
        dataTargets.columns = ['Class']

    elif dataName == 'CongressVoting':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + " data does not exist, reading from source")

            # fetch dataset
            congressional_voting_records = fetch_ucirepo(id=105)

            # store relevant data (as pandas dataframes)
            dataFeatures = pd.DataFrame(congressional_voting_records.data.features)
            dataTargets = pd.DataFrame(congressional_voting_records.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # reset the data to be integers. helps to account for abstain votes which will be 0's
        dataFeatures = dataFeatures.replace({'y': 1, 'n': -1})
        dataFeatures = dataFeatures.fillna(0)

        # assign types to all columns
        dataFeatures.loc[:, 'handicapped-infants'] = dataFeatures['handicapped-infants'].astype(int)
        dataFeatures.loc[:, 'water-project-cost-sharing'] = dataFeatures['water-project-cost-sharing'].astype(int)
        dataFeatures.loc[:, 'adoption-of-the-budget-resolution'] = dataFeatures[
            'adoption-of-the-budget-resolution'].astype(int)
        dataFeatures.loc[:, 'physician-fee-freeze'] = dataFeatures['physician-fee-freeze'].astype(int)
        dataFeatures.loc[:, 'el-salvador-aid'] = dataFeatures['el-salvador-aid'].astype(int)
        dataFeatures.loc[:, 'religious-groups-in-schools'] = dataFeatures['religious-groups-in-schools'].astype(int)
        dataFeatures.loc[:, 'anti-satellite-test-ban'] = dataFeatures['anti-satellite-test-ban'].astype(int)
        dataFeatures.loc[:, 'aid-to-nicaraguan-contras'] = dataFeatures['aid-to-nicaraguan-contras'].astype(int)
        dataFeatures.loc[:, 'mx-missile'] = dataFeatures['mx-missile'].astype(int)
        dataFeatures.loc[:, 'immigration'] = dataFeatures['immigration'].astype(int)
        dataFeatures.loc[:, 'synfuels-corporation-cutback'] = dataFeatures['synfuels-corporation-cutback'].astype(int)
        dataFeatures.loc[:, 'education-spending'] = dataFeatures['education-spending'].astype(int)
        dataFeatures.loc[:, 'superfund-right-to-sue'] = dataFeatures['superfund-right-to-sue'].astype(int)
        dataFeatures.loc[:, 'crime'] = dataFeatures['crime'].astype(int)
        dataFeatures.loc[:, 'duty-free-exports'] = dataFeatures['duty-free-exports'].astype(int)
        dataFeatures.loc[:, 'export-administration-act-south-africa'] = dataFeatures[
            'export-administration-act-south-africa'].astype(int)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'Abalone':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + " data does not exist, reading from source")

            # fetch dataset
            abalone = fetch_ucirepo(id=1)

            # store relevant data (as pandas dataframes)
            dataFeatures = pd.DataFrame(abalone.data.features)
            dataTargets = pd.DataFrame(abalone.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # assign types to all columns
        # performing one hot encoding for the sex column and then dropping the original column
        dataFeatures_Sex = dataFeatures['Sex']
        dataFeatures_Sex = pd.get_dummies(dataFeatures_Sex).astype(bool)
        dataFeatures = dataFeatures.join(dataFeatures_Sex)
        dataFeatures = dataFeatures.drop(columns='Sex')

        # normalize the rest
        dataFeatures.loc[:, 'Length'] = dataFeatures['Length'].astype(float)
        dataFeatures.loc[:, 'Diameter'] = dataFeatures['Diameter'].astype(float)
        dataFeatures.loc[:, 'Height'] = dataFeatures['Height'].astype(float)
        dataFeatures.loc[:, 'Whole_weight'] = dataFeatures['Whole_weight'].astype(float)
        dataFeatures.loc[:, 'Shucked_weight'] = dataFeatures['Shucked_weight'].astype(float)
        dataFeatures.loc[:, 'Viscera_weight'] = dataFeatures['Viscera_weight'].astype(float)
        dataFeatures.loc[:, 'Shell_weight'] = dataFeatures['Shell_weight'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'ComputerHardware':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + "data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + "data does not exist, reading from source")

            # fetch dataset
            computer_hardware = fetch_ucirepo(id=29)

            # data (as pandas dataframes)
            dataFeatures = pd.DataFrame(computer_hardware.data.features)
            dataTargets = pd.DataFrame(dataFeatures['PRP'])

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # drop the columns that we do not need
        dataFeatures = dataFeatures.drop(columns=['VendorName', 'ModelName', 'PRP', 'ERP'])

        # assign types to all columns, normalizing these
        dataFeatures.loc[:, 'MYCT'] = dataFeatures['MYCT'].astype(float)
        dataFeatures.loc[:, 'MMIN'] = dataFeatures['MMIN'].astype(float)
        dataFeatures.loc[:, 'MMAX'] = dataFeatures['MMAX'].astype(float)
        dataFeatures.loc[:, 'CACH'] = dataFeatures['CACH'].astype(float)
        dataFeatures.loc[:, 'CHMIN'] = dataFeatures['CHMIN'].astype(float)
        dataFeatures.loc[:, 'CHMAX'] = dataFeatures['CHMAX'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'ForestFires':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + "data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + "data does not exist, reading from source")

            # fetch dataset
            forest_fires = fetch_ucirepo(id=162)

            # data (as pandas dataframes)
            dataFeatures = pd.DataFrame(forest_fires.data.features)
            dataTargets = pd.DataFrame(forest_fires.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # assign types to all columns
        # setting these to integers as they are ordinal values
        dataFeatures.loc[:, 'X'] = dataFeatures['X'].astype(int)
        dataFeatures.loc[:, 'Y'] = dataFeatures['Y'].astype(int)

        # adjust these date values to relative integer values
        month_dict = {
            'jan': 1,
            'feb': 2,
            'mar': 3,
            'apr': 4,
            'may': 5,
            'jun': 6,
            'jul': 7,
            'aug': 8,
            'sep': 9,
            'oct': 10,
            'nov': 11,
            'dec': 12
        }
        # Replace month names with integer values
        dataFeatures.loc[:, 'month'] = dataFeatures['month'].replace(month_dict)
        dataFeatures.loc[:, 'month'] = dataFeatures['X'].astype(int)

        day_dict = {
            'mon': 1,
            'tue': 2,
            'wed': 3,
            'thu': 4,
            'fri': 5,
            'sat': 6,
            'sun': 7
        }
        # Replace month names with integer values
        dataFeatures.loc[:, 'day'] = dataFeatures['day'].replace(day_dict)
        dataFeatures.loc[:, 'day'] = dataFeatures['day'].astype(int)

        # normalize these
        dataFeatures.loc[:, 'FFMC'] = dataFeatures['FFMC'].astype(float)
        dataFeatures.loc[:, 'DMC'] = dataFeatures['DMC'].astype(float)
        dataFeatures.loc[:, 'DC'] = dataFeatures['DC'].astype(float)
        dataFeatures.loc[:, 'ISI'] = dataFeatures['ISI'].astype(float)
        dataFeatures.loc[:, 'temp'] = dataFeatures['temp'].astype(float)
        dataFeatures.loc[:, 'RH'] = dataFeatures['RH'].astype(float)
        dataFeatures.loc[:, 'wind'] = dataFeatures['wind'].astype(float)
        dataFeatures.loc[:, 'rain'] = dataFeatures['rain'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']
        dataTargets.loc[:, 'Class'] = np.log1p(dataTargets['Class'])

    return dataFeatures, dataTargets



In [99]:
currentDataSet = 'CarEval'
features, targets = dataSourcing(currentDataSet)

print(features.info())

CarEval data exists, reading from csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1728 entries, 0 to 1727
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   buying_high     1728 non-null   bool 
 1   buying_low      1728 non-null   bool 
 2   buying_med      1728 non-null   bool 
 3   buying_vhigh    1728 non-null   bool 
 4   maint_high      1728 non-null   bool 
 5   maint_low       1728 non-null   bool 
 6   maint_med       1728 non-null   bool 
 7   maint_vhigh     1728 non-null   bool 
 8   doors_2         1728 non-null   bool 
 9   doors_3         1728 non-null   bool 
 10  doors_4         1728 non-null   bool 
 11  doors_5more     1728 non-null   bool 
 12  persons_2       1728 non-null   bool 
 13  persons_4       1728 non-null   bool 
 14  persons_more    1728 non-null   bool 
 15  lug_boot_big    1728 non-null   bool 
 16  lug_boot_med    1728 non-null   bool 
 17  lug_boot_small  1728 non-null   b

In [113]:
# check summary of the targets
targetsSummary = targets.value_counts()
display(targetsSummary)

tuningFilePath = currentDataSet + '/KEEPTestCases/ParameterTuningFile.csv'
crossValFilePath = currentDataSet + '/KEEPTestCases/CrossValidationTestFile.csv'

parameterTuningOutput = pd.read_csv(tuningFilePath, index_col=0)
parameterTuningOutput = parameterTuningOutput.sort_values(by = ['AveragePerformance'], ascending = False)

crossValidationOutput = pd.read_csv(crossValFilePath, index_col=0)

display(parameterTuningOutput)
display(crossValidationOutput)

#overall performance
print(crossValidationOutput['correctAssignment'].mean())

#see average inputs per value
averageInputsPerInput = features.join(targets)
averageInputsPerInput = pd.DataFrame(averageInputsPerInput.groupby('Class')[features.columns].mean())

display(averageInputsPerInput[averageInputsPerInput.columns[0:4]])
display(averageInputsPerInput[averageInputsPerInput.columns[4:8]])
display(averageInputsPerInput[averageInputsPerInput.columns[8:12]])
display(averageInputsPerInput[averageInputsPerInput.columns[12:15]])
display(averageInputsPerInput[averageInputsPerInput.columns[15:18]])
display(averageInputsPerInput[averageInputsPerInput.columns[[19, 20, 18]]])

print(averageInputsPerInput[averageInputsPerInput.columns[8:12]].to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

print(averageInputsPerInput[averageInputsPerInput.columns[[19, 20, 18]]].to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

#group performance by class
performanceByClassSuccess = pd.DataFrame(crossValidationOutput.groupby('actualValue')['correctAssignment'].mean())
performanceByClassSuccess.columns = ['% correct']
performanceByClassSuccess.loc[:,'% correct'] = performanceByClassSuccess['% correct'] * 100
performanceByClassCount = pd.DataFrame(crossValidationOutput.groupby('actualValue')['correctAssignment'].count())
performanceByClassCount.columns = ['totalCases']
performanceByClassSuccess = performanceByClassSuccess.join(performanceByClassCount)
display(performanceByClassSuccess)

print(targetsSummary.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

print(parameterTuningOutput.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

print(performanceByClassSuccess.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))



Class
unacc    1210
acc       384
good       69
vgood      65
dtype: int64

Unnamed: 0,p,k,e,s,AveragePerformance,TestsRun
2,1,16,1,1,0.83237,10
4,2,14,1,1,0.829191,10
1,1,8,1,1,0.824277,10
5,2,6,1,1,0.822543,10
3,2,12,1,1,0.813873,10


Unnamed: 0,testID,nearestNeighbors,expectedValue,actualValue,correctAssignment
1449,1,"[1476, 1557, 1451, 1477, 1457, 155, 180, 567, ...",unacc,unacc,True
900,1,"[468, 927, 1224, 1226, 42, 252, 576, 879, 928,...",unacc,unacc,True
1137,1,"[1134, 1139, 1148, 939, 1086, 1112, 1119, 1141...",unacc,unacc,True
65,1,"[281, 173, 1361, 821, 1037, 1685, 10, 14, 20, ...",unacc,unacc,True
274,1,"[271, 382, 1579, 139, 169, 175, 221, 278, 733,...",unacc,unacc,True
...,...,...,...,...,...
1712,5,"[1388, 1280, 1685, 857, 1702, 1640, 407, 839, ...",acc,good,False
1717,5,"[421, 1663, 1609, 1285, 1714, 1636, 1393, 1702...",good,good,True
1721,5,"[857, 1640, 1505, 1397, 1724, 1667, 1613, 1702...",unacc,good,False
1723,5,"[427, 1724, 1714, 1702, 424, 856, 1400, 751, 1...",acc,good,False


0.8069464544138929


Unnamed: 0_level_0,buying_high,buying_low,buying_med,buying_vhigh
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,0.28125,0.231771,0.299479,0.1875
good,0.0,0.666667,0.333333,0.0
unacc,0.267769,0.213223,0.221488,0.297521
vgood,0.0,0.6,0.4,0.0


Unnamed: 0_level_0,maint_high,maint_low,maint_med,maint_vhigh
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,0.273438,0.239583,0.299479,0.1875
good,0.0,0.666667,0.333333,0.0
unacc,0.259504,0.221488,0.221488,0.297521
vgood,0.2,0.4,0.4,0.0


Unnamed: 0_level_0,doors_2,doors_3,doors_4,doors_5more
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,0.210938,0.257812,0.265625,0.265625
good,0.217391,0.26087,0.26087,0.26087
unacc,0.269421,0.247934,0.241322,0.241322
vgood,0.153846,0.230769,0.307692,0.307692


Unnamed: 0_level_0,persons_2,persons_4,persons_more
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
acc,0.0,0.515625,0.484375
good,0.0,0.521739,0.478261
unacc,0.476033,0.257851,0.266116
vgood,0.0,0.461538,0.538462


Unnamed: 0_level_0,lug_boot_big,lug_boot_med,lug_boot_small
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
acc,0.375,0.351562,0.273438
good,0.347826,0.347826,0.304348
unacc,0.304132,0.323967,0.371901
vgood,0.615385,0.384615,0.0


Unnamed: 0_level_0,safety_low,safety_med,safety_high
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
acc,0.0,0.46875,0.53125
good,0.0,0.565217,0.434783
unacc,0.476033,0.295041,0.228926
vgood,0.0,0.0,1.0


\begin{tabular}{lrrrr}
\toprule
{} &  doors\_2 &  doors\_3 &  doors\_4 &  doors\_5more \\
Class &          &          &          &              \\
\midrule
acc   &    0.211 &    0.258 &    0.266 &        0.266 \\
good  &    0.217 &    0.261 &    0.261 &        0.261 \\
unacc &    0.269 &    0.248 &    0.241 &        0.241 \\
vgood &    0.154 &    0.231 &    0.308 &        0.308 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrr}
\toprule
{} &  safety\_low &  safety\_med &  safety\_high \\
Class &             &             &              \\
\midrule
acc   &       0.000 &       0.469 &        0.531 \\
good  &       0.000 &       0.565 &        0.435 \\
unacc &       0.476 &       0.295 &        0.229 \\
vgood &       0.000 &       0.000 &        1.000 \\
\bottomrule
\end{tabular}



  print(averageInputsPerInput[averageInputsPerInput.columns[8:12]].to_latex(index=True,
  print(averageInputsPerInput[averageInputsPerInput.columns[[19, 20, 18]]].to_latex(index=True,


Unnamed: 0_level_0,% correct,totalCases
actualValue,Unnamed: 1_level_1,Unnamed: 2_level_1
acc,58.566775,1535
good,21.090909,275
unacc,93.92562,4840
vgood,28.076923,260


\begin{tabular}{lr}
\toprule
      &     0 \\
Class &       \\
\midrule
unacc &  1210 \\
acc &   384 \\
good &    69 \\
vgood &    65 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrrrr}
\toprule
{} &  p &   k &  e &  s &  AveragePerformance &  TestsRun \\
\midrule
2 &  1 &  16 &  1 &  1 &               0.832 &        10 \\
4 &  2 &  14 &  1 &  1 &               0.829 &        10 \\
1 &  1 &   8 &  1 &  1 &               0.824 &        10 \\
5 &  2 &   6 &  1 &  1 &               0.823 &        10 \\
3 &  2 &  12 &  1 &  1 &               0.814 &        10 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
{} &  \% correct &  totalCases \\
actualValue &            &             \\
\midrule
acc         &     58.567 &        1535 \\
good        &     21.091 &         275 \\
unacc       &     93.926 &        4840 \\
vgood       &     28.077 &         260 \\
\bottomrule
\end{tabular}



  print(targetsSummary.to_latex(index=True,
  print(parameterTuningOutput.to_latex(index=True,
  print(performanceByClassSuccess.to_latex(index=True,


In [96]:
# check summary of the targets
featuresSummary = features.describe(percentiles=[.05, .1, .15, .2, .25,
                                                 .30, .35, .40, .45, .5,
                                                 .55, .60, .65, .70, .75,
                                                 .8, .85, .9, .95, 1])
targetsSummary = targets.describe(percentiles=[.05, .1, .15, .2, .25,
                                                 .30, .35, .40, .45, .5,
                                                 .55, .60, .65, .70, .75,
                                                 .8, .85, .9, .95, 1])

print(featuresSummary)
#display(targetsSummary)

       buying_high buying_low buying_med buying_vhigh maint_high maint_low  \
count         1728       1728       1728         1728       1728      1728   
unique           2          2          2            2          2         2   
top          False      False      False        False      False     False   
freq          1296       1296       1296         1296       1296      1296   

       maint_med maint_vhigh doors_2 doors_3  ... doors_5more persons_2  \
count       1728        1728    1728    1728  ...        1728      1728   
unique         2           2       2       2  ...           2         2   
top        False       False   False   False  ...       False     False   
freq        1296        1296    1296    1296  ...        1296      1152   

       persons_4 persons_more lug_boot_big lug_boot_med lug_boot_small  \
count       1728         1728         1728         1728           1728   
unique         2            2            2            2              2   
top        

In [93]:
crossValFilePath = currentDataSet + '/KEEPTestCases/CrossValidationTestFile.csv'

parameterTuningOutput = pd.read_csv(crossValFilePath, index_col=0)

parameterTuningOutput2 = parameterTuningOutput[parameterTuningOutput['testID'].isin([2, 4, 6, 8, 10])]

parameterTuningOutput2.loc[:,'testID'] = (parameterTuningOutput2['testID']/2).astype(int)

display(parameterTuningOutput2)
parameterTuningOutput2 = parameterTuningOutput2.drop(parameterTuningOutput2.index.to_list())
display(parameterTuningOutput2)

#parameterTuningOutput2.to_csv(crossValFilePath, index=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parameterTuningOutput2.loc[:,'testID'] = (parameterTuningOutput2['testID']/2).astype(int)


Unnamed: 0,testID,nearestNeighbors,expectedValue,actualValue,correctAssignment
344,1,"[350, 128, 342, 1208, 1640, 227, 1199, 1667, 2...",unacc,unacc,True
604,1,"[550, 607, 1468, 556, 173, 499, 553, 721, 739,...",unacc,unacc,True
20,1,"[23, 128, 884, 227, 350, 9, 17, 38, 218, 342, ...",unacc,unacc,True
15,1,"[9, 17, 96, 1311, 771, 1313, 43, 124, 225, 109...",unacc,unacc,True
1433,1,"[1439, 1649, 1442, 1451, 1448, 1550, 1667, 677...",unacc,unacc,True
...,...,...,...,...,...
1685,2,"[1691, 1658, 1577, 1472, 1721, 1280, 1352, 167...",unacc,good,False
1690,2,"[394, 1691, 1636, 1285, 1714, 1672, 1255, 1473...",unacc,good,False
1696,2,"[1372, 1723, 1642, 833, 1453, 292, 1724, 1265,...",good,good,True
1712,2,"[1721, 1658, 1280, 1724, 1691, 1283, 1714, 157...",unacc,good,False


Unnamed: 0,testID,nearestNeighbors,expectedValue,actualValue,correctAssignment
