In [18]:
import os
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: C:\Users\thoma\PycharmProjects\IntroMachineLearning\Project1


In [52]:
import numpy as np
from ucimlrepo import fetch_ucirepo
import pandas as pd
import os

def dataSourcing(dataName):
    """

    @param dataName:
    @return:
    """

    featurePath = dataName + "/featureData.csv"
    targetPath = dataName + "/targetData.csv"

    if dataName == 'BreastCancer':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + "data does not exist, reading from source")

            # fetch dataset
            breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

            # data (as pandas dataframes)
            dataFeatures = pd.DataFrame(breast_cancer_wisconsin_original.data.features)
            dataTargets = pd.DataFrame(breast_cancer_wisconsin_original.data.targets)

            # write to local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # remove NAs
        dataFeatures = dataFeatures.dropna()

        # assign types to all columns
        dataFeatures.loc[:, 'Clump_thickness'] = dataFeatures['Clump_thickness'].astype(float)
        dataFeatures.loc[:, 'Uniformity_of_cell_size'] = dataFeatures['Uniformity_of_cell_size'].astype(float)
        dataFeatures.loc[:, 'Uniformity_of_cell_shape'] = dataFeatures['Uniformity_of_cell_shape'].astype(float)
        dataFeatures.loc[:, 'Marginal_adhesion'] = dataFeatures['Marginal_adhesion'].astype(float)
        dataFeatures.loc[:, 'Single_epithelial_cell_size'] = dataFeatures['Single_epithelial_cell_size'].astype(float)
        dataFeatures.loc[:, 'Bare_nuclei'] = dataFeatures['Bare_nuclei'].astype(float)
        dataFeatures.loc[:, 'Bland_chromatin'] = dataFeatures['Bland_chromatin'].astype(float)
        dataFeatures.loc[:, 'Normal_nucleoli'] = dataFeatures['Normal_nucleoli'].astype(float)
        dataFeatures.loc[:, 'Mitoses'] = dataFeatures['Mitoses'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'CarEval':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + " data does not exist, reading from source")

            # fetch dataset
            car_evaluation = fetch_ucirepo(id=19)

            # store relevant data (as pandas dataframes)
            dataFeatures = pd.DataFrame(car_evaluation.data.features)
            dataTargets = pd.DataFrame(car_evaluation.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # assign types to all columns
        dataFeatures = pd.get_dummies(dataFeatures).astype(bool)

        # set the name of the target column
        dataTargets.columns = ['Class']

    elif dataName == 'CongressVoting':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + " data does not exist, reading from source")

            # fetch dataset
            congressional_voting_records = fetch_ucirepo(id=105)

            # store relevant data (as pandas dataframes)
            dataFeatures = pd.DataFrame(congressional_voting_records.data.features)
            dataTargets = pd.DataFrame(congressional_voting_records.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # reset the data to be integers. helps to account for abstain votes which will be 0's
        dataFeatures = dataFeatures.replace({'y': 1, 'n': -1})
        dataFeatures = dataFeatures.fillna(0)

        # assign types to all columns
        dataFeatures.loc[:, 'handicapped-infants'] = dataFeatures['handicapped-infants'].astype(int)
        dataFeatures.loc[:, 'water-project-cost-sharing'] = dataFeatures['water-project-cost-sharing'].astype(int)
        dataFeatures.loc[:, 'adoption-of-the-budget-resolution'] = dataFeatures[
            'adoption-of-the-budget-resolution'].astype(int)
        dataFeatures.loc[:, 'physician-fee-freeze'] = dataFeatures['physician-fee-freeze'].astype(int)
        dataFeatures.loc[:, 'el-salvador-aid'] = dataFeatures['el-salvador-aid'].astype(int)
        dataFeatures.loc[:, 'religious-groups-in-schools'] = dataFeatures['religious-groups-in-schools'].astype(int)
        dataFeatures.loc[:, 'anti-satellite-test-ban'] = dataFeatures['anti-satellite-test-ban'].astype(int)
        dataFeatures.loc[:, 'aid-to-nicaraguan-contras'] = dataFeatures['aid-to-nicaraguan-contras'].astype(int)
        dataFeatures.loc[:, 'mx-missile'] = dataFeatures['mx-missile'].astype(int)
        dataFeatures.loc[:, 'immigration'] = dataFeatures['immigration'].astype(int)
        dataFeatures.loc[:, 'synfuels-corporation-cutback'] = dataFeatures['synfuels-corporation-cutback'].astype(int)
        dataFeatures.loc[:, 'education-spending'] = dataFeatures['education-spending'].astype(int)
        dataFeatures.loc[:, 'superfund-right-to-sue'] = dataFeatures['superfund-right-to-sue'].astype(int)
        dataFeatures.loc[:, 'crime'] = dataFeatures['crime'].astype(int)
        dataFeatures.loc[:, 'duty-free-exports'] = dataFeatures['duty-free-exports'].astype(int)
        dataFeatures.loc[:, 'export-administration-act-south-africa'] = dataFeatures[
            'export-administration-act-south-africa'].astype(int)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'Abalone':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + " data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + " data does not exist, reading from source")

            # fetch dataset
            abalone = fetch_ucirepo(id=1)

            # store relevant data (as pandas dataframes)
            dataFeatures = pd.DataFrame(abalone.data.features)
            dataTargets = pd.DataFrame(abalone.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # assign types to all columns
        # performing one hot encoding for the sex column and then dropping the original column
        dataFeatures_Sex = dataFeatures['Sex']
        dataFeatures_Sex = pd.get_dummies(dataFeatures_Sex).astype(bool)
        dataFeatures = dataFeatures.join(dataFeatures_Sex)
        dataFeatures = dataFeatures.drop(columns='Sex')

        # normalize the rest
        dataFeatures.loc[:, 'Length'] = dataFeatures['Length'].astype(float)
        dataFeatures.loc[:, 'Diameter'] = dataFeatures['Diameter'].astype(float)
        dataFeatures.loc[:, 'Height'] = dataFeatures['Height'].astype(float)
        dataFeatures.loc[:, 'Whole_weight'] = dataFeatures['Whole_weight'].astype(float)
        dataFeatures.loc[:, 'Shucked_weight'] = dataFeatures['Shucked_weight'].astype(float)
        dataFeatures.loc[:, 'Viscera_weight'] = dataFeatures['Viscera_weight'].astype(float)
        dataFeatures.loc[:, 'Shell_weight'] = dataFeatures['Shell_weight'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'ComputerHardware':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + "data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + "data does not exist, reading from source")

            # fetch dataset
            computer_hardware = fetch_ucirepo(id=29)

            # data (as pandas dataframes)
            dataFeatures = pd.DataFrame(computer_hardware.data.features)
            dataTargets = pd.DataFrame(dataFeatures['PRP'])

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # drop the columns that we do not need
        dataFeatures = dataFeatures.drop(columns=['VendorName', 'ModelName', 'PRP', 'ERP'])

        # assign types to all columns, normalizing these
        dataFeatures.loc[:, 'MYCT'] = dataFeatures['MYCT'].astype(float)
        dataFeatures.loc[:, 'MMIN'] = dataFeatures['MMIN'].astype(float)
        dataFeatures.loc[:, 'MMAX'] = dataFeatures['MMAX'].astype(float)
        dataFeatures.loc[:, 'CACH'] = dataFeatures['CACH'].astype(float)
        dataFeatures.loc[:, 'CHMIN'] = dataFeatures['CHMIN'].astype(float)
        dataFeatures.loc[:, 'CHMAX'] = dataFeatures['CHMAX'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']

    elif dataName == 'ForestFires':
        # check if file path exists, read from file if so, otherwise grab from online repo
        if os.path.exists(featurePath) & os.path.exists(targetPath):
            print(dataName + "data exists, reading from csv")

            # read from local directory
            dataFeatures = pd.read_csv(featurePath, index_col=0)
            dataTargets = pd.read_csv(targetPath, index_col=0)

        else:
            print(dataName + "data does not exist, reading from source")

            # fetch dataset
            forest_fires = fetch_ucirepo(id=162)

            # data (as pandas dataframes)
            dataFeatures = pd.DataFrame(forest_fires.data.features)
            dataTargets = pd.DataFrame(forest_fires.data.targets)

            # write to the local directory
            dataFeatures.to_csv(featurePath, index=True)
            dataTargets.to_csv(targetPath, index=True)

        # assign types to all columns
        # setting these to integers as they are ordinal values
        dataFeatures.loc[:, 'X'] = dataFeatures['X'].astype(int)
        dataFeatures.loc[:, 'Y'] = dataFeatures['Y'].astype(int)

        # adjust these date values to relative integer values
        month_dict = {
            'jan': 1,
            'feb': 2,
            'mar': 3,
            'apr': 4,
            'may': 5,
            'jun': 6,
            'jul': 7,
            'aug': 8,
            'sep': 9,
            'oct': 10,
            'nov': 11,
            'dec': 12
        }
        # Replace month names with integer values
        dataFeatures.loc[:, 'month'] = dataFeatures['month'].replace(month_dict)
        dataFeatures.loc[:, 'month'] = dataFeatures['X'].astype(int)

        day_dict = {
            'mon': 1,
            'tue': 2,
            'wed': 3,
            'thu': 4,
            'fri': 5,
            'sat': 6,
            'sun': 7
        }
        # Replace month names with integer values
        dataFeatures.loc[:, 'day'] = dataFeatures['day'].replace(day_dict)
        dataFeatures.loc[:, 'day'] = dataFeatures['day'].astype(int)

        # normalize these
        dataFeatures.loc[:, 'FFMC'] = dataFeatures['FFMC'].astype(float)
        dataFeatures.loc[:, 'DMC'] = dataFeatures['DMC'].astype(float)
        dataFeatures.loc[:, 'DC'] = dataFeatures['DC'].astype(float)
        dataFeatures.loc[:, 'ISI'] = dataFeatures['ISI'].astype(float)
        dataFeatures.loc[:, 'temp'] = dataFeatures['temp'].astype(float)
        dataFeatures.loc[:, 'RH'] = dataFeatures['RH'].astype(float)
        dataFeatures.loc[:, 'wind'] = dataFeatures['wind'].astype(float)
        dataFeatures.loc[:, 'rain'] = dataFeatures['rain'].astype(float)

        # set the name of the target column
        dataTargets = dataTargets.loc[dataFeatures.index.tolist()]
        dataTargets.columns = ['Class']
        dataTargets.loc[:, 'Class'] = np.log1p(dataTargets['Class'])

    return dataFeatures, dataTargets



In [91]:
currentDataSet = 'CarEval'
features, targets = dataSourcing(currentDataSet)

CarEval data exists, reading from csv


In [90]:
# check summary of the targets
targetsSummary = targets.value_counts()
display(targetsSummary)

tuningFilePath = currentDataSet + '/KEEPTestCases/ParameterTuningFile.csv'
crossValFilePath = currentDataSet + '/KEEPTestCases/CrossValidationTestFile.csv'

parameterTuningOutput = pd.read_csv(tuningFilePath, index_col=0)
parameterTuningOutput = parameterTuningOutput.sort_values(by = ['AveragePerformance'], ascending = False)

crossValidationOutput = pd.read_csv(crossValFilePath, index_col=0)

display(parameterTuningOutput)
display(crossValidationOutput)

print(crossValidationOutput['correctAssignment'].mean())

performanceByClassSuccess = pd.DataFrame(crossValidationOutput.groupby('actualValue')['correctAssignment'].mean())
performanceByClassSuccess.columns = ['% correct']
performanceByClassSuccess.loc[:,'% correct'] = performanceByClassSuccess['% correct'] * 100
performanceByClassCount = pd.DataFrame(crossValidationOutput.groupby('actualValue')['correctAssignment'].count())
performanceByClassCount.columns = ['totalCases']
performanceByClassSuccess = performanceByClassSuccess.join(performanceByClassCount)
display(performanceByClassSuccess)

print(targetsSummary.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

print(parameterTuningOutput.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))

print(performanceByClassSuccess.to_latex(index=True,
                  formatters={"name": str.upper},
                  float_format="{:.3f}".format,))



Class
2        444
4        239
dtype: int64

Unnamed: 0,p,k,e,s,AveragePerformance,TestsRun
7,2,4,1,1,0.959124,10
3,1,4,1,1,0.954745,10
1,1,2,1,1,0.942336,10
4,2,1,1,1,0.938686,10
2,1,3,1,1,0.90292,10


Unnamed: 0,testID,nearestNeighbors,expectedValue,actualValue,correctAssignment
684,1,"[314, 637, 19, 309]",2,2,True
396,1,"[314, 19, 309, 637]",2,2,True
35,1,"[314, 19, 637, 309]",2,2,True
423,1,"[19, 637, 116, 489]",2,2,True
324,1,"[314, 309, 637, 19]",2,2,True
...,...,...,...,...,...
658,5,"[691, 435, 522, 316]",4,4,True
668,5,"[296, 60, 530, 67]",4,4,True
681,5,"[691, 361, 73, 255]",4,4,True
696,5,"[691, 56, 259, 60]",4,4,True


0.9597069597069597


Unnamed: 0_level_0,% correct,totalCases
actualValue,Unnamed: 1_level_1,Unnamed: 2_level_1
2,96.957746,1775
4,94.136126,955


\begin{tabular}{lr}
\toprule
  &    0 \\
Class &      \\
\midrule
2 &  444 \\
4 &  239 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrrrr}
\toprule
{} &  p &  k &  e &  s &  AveragePerformance &  TestsRun \\
\midrule
7 &  2 &  4 &  1 &  1 &               0.959 &        10 \\
3 &  1 &  4 &  1 &  1 &               0.955 &        10 \\
1 &  1 &  2 &  1 &  1 &               0.942 &        10 \\
4 &  2 &  1 &  1 &  1 &               0.939 &        10 \\
2 &  1 &  3 &  1 &  1 &               0.903 &        10 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
{} &  \% correct &  totalCases \\
actualValue &            &             \\
\midrule
2           &     96.958 &        1775 \\
4           &     94.136 &         955 \\
\bottomrule
\end{tabular}



  print(targetsSummary.to_latex(index=True,
  print(parameterTuningOutput.to_latex(index=True,
  print(performanceByClassSuccess.to_latex(index=True,


In [50]:
# check summary of the targets
featuresSummary = features.describe(percentiles=[.05, .1, .15, .2, .25,
                                                 .30, .35, .40, .45, .5,
                                                 .55, .60, .65, .70, .75,
                                                 .8, .85, .9, .95, 1])
targetsSummary = targets.describe(percentiles=[.05, .1, .15, .2, .25,
                                                 .30, .35, .40, .45, .5,
                                                 .55, .60, .65, .70, .75,
                                                 .8, .85, .9, .95, 1])

#print(featuresSummary)
display(targetsSummary)

Unnamed: 0,Class
count,517.0
mean,1.111026
std,1.398436
min,0.0
5%,0.0
10%,0.0
15%,0.0
20%,0.0
25%,0.0
30%,0.0


In [92]:
crossValFilePath = currentDataSet + '/KEEPTestCases/CrossValidationTestFile.csv'

parameterTuningOutput = pd.read_csv(crossValFilePath, index_col=0)

parameterTuningOutput2 = parameterTuningOutput[parameterTuningOutput['testID'].isin([2, 4, 6, 8, 10])]

parameterTuningOutput2.loc[:,'testID'] = (parameterTuningOutput2['testID']/2).astype(int)

parameterTuningOutput2.to_csv(crossValFilePath, index=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parameterTuningOutput2.loc[:,'testID'] = (parameterTuningOutput2['testID']/2).astype(int)
