#Import Libraries For Helper Functions

In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
import math

In [None]:
def distEuclidean(firstVec, secondVec, columns):
    """Return the Euclidean distance between two vectors over chosen positions.

    Args:
        firstVec: Indexable collection of numbers.
        secondVec: Indexable collection of numbers, indexed by the same keys.
        columns: Iterable of the indices/keys that take part in the distance;
            every other position of the vectors is ignored.

    Returns:
        The square root of the summed squared differences, as computed by
        ``np.sqrt`` (0.0 when ``columns`` is empty).
    """
    # Squared gap for each selected position, in the order given by `columns`.
    squaredGaps = [(firstVec[col] - secondVec[col]) ** 2 for col in columns]
    return np.sqrt(sum(squaredGaps))

In [None]:
def ZScoreNormalization(features):
    """Standardise ``features`` to zero mean and unit standard deviation.

    ``np.mean`` and ``np.std`` are called without an ``axis`` argument, so for
    a NumPy array a single mean and a single standard deviation are computed
    over every entry and applied to the whole input.

    Args:
        features: Numeric data to normalise (the caller in this file passes
            the array returned by ``np.loadtxt``).

    Returns:
        ``(features - mean) / stdDev``. When the standard deviation is zero
        (every value identical) the centred data is returned unscaled, i.e.
        zeros, rather than the NaNs a 0/0 division would produce.
    """
    mean = np.mean(features)
    stdDev = np.std(features)
    # Zero spread would make every entry 0/0 = NaN; dividing by 1 instead
    # keeps the (all-zero) centred values finite.
    safeStd = np.where(stdDev == 0, 1.0, stdDev)
    return (features - mean) / safeStd

In [None]:
def _leaveOneOutAccuracy(dataset, featureColumns):
    """Score a feature subset with leave-one-out nearest-neighbour classification.

    Each row is classified with the label (column 0) of its closest *other*
    row, where closeness is ``distEuclidean`` restricted to ``featureColumns``.

    Args:
        dataset: Sequence of rows; column 0 holds the class label.
        featureColumns: Column indices used to measure distance.

    Returns:
        Fraction of rows whose nearest neighbour carries the same label.
    """
    correct = 0
    for targetIdx, target in enumerate(dataset):
        nearestDist = math.inf
        nearestLabel = None  # reset per row so a stale label is never reused
        for candidateIdx, candidate in enumerate(dataset):
            # Leave out the row itself by position. Comparing by value would
            # also discard genuine duplicate samples elsewhere in the data.
            if candidateIdx == targetIdx:
                continue
            dist = distEuclidean(target, candidate, featureColumns)
            if dist < nearestDist:  # strict '<' keeps the first of tied rows
                nearestDist = dist
                nearestLabel = candidate[0]
        # A row with no usable neighbour counts as a misclassification.
        if nearestLabel is not None and nearestLabel == target[0]:
            correct += 1
    return correct / len(dataset)


def forwardSelection(data):
    """Greedy forward feature selection scored by leave-one-out 1-NN accuracy.

    The data is first passed through ``ZScoreNormalization``. Starting from an
    empty feature set, each level of the search tries every feature not yet
    chosen, keeps the one giving the highest accuracy, and remembers the best
    subset seen across all levels. Progress is printed as the search runs.

    Args:
        data: 2-D array-like whose column 0 is the class label and whose
            remaining columns (1..k) are the features.

    Returns:
        A ``(bestFeatureSet, bestSoFar)`` tuple: the set of column indices of
        the best subset found and its accuracy as a fraction in [0, 1].
    """
    dataset = ZScoreNormalization(data)
    columnCount = len(dataset[0])  # label column + feature columns
    bestSoFar = 0.0
    bestFeatureSet = set()
    currFeatureSet = set()

    # One search-tree level per feature; every level adds exactly one feature.
    for i in range(1, columnCount):
        print(f"Best Accuracy and Features until now: {bestSoFar*100:.2f}% {bestFeatureSet} ") # Checkpointing
        print(f"Currently evaluating level {i} of the search tree")
        # Start below any reachable accuracy so a feature is still picked when
        # every candidate scores 0%, instead of adding None to the set.
        currentBest = -1.0
        featureToAdd = None

        for feature in range(1, columnCount):
            if feature in currFeatureSet:
                continue
            print("Consider Adding feature:", feature)
            tempFeatures = list(currFeatureSet)  # trial copy of the current set
            tempFeatures.append(feature)
            accuracy = _leaveOneOutAccuracy(dataset, tempFeatures)
            if accuracy > currentBest:
                currentBest = accuracy
                featureToAdd = feature

        if featureToAdd is None:
            # No candidate was left to evaluate; nothing more to add.
            break

        currFeatureSet.add(featureToAdd)
        print(f"At iteration {i} we added feature {featureToAdd} to current set")
        print(f"Using feature(s) {currFeatureSet} the accuracy is {currentBest*100:.2f}%")
        if currentBest > bestSoFar:
            print(f"From {bestSoFar*100:.2f}%")
            bestSoFar = currentBest
            print(f"To {bestSoFar*100:.2f}%")
            print("******************************")
            bestFeatureSet = set(currFeatureSet)  # snapshot, not an alias

    print(f"Finished Search!! The complete best feature subset is {bestFeatureSet} which has an accuracy of {bestSoFar*100:.2f}%")
    return bestFeatureSet, bestSoFar

In [None]:
# Interactive driver: ask for a data file and an algorithm, load the data,
# report its size and run the selected search.
print("""Feature Selection\n""")

fileChoice = int(input("Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt\n"))
algo = int(input("""\nAlgorithm \n
1. Forward Selection \n
2. Backward Elimination \n""" ))

# Any choice other than 2 or 3 falls back to the small data set.
dataFiles = {
    2: "CS170_large_Data__1.txt",
    3: "CS170_XXXlarge_Data__11.txt",
}
file = dataFiles.get(fileChoice, "CS170_small_Data__20.txt")

try:
    # ndmin=2 keeps a single-row file two-dimensional so the shape lookups
    # below stay valid.
    loadDataset = np.loadtxt(file, ndmin=2)
except (OSError, ValueError) as loadError:
    # Missing/unreadable file or malformed numbers: report and stop here
    # instead of continuing with an undefined dataset.
    print("Error Loading File. Re-run and try again.")
    print(loadError)
else:
    # printing dataset details (column 0 is the class label, not a feature)
    print(f"\nData contains {loadDataset.shape[1] - 1} features with {loadDataset.shape[0]} samples ")
    print("Beginning Search")
    if algo == 1:
        forwardSelection(loadDataset)
    else:
        # NOTE(review): no other algorithm is wired up in this cell.
        print("Only Forward Selection (option 1) is available in this cell.")

Feature Selection

Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt
1

Algorithm 
 
1. Forward Selection 

2. Backward Elimination 
1

Data contains 10 features with 1000 samples 
Beginning Search
Best Accuracy and Features until now: 0.00% set() 
Currently evaluating level 1 of the search tree
Consider Adding feature: 1
Consider Adding feature: 2
Consider Adding feature: 3
Consider Adding feature: 4
Consider Adding feature: 5
Consider Adding feature: 6
Consider Adding feature: 7
Consider Adding feature: 8
Consider Adding feature: 9
Consider Adding feature: 10
At iteration 1 we added feature 5 to current set
Using feature(s) {5} the accuracy is 85.70%
From 0.00%
To 85.70%
******************************
Best Accuracy and Features until now: 85.70% {5} 
Currently evaluating level 2 of the search tree
Consider Adding feature: 1
Consider Adding feature: 2
Consider Adding feature: 3
Consider Adding feature: 4
Consider Adding featu

In [None]:
# Interactive driver: ask for a data file and an algorithm, load the data,
# report its size and run the selected search.
print("""Feature Selection\n""")

fileChoice = int(input("Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt\n"))
algo = int(input("""\nAlgorithm \n
1. Forward Selection \n
2. Backward Elimination \n""" ))

# Any choice other than 2 or 3 falls back to the small data set.
dataFiles = {
    2: "CS170_large_Data__1.txt",
    3: "CS170_XXXlarge_Data__11.txt",
}
file = dataFiles.get(fileChoice, "CS170_small_Data__20.txt")

try:
    # ndmin=2 keeps a single-row file two-dimensional so the shape lookups
    # below stay valid.
    loadDataset = np.loadtxt(file, ndmin=2)
except (OSError, ValueError) as loadError:
    # Missing/unreadable file or malformed numbers: report and stop here
    # instead of continuing with an undefined dataset.
    print("Error Loading File. Re-run and try again.")
    print(loadError)
else:
    # printing dataset details (column 0 is the class label, not a feature)
    print(f"\nData contains {loadDataset.shape[1] - 1} features with {loadDataset.shape[0]} samples ")
    print("Beginning Search")
    if algo == 1:
        forwardSelection(loadDataset)
    else:
        # NOTE(review): no other algorithm is wired up in this cell.
        print("Only Forward Selection (option 1) is available in this cell.")

Feature Selection

Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt
2

Algorithm 
 
1. Forward Selection 

2. Backward Elimination 
1

Data contains 20 features with 2000 samples 
Beginning Search
Best Accuracy and Features until now: 0.00% set() 
Currently evaluating level 1 of the search tree
Consider Adding feature: 1
Consider Adding feature: 2
Consider Adding feature: 3
Consider Adding feature: 4
Consider Adding feature: 5
Consider Adding feature: 6
Consider Adding feature: 7
Consider Adding feature: 8
Consider Adding feature: 9
Consider Adding feature: 10
Consider Adding feature: 11
Consider Adding feature: 12
Consider Adding feature: 13
Consider Adding feature: 14
Consider Adding feature: 15
Consider Adding feature: 16
Consider Adding feature: 17
Consider Adding feature: 18
Consider Adding feature: 19
Consider Adding feature: 20
At iteration 1 we added feature 11 to current set
Using feature(s) {11} the accuracy is 84.9

In [None]:
# Interactive driver: ask for a data file and an algorithm, load the data,
# report its size and run the selected search.
print("""Feature Selection\n""")

fileChoice = int(input("Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt\n"))
algo = int(input("""\nAlgorithm \n
1. Forward Selection \n
2. Backward Elimination \n""" ))

# Any choice other than 2 or 3 falls back to the small data set.
dataFiles = {
    2: "CS170_large_Data__1.txt",
    3: "CS170_XXXlarge_Data__11.txt",
}
file = dataFiles.get(fileChoice, "CS170_small_Data__20.txt")

try:
    # ndmin=2 keeps a single-row file two-dimensional so the shape lookups
    # below stay valid.
    loadDataset = np.loadtxt(file, ndmin=2)
except (OSError, ValueError) as loadError:
    # Missing/unreadable file or malformed numbers: report and stop here
    # instead of continuing with an undefined dataset.
    print("Error Loading File. Re-run and try again.")
    print(loadError)
else:
    # printing dataset details (column 0 is the class label, not a feature)
    print(f"\nData contains {loadDataset.shape[1] - 1} features with {loadDataset.shape[0]} samples ")
    print("Beginning Search")
    if algo == 1:
        forwardSelection(loadDataset)
    else:
        # NOTE(review): no other algorithm is wired up in this cell.
        print("Only Forward Selection (option 1) is available in this cell.")

Feature Selection


Data contains 80 features with 4000 samples 
Beginning Search
Best Accuracy and Features until now: 0.00% set() 
Currently evaluating level 1 of the search tree
Consider Adding feature: 1
Consider Adding feature: 2
Consider Adding feature: 3
Consider Adding feature: 4
Consider Adding feature: 5
Consider Adding feature: 6
Consider Adding feature: 7
Consider Adding feature: 8
Consider Adding feature: 9
Consider Adding feature: 10
Consider Adding feature: 11
Consider Adding feature: 12
Consider Adding feature: 13
Consider Adding feature: 14
Consider Adding feature: 15
Consider Adding feature: 16
Consider Adding feature: 17
Consider Adding feature: 18
Consider Adding feature: 19
Consider Adding feature: 20
Consider Adding feature: 21
Consider Adding feature: 22
Consider Adding feature: 23
Consider Adding feature: 24
Consider Adding feature: 25
Consider Adding feature: 26
Consider Adding feature: 27
Consider Adding feature: 28
Consider Adding feature: 29
Consider Adding 