#Import Libraries For Helper Functions

In [13]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
import math

In [14]:
def distEuclidean(firstVec, secondVec, columns):
    distance = 0
    for i in columns:
        distance += (firstVec[i] - secondVec[i]) ** 2
    return np.sqrt(distance)

In [15]:
def ZScoreNormalization(features):
  mean = np.mean(features)
  stdDev = np.std(features)
  data = (features - mean) / stdDev
  return data

In [16]:
def forwardSelection(data):
    #print(len(data), len(data[0]))
    dataset = ZScoreNormalization(data) #ZScore Normalization, Mean = 0, Standard Deviation = 1
    #print(dataset)
    bestSoFar = 0.0
    bestFeatureSet = set()
    currFeatureSet = set()
    #loops once outer for each feature
    for i in range(1, len(dataset[0])):
        print(f"Best Accuracy and Features until now: {bestSoFar*100:.2f}% {bestFeatureSet} ") # Checkpointing
        print(f"Currently evaluating level {i} of the search tree")
        currentBest = 0
        featureToAdd = None
        #loops same number of times for during each outer loop
        for feature in range(1, len(dataset[0])):
            if feature not in currFeatureSet:
                print("Consider Adding feature:", feature)
                tempFeatures = list(currFeatureSet) #temporary copy
                tempFeatures.append(feature) #adding newest entry and testing how that affects accuracy in the search
                correctClassification = 0 #successful classifications
                minDist = math.inf # To find minimum distance
                y_pred = 0
                #applying the Euclidean distance function to each row's comparison as a vector
                for row1 in dataset:
                    minDist = math.inf
                    for row2 in dataset:
                        duplicate = (row1 == row2).all()
                        if not duplicate:
                            value = distEuclidean(row1 , row2, tempFeatures)
                            if value < minDist:
                                minDist = value
                                prediction_class = row2[0]
                    if prediction_class == row1[0]:
                        correctClassification += 1
                accuracy = correctClassification / (len(dataset) )  #measuring accuracy

                # determining if recently-stored correctness needs to be updated so that it reflects both globally and in the current working copy.
                if accuracy > currentBest:
                    currentBest = accuracy
                    featureToAdd = feature
        currFeatureSet.add(featureToAdd)
        print(f"At iteration {i} we added feature {featureToAdd} to current set")
        print(f"Using feature(s) {currFeatureSet} the accuracy is {currentBest*100:.2f}%")
        if featureToAdd and currentBest > bestSoFar:  #print operatings to display progress and results
              print(f"From {bestSoFar*100:.2f}%")
              bestSoFar = currentBest
              print(f"To {bestSoFar*100:.2f}%")
              print("******************************")
              bestFeatureSet = set(currFeatureSet)

    print(f"Finished Search!! The complete best feature subset is {bestFeatureSet} which has an accuracy of {bestSoFar*100:.2f}%")

In [35]:
def backward_elimination(data):
  dataset = ZScoreNormalization(data) #ZScore Normalization, Mean = 0, Standard Deviation = 1
  best_accuracy_so_far = 0.0
  best_feature_set = set()
  current_feature_set = set([features for features in range(1, len(dataset[0]))])
  #loops once outer for each feature
  for i in range(1, len(dataset[0])):
    worst_set = 0 #keep a track of the value to be removed
    current_best_accuracy = 0
    print(f"Best Accuracy and Features until now: {best_accuracy_so_far*100:.2f}% {best_feature_set} ") # Checkpointing
    for j in range(1, len(dataset[0])):
      if j in current_feature_set:
        print(f"Feature {j} is considered for elimination")
        current_features_copy = set(current_feature_set) ##copy of current list of features for testing
        current_features_copy.remove(j)  #Adding the most recent entry and evaluating its impact on search accuracy
        classifications = 0 #keep track of successful classifications
        #shortest_distance = math.inf
        pred_class = 0
        # using the euclidean distance function to compare each feature across rows as a vector
        for x in dataset:
          shortest_distance = math.inf
          for y in dataset:
            identical = (x == y).all()
            if not identical: # to not check distance to same sample
              val = distEuclidean(x , y , current_features_copy)
              if val < shortest_distance:
                shortest_distance = val
                pred_class = y[0]
          if pred_class == x[0]:
            classifications += 1
        accuracy = classifications / (len(dataset) )  #to measure accuracy
        #to check for global and current updation
        if accuracy > current_best_accuracy:
          current_best_accuracy = accuracy
          worst_set = j
    if worst_set in current_feature_set:
        current_feature_set.remove(worst_set)
                #print operatings to display progress and results
        print(f"Feature {worst_set} was removed from the current set of features at level {i}")
        print(f"With the features {current_feature_set} ,the accuracy is {current_best_accuracy * 100:.2f}% ")
    if current_best_accuracy  >= best_accuracy_so_far:
        best_accuracy_so_far = current_best_accuracy
        best_feature_set = set(current_feature_set)
  print("Completed !")
  print(f"The best feature set is : {best_feature_set}, with an accuracy of {best_accuracy_so_far*100:.2f}%" )


In [36]:
print("""Feature Selection\n""")

fileChoice = int(input("Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt\n"))
algo = int(input("""\nAlgorithm \n
1. Forward Selection \n
2. Backward Elimination \n""" ))

file = None
if fileChoice == 2:
  file = "CS170_large_Data__1.txt"
elif fileChoice == 3:
  file = "CS170_XXXlarge_Data__11.txt"
else:
  file = "CS170_small_Data__20.txt"

try:
    with open(file) as d:
        data = pd.read_csv(file, delim_whitespace=True)
        dataLen = data.shape[0]
        loadDataset = np.loadtxt(file)
except:
    print("Error Loading File. Re-run and try again.")

# printing dataset details
print(f"\nData contains {len(loadDataset[0]) - 1} features with {dataLen + 1} samples ")
print("Beginning Search")
if algo == 1:
    forwardSelection(loadDataset)
elif algo == 2:
  backward_elimination(loadDataset)

Feature Selection

Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt
1

Algorithm 

1. Forward Selection 

2. Backward Elimination 
2

Data contains 10 features with 1000 samples 
Beginning Search
Best Accuracy and Features until now: 0.00% set() 
Feature 1 is considered for elimination
Feature 2 is considered for elimination
Feature 3 is considered for elimination
Feature 4 is considered for elimination
Feature 5 is considered for elimination
Feature 6 is considered for elimination
Feature 7 is considered for elimination
Feature 8 is considered for elimination
Feature 9 is considered for elimination
Feature 10 is considered for elimination
Feature 10 was removed from the current set of features at level 1
With the features {1, 2, 3, 4, 5, 6, 7, 8, 9} ,the accuracy is 81.40% 
Best Accuracy and Features until now: 81.40% {1, 2, 3, 4, 5, 6, 7, 8, 9} 
Feature 1 is considered for elimination
Feature 2 is considered for elimination

In [38]:
print("""Feature Selection\n""")

fileChoice = int(input("Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt\n"))
algo = int(input("""\nAlgorithm \n
1. Forward Selection \n
2. Backward Elimination \n""" ))

file = None
if fileChoice == 2:
  file = "CS170_large_Data__1.txt"
elif fileChoice == 3:
  file = "CS170_XXXlarge_Data__11.txt"
else:
  file = "CS170_small_Data__20.txt"

try:
    with open(file) as d:
        data = pd.read_csv(file, delim_whitespace=True)
        dataLen = data.shape[0]
        loadDataset = np.loadtxt(file)
except:
    print("Error Loading File. Re-run and try again.")

# printing dataset details
print(f"\nData contains {len(loadDataset[0]) - 1} features with {dataLen + 1} samples ")
print("Beginning Search")
if algo == 1:
    forwardSelection(loadDataset)
elif algo == 2:
  backward_elimination(loadDataset)

Feature Selection

Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt
2

Algorithm 

1. Forward Selection 

2. Backward Elimination 
2

Data contains 20 features with 2000 samples 
Beginning Search
Best Accuracy and Features until now: 0.00% set() 
Feature 1 is considered for elimination
Feature 2 is considered for elimination
Feature 3 is considered for elimination
Feature 4 is considered for elimination
Feature 5 is considered for elimination
Feature 6 is considered for elimination
Feature 7 is considered for elimination
Feature 8 is considered for elimination
Feature 9 is considered for elimination
Feature 10 is considered for elimination
Feature 11 is considered for elimination
Feature 12 is considered for elimination
Feature 13 is considered for elimination
Feature 14 is considered for elimination
Feature 15 is considered for elimination
Feature 16 is considered for elimination
Feature 17 is considered for elimination
Feat

In [None]:
print("""Feature Selection\n""")

fileChoice = int(input("Data File to be used: 1: CS170_small_Data__20.txt, 2: CS170_large_Data__1.txt, 3: CS170_XXXLarge_Data__11.txt\n"))
algo = int(input("""\nAlgorithm \n
1. Forward Selection \n
2. Backward Elimination \n""" ))

file = None
if fileChoice == 2:
  file = "CS170_large_Data__1.txt"
elif fileChoice == 3:
  file = "CS170_XXXlarge_Data__11.txt"
else:
  file = "CS170_small_Data__20.txt"

try:
    with open(file) as d:
        data = pd.read_csv(file, delim_whitespace=True)
        dataLen = data.shape[0]
        loadDataset = np.loadtxt(file)
except:
    print("Error Loading File. Re-run and try again.")

# printing dataset details
print(f"\nData contains {len(loadDataset[0]) - 1} features with {dataLen + 1} samples ")
print("Beginning Search")
if algo == 1:
    forwardSelection(loadDataset)
elif algo == 2:
  backward_elimination(loadDataset)