In [None]:
#this first block loads several modules that we may or may not use
#it serves as a check that you have them installed

from math import floor, sqrt, pi
from random import sample

import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plot
plot.rcParams["figure.figsize"] = (20, 10) # (w, h)
import torch
import torch.nn as nn

#scikit image libs
from skimage import io, exposure
from skimage.util import img_as_float
from skimage.color import rgb2gray
from skimage.filters import frangi, meijering, prewitt, gabor
from skimage.feature import hog

#pathing libraries
import os
import glob

#loading images from Google Drive
from google.colab import drive

print("Modules loaded")

In [None]:
#prepares n subplot boxes
def npaxes(n):
    """Create a figure with ``n`` subplots and return their axes as a flat list.

    The grid uses the largest row count ``r <= sqrt(n)`` that divides ``n``
    evenly, so no cell of the grid is left empty (a prime ``n`` therefore
    produces a single row). Axes are returned in row-major order.

    Args:
        n: Number of subplot boxes to create; must be a positive integer.

    Returns:
        list: The ``n`` axes, ordered left-to-right, then top-to-bottom.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("npaxes needs at least one subplot, got %r" % (n,))

    # largest divisor of n that does not exceed sqrt(n)
    r = floor(sqrt(n))
    while n % r != 0:
        r -= 1
    c = n // r

    # squeeze=False keeps `ax` two-dimensional for every grid shape. Without
    # it, subplots() returns a bare Axes for n == 1 (which cannot be indexed)
    # and a 1-D array for single-row / single-column grids.
    _, ax = plot.subplots(r, c, squeeze=False)

    return [ax[i // c, i % c] for i in range(n)]

In [None]:
#Access to Google Drive content:
#mount the Drive under /content/drive so the paths used by the following cells resolve
drive.mount('/content/drive')

In [None]:
#basic directory information

#root folder of the assignment on the mounted Drive
basePath = '/content/drive/My Drive/CSCE633HW5'

#sanity check: show what the assignment folder contains
print(os.listdir(basePath))

train_dir = basePath + '/train_PREPROCESSED/' #folder of preprocessed training images (read by the cells below)
test_dir = basePath + '/test_PREPROCESSED/' #folder of preprocessed test images

In [None]:
#file listing and  loading

def test_extension(f):
  extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
  ftitle, fext = os.path.splitext(os.path.basename(f))
  for ext in extensions:
    if(fext == ext):
      return True
  return False

#gets files in the given directory that match our extensions (images)
def get_files(dir):
  """List the entries of `dir` that test_extension accepts as images.

  Only the bare names are returned (they are not joined with `dir`),
  in the order os.listdir yields them.
  """
  image_names = []
  for entry in os.listdir(dir):
    if test_extension(entry):
      image_names.append(entry)
  return image_names

#opens the image file as an skimage
def get_image(dir, f):
  """Read the image file `f` from directory `dir` and convert it with img_as_float."""
  path = os.path.join(dir, f)
  pixels = io.imread(path)
  return img_as_float(pixels)

def normalize(img):
  """Standardize an image to zero mean and unit standard deviation.

  Args:
    img: Numeric array of pixel values.

  Returns:
    (img - mean) / std. For a constant image (std == 0) the centered image,
    i.e. all zeros, is returned instead of dividing by zero, which would
    fill the result with NaNs.
  """
  centered = img - img.mean()
  spread = img.std()
  if spread == 0:
    return centered
  return centered / spread

In [None]:
#load the first listed training image as a sample to inspect
img = get_image(train_dir, get_files(train_dir)[0])

def get_random_img(dir):
  """Load one image drawn at random from the image files found in `dir`."""
  (chosen,) = sample(get_files(dir), 1)
  return get_image(dir, chosen)

In [None]:
#dimensions of the sample image held in `img`
img.shape

Feature Extraction and Visualization begins below.

Feature 1: Image mean and standard deviation, no visualization (just print the tuple)

Feature 2: gabor filters (horizontal and vertical) and prewitt edge detector, visualization through filtered image plot + histogram of filtered image values (which may also be useful for processing)

Feature 3: Histogram of Oriented Gradients (HOG), visualized through the scikit visualization support; the actual data comes out as a giant feature vector, ready for further processing

In [None]:
#feature 1: image mean and stdev (come shipped with scikit, yay!)
#while we often want to normalize images,
#it may be useful in this context to keep the mean/stdev
#(apparently the lightness/darkness of the lungs in x-rays can indicate their health)
#we can then normalize the images after extracting these features

def feature1(img):
  """Return the pair (mean, standard deviation) of the pixel values of `img`."""
  mu = img.mean()
  sigma = img.std()
  return mu, sigma

def feature1_display(img):
  """Print the (mean, std) tuple that feature1 computes for `img`."""
  stats = feature1(img)
  print(stats)

In [None]:
#print feature 1 (mean, std) for a random training image
feature1_display(get_random_img(train_dir))

In [None]:
#feature 2: a couple scikit-supported filters

def feature2(img):
  """Run a grayscale copy of `img` through two Gabor filters and a Prewitt edge filter.

  Returns:
    A 3-tuple of filtered images, in this order:
      real Gabor response at theta = 0,
      real Gabor response at theta = pi / 2,
      Prewitt edge response.
  """
  gray = rgb2gray(img)

  #gabor filters: basically edge (or "orientation") detectors
  #scikit returns "real and imaginary responses"; only the real one is kept
  #NOTE(review): frequency=1 is one full cycle per pixel - confirm this is intended;
  #a lower frequency may be needed for the filter to respond to orientation
  real_theta_0, _ = gabor(gray, 1, theta = 0)
  real_theta_90, _ = gabor(gray, 1, theta = pi / 2)

  #edge filter
  edge_map = prewitt(gray)

  #frangi(img, black_ridges = False) and meijering(img, black_ridges = False)
  #(filters for branching shapes / "neuriteness") were also tried but neither
  #worked well, so they are not part of the result
  return (real_theta_0, real_theta_90, edge_map)

#the way we'd use these filters, other than plugging them into a NN,
#is probably to compute their means or counts - this will give us a general notion
#of the "amount" of the input image that matches that filter
#or, similar to the HOG below, we can create histograms of the values in the image after application of the filters

#returns (hist, bin_edges)
#see https://numpy.org/doc/stable/reference/generated/numpy.histogram.html
def feature2_histogram(feat):
  """Histogram the values of `feat` using NumPy's 'sqrt' bin-count rule.

  Returns:
    The (hist, bin_edges) tuple produced by np.histogram.
  """
  counts, edges = np.histogram(feat, bins='sqrt')
  return counts, edges

  #this function displays the results of feature-extraction 2
#i.e.: a picture of the image along with the result after the image goes through each filter
#as well as histograms calculated over the values in each picture
def feature2_display(img):
  """Plot the grayscale image and each feature2 output, with a value histogram for each.

  The first half of the axes from npaxes holds the images and the second
  half holds the matching histograms.
  """
  panels = (rgb2gray(img),) + feature2(img)
  count = len(panels)

  axes = npaxes(2 * count)

  for image_ax, hist_ax, panel in zip(axes[:count], axes[count:], panels):
    image_ax.imshow(panel, cmap="gray")
    hist_ax.hist(panel.flatten(), bins="sqrt")

In [None]:
#display the feature-2 filter outputs and their histograms for a random training image
feature2_display(get_random_img(train_dir))

In [None]:
#feature 3: HOG (histogram of oriented gradients)
#this extracts a huge amount of information about the orientation of edges in the picture
#and returns a giant feature vector.
#it has several options, but the defaults should be good enough for us (we can test this).
#the two options we are most likely to modify are pixels_per_cell and cells_per_block, which control the way that HOG splits up the image.

#this returns a feature vector
#it is not suitable for visualization
def feature3(img):
  """Return the HOG feature vector of the multichannel image `img` (default HOG options)."""
  descriptor = hog(img, multichannel=True)
  return descriptor

#this plots the corresponding HOG image
def feature3_display(img):
  """Show the HOG visualization of `img`, rescaled from the input range (0, 10) for display."""
  #the feature vector itself is not needed here, only the rendered HOG image
  _, hog_picture = hog(img, multichannel = True, visualize=True)
  stretched = exposure.rescale_intensity(hog_picture, in_range=(0, 10))
  plot.imshow(stretched, cmap="gray")

In [None]:
#show the HOG visualization for a random training image
feature3_display(get_random_img(train_dir))

## (iii) 

1-Feature selection: 
Explore two different feature selection methods of your choice. One method should be part of the Filter category and the other should be part of the Wrapper category.

2- Classification: 
Using a simple classifier (e.g., SVM, logistic regression), plot the classification performance using a 5-fold cross-validation on the training data against the number of features for both feature selection methods. Compare and contrast between the two (e.g., in terms of performance and computation time).

In [None]:
#Load data
import pandas as pd


#the CSVs live in the assignment folder, so build their paths from basePath
#instead of repeating the Drive location (the two could silently drift apart)
train_data = pd.read_csv(os.path.join(basePath, 'train.csv'))
test_data = pd.read_csv(os.path.join(basePath, 'test.csv'))

In [None]:
#peek at the first rows to check the columns of the training table
train_data.head(3)

In [None]:
#get the images as one array (the labels are extracted separately below)

def get_imgs(dataframe,dir):
  """Load every image named in the 'filename' column of `dataframe` from `dir`.

  Images are read in row order, so entry i of the result belongs to row i
  of the dataframe. The result is a single float32 array of all images.
  """
  loaded = [
      get_image(dir, dataframe.iloc[row]['filename'])
      for row in range(len(dataframe))
  ]
  return np.array(loaded, dtype='float32')

#all training images, in the same order as the rows of train_data
train_imgs = get_imgs(train_data,train_dir)


In [None]:
#shape of the stacked training-image array
train_imgs.shape

In [None]:
#get the labels (outcome) as an array 
def get_labels(dataframe):
  """Return the 'covid(label)' column of `dataframe` as a NumPy array."""
  return dataframe['covid(label)'].to_numpy()
#training labels, one per row of train_data
y = get_labels(train_data)

In [None]:
#get the features for a given imgs 
def get_features(feature,imgs):
  """Apply the extractor `feature` to every image in `imgs` and stack the results.

  Returns:
    np.ndarray whose i-th entry is feature(imgs[i]).
  """
  return np.array([feature(imgs[idx]) for idx in range(len(imgs))])

In [None]:
#get feature 1 for all images: one (mean, std) pair per image
f1 = get_features(feature1,train_imgs)

#get feature 2 for all images: feature2 returns three filtered images per input
f2 = get_features(feature2,train_imgs)
#flatten each image's filter outputs into one row; the row length is inferred
#from the data rather than assuming 200x200 inputs
f2_flatten = f2.reshape((f2.shape[0], -1))

#get feature 3 for all images
f3 = get_features(feature3,train_imgs)

#combine the three features in one big feature matrix in which each column represents a potential feature
X = np.concatenate((f1,f2_flatten,f3),axis=1)

print(f"The size of feature 1 array is {f1.shape}")
print(f"The size of feature 2 array is {f2_flatten.shape}")
print(f"The size of feature 3 array is {f3.shape}")
print(f"the size of the feature matrix is {X.shape}")

Quick intermission for feature scoring

In [None]:
#only run this command if the next cell fails with "module not found" on skfeature
#!pip install git+https://github.com/jundongl/scikit-feature.git

In [None]:
#TODO: fisher score, gini index, conditional entropy...

#gotta love Python and the hard work everybody else put into this stuff
from skfeature.function.similarity_based.fisher_score import fisher_score, feature_ranking as rank_fisher
from skfeature.function.statistical_based.gini_index import gini_index, feature_ranking as rank_gini
from skfeature.utility.mutual_information import conditional_entropy

#fisher score
#NOTE(review): assumes fisher_score returns one score per column of X and that
#feature_ranking turns those scores into column indices ordered best-first -
#confirm against the installed skfeature version
fisher_scores = fisher_score(X, y) #score for each feature
fisher_rank = rank_fisher(fisher_scores) #feature indices sorted by fisher score
fisher_n = 10 #how many top-ranked features to print
print("Top", fisher_n, "features (fisher):", fisher_rank[:fisher_n])

#gini index (same score -> rank -> print steps as for the fisher score)
gini_indices = gini_index(X, y)
gini_rank = rank_gini(gini_indices)
gini_n = 10 #how many top-ranked features to print
print("Top", gini_n, "features (gini):", gini_rank[:gini_n])

#conditional entropy
#TODO: not implemented yet - for now this only prints the shapes of X and y
print(X.shape, y.shape)

Back to feature selection

In [None]:
#Filter Method
#use SelectKbest from sklearn that takes the matrix features, the labels
#and number of features to select (k) then performs statistical tests to
#select k number of features 
#the number of features targeted is 50

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#score every column of X against the labels with chi2 and keep the 50 best
#NOTE(review): sklearn's chi2 expects non-negative feature values - confirm every column of X satisfies this
skb = SelectKBest(chi2, k=50)
X_filter = skb.fit_transform(X, y)
print(skb.get_support(indices = True)) #this prints the indices (columns of X) of the selected features

Feature Selection method 2 : wrapper


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
#The wrapper method used is Recursive Feature Elimination (RFE):
#it works by recursively removing features and building a model on the remaining ones.
#Running it over all the features (162,000) would take a lot of time,
#so the filter method is applied first to cut the 162,000 features down to 1000,
#and RFE is then applied to those 1000 features to select 50 of them.


#Filter method
skb2 = SelectKBest(chi2, k=1000)
X_filter2 = skb2.fit_transform(X, y)

#Wrapper method

model = LogisticRegression(solver='lbfgs', max_iter=10000)
#n_features_to_select must be passed by keyword: recent scikit-learn releases
#no longer accept it as a positional argument
rfe = RFE(model, n_features_to_select=50)
X_wrapper2 = rfe.fit_transform(X_filter2, y)
#rfe's indices refer to columns of X_filter2; map them back through skb2
#so the printed indices refer to columns of X
print(skb2.get_support(indices = True)[rfe.get_support(indices = True)])

Apply logistic regression with 5-fold cross-validation on the training data, once using the features selected by the filter method and once using the features selected by the wrapper method.

In [None]:
def cross_val(X,y, n_folds=5, seed=None):
  """Score a logistic-regression classifier with shuffled k-fold cross-validation.

  The samples are shuffled once and dealt round-robin into `n_folds` folds.
  Each fold is held out in turn while a fresh model is trained on the others.
  The accuracy of each fold is printed (rounded to 3 decimals); afterwards the
  per-fold scores are drawn as one line plot and their mean is printed.

  Args:
    X: Feature matrix, one row per sample.
    y: Labels, one per row of X.
    n_folds: Number of folds (default 5).
    seed: Optional seed for the shuffle, for reproducible splits. When None
      (default) NumPy's global random state is used.

  Raises:
    ValueError: If `n_folds` is smaller than 2.
  """
  if n_folds < 2:
    raise ValueError("cross_val needs at least 2 folds, got %r" % (n_folds,))

  X = np.asarray(X)
  y = np.asarray(y)

  if seed is None:
    random_index = np.random.permutation(len(X))
  else:
    random_index = np.random.RandomState(seed).permutation(len(X))

  scores = []
  for i in range(n_folds):
    #every n_folds-th shuffled sample (starting at i) is held out for testing
    held_out = np.zeros(len(X), dtype=bool)
    held_out[i::n_folds] = True
    test_idx = random_index[held_out]
    train_idx = random_index[~held_out]

    #fit on the training folds, score on the held-out fold
    model = LogisticRegression(solver='saga', max_iter=10000)
    model.fit(X[train_idx], y[train_idx])
    score = model.score(X[test_idx], y[test_idx])
    scores.append(score)
    print((round(score,3)))

  #plot once, after all folds are scored (plotting inside the loop drew a
  #partial line per fold on top of each other)
  plot.plot(range(1, len(scores)+1), scores)
  plot.yticks([0, 0.5, 1])
  plot.xticks(list(range(n_folds + 1)))
  #the x axis is the fold number, not a feature subset
  plot.xlabel("Fold", fontsize = 25)
  plot.ylabel("Cross validation score ", fontsize =25)
  print("Mean score:", sum(scores) / len(scores))

In [None]:
#apply cross validation to the filter method features
#(X_filter holds the 50 columns chosen by SelectKBest)
cross_val(X_filter,y)


In [None]:
#apply the cross validation to the wrapper method features
#(X_wrapper2 holds the 50 columns chosen by RFE from the 1000 pre-filtered ones)
cross_val(X_wrapper2,y)

b.iv: Adaboost

Learn a simple model (in this case, logistic regression) multiple times, each time weighting the samples based on how badly the *previous* models did on them; each new model is weighted by how well it does

In [None]:
#X and y should already be defined from feature selection above

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, make_scorer

def adaboost(T, cv_count = 5):
  """Cross-validate AdaBoost over logistic regression and print the accuracies.

  Uses the notebook-level feature matrix X and labels y.

  Args:
    T: Number of boosting rounds (passed as n_estimators).
    cv_count: Number of cross-validation folds.
  """
  base_learner = LogisticRegression(max_iter=10000)
  booster = AdaBoostClassifier(base_learner, n_estimators=T)
  accuracy = make_scorer(accuracy_score)
  cv_result = cross_validate(booster, X, y, cv=cv_count, scoring=accuracy)
  fold_scores = cv_result["test_score"]
  print("AdaBoost(", T, ") test scores:", fold_scores)
  print("Average:", sum(fold_scores)/len(fold_scores))

#evaluate increasingly large ensembles
for n_rounds in (1, 10, 100, 1000):
  adaboost(n_rounds)


AdaBoost only performs about as well as the "filter" method, and is *significantly* more expensive (in terms of runtime).