# EE257 Project - Feature Extraction

[Shoulder Implant X-Ray Manufacturer Classification Data Set (2020)](https://archive.ics.uci.edu/ml/datasets/Shoulder+Implant+X-Ray+Manufacturer+Classification)

In [44]:
# handle imports
import os
import tensorflow as tf
from PIL import Image
import random
import pathlib
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, LassoCV

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, classification_report


In [1]:
current_path = os.getcwd()
dataset_path = current_path + '\dataset'
dataset_path

'c:\\Users\\tickn\\ml\\EE257\\EE257 Project\\dataset'

In [2]:
# Load dataset and split

data_dir = pathlib.Path(dataset_path + '\data')
batch_size = 32

def describe_img(filepath):
    rand_img = random.choice(list(filepath.glob('**\*.jpg')))
    width, height = Image.open(str(rand_img)).size

    return width, height

def random_img(filepath):
    return  random.choice(list(filepath.glob('**\*.jpg')))

img_width, img_height = describe_img(data_dir)

# load image dataset
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split = 0.2,
    subset = "training",
    seed = 123,
    color_mode="grayscale",
    image_size = (img_height , img_width),
    batch_size = batch_size
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split = 0.2,
    subset = "validation",
    seed = 123,
    color_mode="grayscale",
    image_size = (img_height , img_width),
    batch_size = batch_size
)



Found 597 files belonging to 4 classes.
Using 478 files for training.
Found 597 files belonging to 4 classes.
Using 119 files for validation.


In [3]:
def dataset_to_2D(dataset):
    x = []
    y = []
    for img_batch, label_batch in dataset:
        # flatten images since model fit() needs 2D input
        for img in img_batch:
            x.append(img.flatten())
        for label in label_batch:
            y.append(label)
    return x, y
        
x_train, y_train = dataset_to_2D(train_ds.as_numpy_iterator())
x_test, y_test = dataset_to_2D(test_ds.as_numpy_iterator())

print(np.shape(x_train))
print(np.shape(y_train))

    


(478, 62500)
(478,)


In [8]:
# feature selection pipeline
lasso = Lasso(fit_intercept=True, max_iter=10000)
lasso.fit(x_train, y_train)

Lasso(max_iter=10000)

In [11]:
# feature selection using Lasso shrinkage
l1_select = SelectFromModel(lasso).fit(x_train, y_train)
print(l1_select.get_support(indices=True))


[   53   606  1106  1377  1693  1876  2932  3841  3953  5048  5381  6546
  7571  8106  8453  8697  8698  8884  9089  9358  9367  9608  9629  9856
 10203 10547 10613 10615 10880 10953 11096 11436 11857 11908 12192 12214
 12366 12373 12389 12616 12640 12652 12853 12867 12910 14104 14147 14781
 15454 15614 16091 16137 16138 16562 16601 16623 16696 17437 17582 17965
 18892 18951 19394 19446 19696 20794 21089 21123 21671 21682 21684 22158
 22531 22817 23108 24116 25175 25454 25750 27335 27567 27580 27627 27875
 28156 28667 28918 29525 29954 30417 30676 31045 31134 31385 31655 32051
 32126 32299 32594 32871 36437 36902 36931 38171 38929 39428 39429 40175
 40231 40301 41337 42312 42814 43275 43327 43426 43525 43587 44570 46619
 47830 47831 48294 48408 48438 48439 48440 48457 48824 48999 49249 49757
 50167 50250 50531 50824 50934 50935 51269 51349 51375 51770 52500 52525
 52537 52553 52787 53287 54327 54688 54998 55580 55830 56209 56210 56222
 56652 56750 56775 56798 56803 56805 56953 57558 57

In [12]:
ridge = Ridge(fit_intercept=True, max_iter=10000)
ridge.fit(x_train, y_train)

Ridge(max_iter=10000)

In [13]:
# feature selection using Ridge shrinkage
l2_select = SelectFromModel(ridge).fit(x_train, y_train)
print(l2_select.get_support(indices=True))


[    6    22    23 ... 62490 62491 62492]


In [32]:
# search best param for baseline model
svm = GridSearchCV(
    SVC(),
    param_grid={
        "C" : [0.1 , 1.0 , 10.0 , 100.0 , 1000.0],
        "kernel" : ['linear' , 'poly' , 'rbf']
    },
    scoring='accuracy'
)

svm.fit(x_train , y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1.0, 10.0, 100.0, 1000.0],
                         'kernel': ['linear', 'poly', 'rbf']},
             scoring='accuracy')

In [33]:
svm.best_params_

{'C': 10.0, 'kernel': 'rbf'}

In [29]:
clf1 = Pipeline([
    ('feature selection' , SelectFromModel(Ridge(fit_intercept=True, max_iter=10000))),
    ('model' , SVC(C=10.0 , kernel='rbf'))
])

clf2 = Pipeline([
    ('feature selection' , SelectFromModel(Lasso(fit_intercept=True, max_iter=10000))),
    ('model' , SVC(C=10.0 , kernel='rbf'))
])


clf1.fit(x_train , y_train)
clf2.fit(x_train,y_train)

print(" Training error with l1 reg: %f " %clf1.score(x_train, y_train))
print(" Test error l1 reg: %f " %clf1.score(x_test, y_test))
print()
print(" Training error l2 reg: %f " %clf2.score(x_train, y_train))
print(" Test error l2 reg: %f " %clf2.score(x_test, y_test))


 Training error with l1 reg: 0.953975 
 Test error l1 reg: 0.478992 
 Training error l2 reg: 0.951883 
 Test error l2 reg: 0.504202 


In [37]:
cl1_predict = clf1.predict(x_test)
cl2_predict = clf2.predict(x_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, cl1_predict))
print("--------------------------")
print(classification_report(y_test, cl1_predict, target_names=['Cofield' , 'Depuy' , 'Tornier' , 'Zimmer']))

Confusion Matrix
[[ 5  7  1  2]
 [ 3 45  1 12]
 [ 2 10  0  6]
 [ 4 13  1  7]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.36      0.33      0.34        15
       Depuy       0.60      0.74      0.66        61
     Tornier       0.00      0.00      0.00        18
      Zimmer       0.26      0.28      0.27        25

    accuracy                           0.48       119
   macro avg       0.30      0.34      0.32       119
weighted avg       0.41      0.48      0.44       119



In [38]:
print()

print("Confusion Matrix")
print(confusion_matrix(y_test, cl2_predict))
print("--------------------------")
print(classification_report(y_test, cl2_predict, target_names=['Cofield' , 'Depuy' , 'Tornier' , 'Zimmer']))




Confusion Matrix
[[ 5  6  1  3]
 [ 1 48  4  8]
 [ 0 12  1  5]
 [ 4 14  1  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.50      0.33      0.40        15
       Depuy       0.60      0.79      0.68        61
     Tornier       0.14      0.06      0.08        18
      Zimmer       0.27      0.24      0.26        25

    accuracy                           0.50       119
   macro avg       0.38      0.35      0.35       119
weighted avg       0.45      0.50      0.47       119



In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

d_tree = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid={
        "criterion" : ['gini' , 'entropy'],
        "splitter" : ['best' , 'random']
    },
    scoring='accuracy'
)

d_tree.fit(x_train , y_train)


GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'splitter': ['best', 'random']},
             scoring='accuracy')

In [40]:
d_tree.best_params_

{'criterion': 'entropy', 'splitter': 'best'}

In [41]:
clf3 = Pipeline([
    ('feature selection' , SelectFromModel(Ridge(fit_intercept=True, max_iter=10000))),
    ('model' , DecisionTreeClassifier(criterion="entropy"))
])

clf4 = Pipeline([
    ('feature selection' , SelectFromModel(Lasso(fit_intercept=True, max_iter=10000))),
    ('model' , DecisionTreeClassifier(criterion="entropy"))
])


clf3.fit(x_train , y_train)
clf4.fit(x_train,y_train)

print(" Training error with l1 reg: %f " %clf3.score(x_train, y_train))
print(" Test error l1 reg: %f " %clf3.score(x_test, y_test))
print()
print(" Training error l2 reg: %f " %clf4.score(x_train, y_train))
print(" Test error l2 reg: %f " %clf4.score(x_test, y_test))

 Training error with l1 reg: 1.000000 
 Test error l1 reg: 0.369748 

 Training error l2 reg: 1.000000 
 Test error l2 reg: 0.378151 


In [42]:
rand_tree = GridSearchCV(
    RandomForestClassifier(),
    param_grid={
        "n_estimators" : [1 , 100 , 1000],
        "criterion" : ['gini' , 'entropy'],
    },
    scoring='accuracy'
)

rand_tree.fit(x_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [1, 100, 1000]},
             scoring='accuracy')

In [43]:
rand_tree.best_params_

{'criterion': 'gini', 'n_estimators': 100}

In [45]:
clf5 = Pipeline([
    ('feature selection' , SelectFromModel(Ridge(fit_intercept=True, max_iter=10000))),
    ('model' , RandomForestClassifier())
])

clf6 = Pipeline([
    ('feature selection' , SelectFromModel(Lasso(fit_intercept=True, max_iter=10000))),
    ('model' , RandomForestClassifier())
])


clf5.fit(x_train , y_train)
clf6.fit(x_train,y_train)

print(" Training error with l1 reg: %f " %clf5.score(x_train, y_train))
print(" Test error l1 reg: %f " %clf5.score(x_test, y_test))
print()
print(" Training error l2 reg: %f " %clf6.score(x_train, y_train))
print(" Test error l2 reg: %f " %clf6.score(x_test, y_test))

 Training error with l1 reg: 1.000000 
 Test error l1 reg: 0.521008 

 Training error l2 reg: 1.000000 
 Test error l2 reg: 0.512605 
