# EE257 Project - Feature Extraction

[Shoulder Implant X-Ray Manufacturer Classification Data Set (2020)](https://archive.ics.uci.edu/ml/datasets/Shoulder+Implant+X-Ray+Manufacturer+Classification)

In [40]:
# handle imports
import os
import tensorflow as tf
from PIL import Image
import random
import pathlib
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, LassoCV, RidgeCV

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# prints whether running on CPU or GPU
tf.debugging.set_log_device_placement(False)

In [3]:
current_path = os.getcwd()
dataset_path = current_path + '\dataset'
dataset_path

'c:\\Users\\tickn\\ml\\EE257\\EE257 Project\\dataset'

In [4]:
# Load dataset and split

data_dir = pathlib.Path(dataset_path + '\data')
batch_size = 32

def describe_img(filepath):
    rand_img = random.choice(list(filepath.glob('**\*.jpg')))
    width, height = Image.open(str(rand_img)).size

    return width, height

def random_img(filepath):
    return  random.choice(list(filepath.glob('**\*.jpg')))

img_width, img_height = describe_img(data_dir)

# load image dataset
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split = 0.2,
    subset = "training",
    seed = 123,
    color_mode="grayscale",
    image_size = (img_height , img_width),
    batch_size = batch_size
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split = 0.2,
    subset = "validation",
    seed = 123,
    color_mode="grayscale",
    image_size = (img_height , img_width),
    batch_size = batch_size
)



Found 597 files belonging to 4 classes.
Using 478 files for training.
Found 597 files belonging to 4 classes.
Using 119 files for validation.


In [5]:
def dataset_to_2D(dataset):
    x = []
    y = []
    for img_batch, label_batch in dataset:
        # flatten images since model fit() needs 2D input
        for img in img_batch:
            x.append(img.flatten())
        for label in label_batch:
            y.append(label)
    return x, y
        
x_train, y_train = dataset_to_2D(train_ds.as_numpy_iterator())
x_test, y_test = dataset_to_2D(test_ds.as_numpy_iterator())

print(np.shape(x_train))
print(np.shape(y_train))

    


(478, 62500)
(478,)


In [6]:
# Feature selection using lasso, varying alpha values
L1_0 = SelectFromModel(Lasso(alpha=0.1 , max_iter=10000)).fit(x_train, y_train)
L1_1 = SelectFromModel(Lasso(alpha=1 , max_iter=10000)).fit(x_train, y_train)
L1_2 = SelectFromModel(Lasso(alpha=10 ,max_iter=10000)).fit(x_train, y_train)


  model = cd_fast.enet_coordinate_descent(


In [7]:
# Different regularized datasets
L1_0train = L1_0.transform(x_train)
L1_0test = L1_0.transform(x_test)

L1_1train = L1_1.transform(x_train)
L1_1test = L1_1.transform(x_test)

L1_2train = L1_2.transform(x_train)
L1_2test = L1_2.transform(x_test)

In [8]:
# feature selection using Lasso shrinkage
print("alpha = 1")
print(L1_0.get_support(indices=True))
print(np.shape(L1_0train))
print("------------------------------------------------")
print("alpha = 10")
print(L1_1.get_support(indices=True))
print(np.shape(L1_1train))
print("------------------------------------------------")
print("alpha = 100")
print(L1_2.get_support(indices=True))
print(np.shape(L1_2train))

alpha = 1
[   45    53    91   143   145   940  1099  1376  1566  1687  1690  1693
  2377  2686  2703  2929  3152  3221  3309  3594  3937  3953  4569  4635
  4680  4732  4851  5048  5050  5147  5381  5407  5856  6094  6150  6178
  6314  6703  6947  7386  7425  7571  7572  7582  7909  8106  8145  8235
  8368  8453  8458  8464  8534  8623  8634  8675  8839  8840  9152  9358
  9553  9598  9600  9608  9798  9953 10221 10386 10387 10528 10546 10547
 10586 10616 10617 10651 10700 10880 10949 11114 11363 11371 11419 11436
 11603 11609 11621 11622 11629 11686 11850 11879 11897 11908 12139 12193
 12197 12219 12249 12264 12291 12370 12389 12413 12616 12620 12631 12910
 13118 13308 13323 13331 13348 13610 13868 14104 14146 14161 14169 14219
 14358 14383 14392 14483 14584 14594 14642 14656 14845 14846 14915 15042
 15113 15167 15545 15614 15791 15864 15887 15893 16091 16145 16250 16418
 16556 16563 16601 16623 16669 16812 16846 16872 16873 16931 16942 16945
 17304 17581 17906 18045 18863 18892 1934

In [9]:
l1_training_set = []
l1_test_set = []

l1_training_set.append((L1_0train,y_train))
l1_training_set.append((L1_1train,y_train))
l1_training_set.append((L1_2train,y_train))

l1_test_set.append((L1_0test,y_test))
l1_test_set.append((L1_1test,y_test))
l1_test_set.append((L1_2test,y_test))


In [10]:
# Feature selection using ridge, varying alpha values
L2_0 = SelectFromModel(Ridge(alpha = 0.1)).fit(x_train, y_train)
L2_1 = SelectFromModel(Lasso(alpha=1)).fit(x_train, y_train)
L2_2 = SelectFromModel(Lasso(alpha=10)).fit(x_train, y_train)


  model = cd_fast.enet_coordinate_descent(


In [11]:
# Different regularized datasets
L2_0train = L2_0.transform(x_train)
L2_0test = L2_0.transform(x_test)

L2_1train = L2_1.transform(x_train)
L2_1test = L2_1.transform(x_test)

L2_2train = L2_2.transform(x_train)
L2_2test = L2_2.transform(x_test)

l2_training_set = []
l2_test_set = []

l2_training_set.append((L2_0train,y_train))
l2_training_set.append((L2_1train,y_train))
l2_training_set.append((L2_2train,y_train))

l2_test_set.append((L2_0test,y_test))
l2_test_set.append((L2_1test,y_test))
l2_test_set.append((L2_2test,y_test))

In [17]:
l1_training_set[0][0]

array([[  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  3., 132., 110., ..., 147., 152., 148.],
       [ 23.,  23.,  23., ..., 122., 129., 134.],
       ...,
       [  6.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   2.,   0.,   0.],
       [ 47.,  47.,  47., ..., 198., 196., 192.]], dtype=float32)

In [12]:
# feature selection using Ridge shrinkage
print("alpha = 1")
print(L2_0.get_support(indices=True))
print(np.shape(L2_0train))
print("------------------------------------------------")
print("alpha = 10")
print(L2_1.get_support(indices=True))
print(np.shape(L2_1train))
print("------------------------------------------------")
print("alpha = 100")
print(L2_2.get_support(indices=True))
print(np.shape(L2_2train))

alpha = 1
[    6    22    23 ... 62490 62491 62492]
(478, 25357)
------------------------------------------------
alpha = 10
[   53   606  1106  1377  1693  1876  2932  3841  3953  5048  5381  6546
  7571  8106  8453  8697  8698  8884  9089  9358  9367  9608  9629  9856
 10203 10547 10613 10615 10880 10953 11096 11436 11857 11908 12192 12214
 12366 12373 12389 12616 12640 12652 12853 12867 12910 14104 14147 14781
 15454 15614 16091 16137 16138 16562 16601 16623 16696 17437 17582 17965
 18892 18951 19394 19446 19696 20794 21089 21123 21671 21682 21684 22158
 22531 22817 23108 24116 25175 25454 25750 27335 27567 27580 27627 27875
 28156 28667 28918 29525 29954 30417 30676 31045 31134 31385 31655 32051
 32126 32299 32594 32871 36437 36902 36931 38171 38929 39428 39429 40175
 40231 40301 41337 42312 42814 43275 43327 43426 43525 43587 44570 46619
 47830 47831 48294 48408 48438 48439 48440 48457 48824 48999 49249 49757
 50167 50250 50531 50824 50934 50935 51269 51349 51375 51770 52500 52525

Effects of different regularization strength on model performance

In [13]:
# Support Vector Machine
svm = SVC(C=10)

In [31]:
def reg_testing(estimator, training , test):
    for i in range(len(training)):
        
        estimator.fit(training[i][0],training[i][1])
        predictions = estimator.predict(test[i][0])
        
        print("Confusion Matrix")
        print(confusion_matrix(test[i][1], predictions))
        print("--------------------------")
        print(classification_report(test[i][1], predictions, target_names=['Cofield' , 'Depuy' , 'Tornier' , 'Zimmer']))
        print("--------------------------")
        print(" Training error: %f " %estimator.score(training[i][0],training[i][1]))
        print(" Test error: %f " %estimator.score(test[i][0],test[i][1]))
        print()
        print()
        

In [32]:
#L1
reg_testing(svm,l1_training_set,l1_test_set)

Confusion Matrix
[[ 5  5  1  4]
 [ 2 47  3  9]
 [ 1  8  1  8]
 [ 5 13  1  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.38      0.33      0.36        15
       Depuy       0.64      0.77      0.70        61
     Tornier       0.17      0.06      0.08        18
      Zimmer       0.22      0.24      0.23        25

    accuracy                           0.50       119
   macro avg       0.35      0.35      0.34       119
weighted avg       0.45      0.50      0.47       119

--------------------------
 Training error: 0.974895 
 Test error: 0.495798 


Confusion Matrix
[[ 5  6  1  3]
 [ 1 48  4  8]
 [ 0 12  1  5]
 [ 4 14  1  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.50      0.33      0.40        15
       Depuy       0.60      0.79      0.68        61
     Tornier       0.14      0.06      0.08        18
      Zimmer       0.27      0.24      0.26        25

    

In [33]:
#L2
reg_testing(svm,l2_training_set,l2_test_set)

Confusion Matrix
[[ 5  7  1  2]
 [ 3 45  1 12]
 [ 2 10  0  6]
 [ 4 13  1  7]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.36      0.33      0.34        15
       Depuy       0.60      0.74      0.66        61
     Tornier       0.00      0.00      0.00        18
      Zimmer       0.26      0.28      0.27        25

    accuracy                           0.48       119
   macro avg       0.30      0.34      0.32       119
weighted avg       0.41      0.48      0.44       119

--------------------------
 Training error: 0.953975 
 Test error: 0.478992 


Confusion Matrix
[[ 5  6  1  3]
 [ 1 48  4  8]
 [ 0 12  1  5]
 [ 4 14  1  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.50      0.33      0.40        15
       Depuy       0.60      0.79      0.68        61
     Tornier       0.14      0.06      0.08        18
      Zimmer       0.27      0.24      0.26        25

    

In [36]:
# Random Forest
rand_tree = RandomForestClassifier()

In [37]:
#L1
reg_testing(rand_tree,l1_training_set,l1_test_set)

Confusion Matrix
[[ 5  7  1  2]
 [ 2 51  1  7]
 [ 1 14  0  3]
 [ 2 17  0  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.50      0.33      0.40        15
       Depuy       0.57      0.84      0.68        61
     Tornier       0.00      0.00      0.00        18
      Zimmer       0.33      0.24      0.28        25

    accuracy                           0.52       119
   macro avg       0.35      0.35      0.34       119
weighted avg       0.43      0.52      0.46       119

--------------------------
 Training error: 1.000000 
 Test error: 0.521008 


Confusion Matrix
[[ 5  8  1  1]
 [ 2 50  1  8]
 [ 0 13  0  5]
 [ 1 17  1  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.62      0.33      0.43        15
       Depuy       0.57      0.82      0.67        61
     Tornier       0.00      0.00      0.00        18
      Zimmer       0.30      0.24      0.27        25

    

In [38]:
#L2
reg_testing(rand_tree,l2_training_set,l2_test_set)

Confusion Matrix
[[ 5  7  1  2]
 [ 3 51  0  7]
 [ 2  9  0  7]
 [ 1 18  0  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.45      0.33      0.38        15
       Depuy       0.60      0.84      0.70        61
     Tornier       0.00      0.00      0.00        18
      Zimmer       0.27      0.24      0.26        25

    accuracy                           0.52       119
   macro avg       0.33      0.35      0.33       119
weighted avg       0.42      0.52      0.46       119

--------------------------
 Training error: 1.000000 
 Test error: 0.521008 


Confusion Matrix
[[ 5  9  1  0]
 [ 2 51  1  7]
 [ 0  8  0 10]
 [ 2 18  0  5]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.56      0.33      0.42        15
       Depuy       0.59      0.84      0.69        61
     Tornier       0.00      0.00      0.00        18
      Zimmer       0.23      0.20      0.21        25

    

In [41]:
# Decision Tree
tree = DecisionTreeClassifier()

In [42]:
#L1
reg_testing(tree,l1_training_set,l1_test_set)

Confusion Matrix
[[ 5  6  2  2]
 [ 7 31  7 16]
 [ 4  7  1  6]
 [ 4  8  2 11]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.25      0.33      0.29        15
       Depuy       0.60      0.51      0.55        61
     Tornier       0.08      0.06      0.07        18
      Zimmer       0.31      0.44      0.37        25

    accuracy                           0.40       119
   macro avg       0.31      0.33      0.32       119
weighted avg       0.42      0.40      0.40       119

--------------------------
 Training error: 1.000000 
 Test error: 0.403361 


Confusion Matrix
[[ 5  5  1  4]
 [ 8 33  8 12]
 [ 1 11  2  4]
 [ 3 12  4  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.29      0.33      0.31        15
       Depuy       0.54      0.54      0.54        61
     Tornier       0.13      0.11      0.12        18
      Zimmer       0.23      0.24      0.24        25

    

In [43]:
#L2
reg_testing(tree,l2_training_set,l2_test_set)

Confusion Matrix
[[ 7  2  4  2]
 [ 7 31  6 17]
 [ 1  7  2  8]
 [ 3 12  3  7]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.39      0.47      0.42        15
       Depuy       0.60      0.51      0.55        61
     Tornier       0.13      0.11      0.12        18
      Zimmer       0.21      0.28      0.24        25

    accuracy                           0.39       119
   macro avg       0.33      0.34      0.33       119
weighted avg       0.42      0.39      0.40       119

--------------------------
 Training error: 1.000000 
 Test error: 0.394958 


Confusion Matrix
[[ 5  6  1  3]
 [ 6 36 11  8]
 [ 2  9  2  5]
 [ 3 11  3  8]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.31      0.33      0.32        15
       Depuy       0.58      0.59      0.59        61
     Tornier       0.12      0.11      0.11        18
      Zimmer       0.33      0.32      0.33        25

    

In [44]:
# Log Reg
logreg = LogisticRegression(solver='newton-cg',max_iter=1000)

In [45]:
#L1
reg_testing(logreg,l1_training_set,l1_test_set)



Confusion Matrix
[[ 7  3  1  4]
 [ 6 36  7 12]
 [ 2  7  3  6]
 [ 5 16  0  4]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.35      0.47      0.40        15
       Depuy       0.58      0.59      0.59        61
     Tornier       0.27      0.17      0.21        18
      Zimmer       0.15      0.16      0.16        25

    accuracy                           0.42       119
   macro avg       0.34      0.35      0.34       119
weighted avg       0.42      0.42      0.41       119

--------------------------
 Training error: 1.000000 
 Test error: 0.420168 


Confusion Matrix
[[ 4  7  3  1]
 [ 5 29 11 16]
 [ 2  9  2  5]
 [ 5 13  1  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.25      0.27      0.26        15
       Depuy       0.50      0.48      0.49        61
     Tornier       0.12      0.11      0.11        18
      Zimmer       0.21      0.24      0.23        25

    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
#L2
reg_testing(logreg,l2_training_set,l2_test_set)



Confusion Matrix
[[ 5  7  2  1]
 [ 2 36  7 16]
 [ 1  7  3  7]
 [ 6 10  1  8]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.36      0.33      0.34        15
       Depuy       0.60      0.59      0.60        61
     Tornier       0.23      0.17      0.19        18
      Zimmer       0.25      0.32      0.28        25

    accuracy                           0.44       119
   macro avg       0.36      0.35      0.35       119
weighted avg       0.44      0.44      0.44       119

--------------------------
 Training error: 1.000000 
 Test error: 0.436975 


Confusion Matrix
[[ 4  7  3  1]
 [ 5 29 11 16]
 [ 2  9  2  5]
 [ 5 13  1  6]]
--------------------------
              precision    recall  f1-score   support

     Cofield       0.25      0.27      0.26        15
       Depuy       0.50      0.48      0.49        61
     Tornier       0.12      0.11      0.11        18
      Zimmer       0.21      0.24      0.23        25

    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
