In [1]:
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import cv2
from PIL import Image
from tqdm import tqdm
from skimage.feature import hog
from skimage.color import rgb2grey
import scikitplot as skplt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import Pipeline
import timeit
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ShuffleSplit
import mahotas
import pickle
from sklearn.metrics import plot_confusion_matrix

In [2]:
# feature-descriptor-1: Hu Moments
def fd_hu_moments(image):
    feature = cv2.HuMoments(cv2.moments(image)).flatten()
    return feature

In [3]:
# feature-descriptor-2: Haralick Texture
def fd_haralick(image):
    haralick = mahotas.features.haralick(image).mean(axis=0)
    return haralick

In [25]:
training_data = []
IMG_SIZE =500
CATEGORIES = ["NORMAL","PNEUMONIA"]
DATADIR = "C:/Users/yashh/Data Analytics/Project/chest-xray-pneumonia/chest_xray/train"

def create_training_data():
    for category in CATEGORIES:  

        path = os.path.join(DATADIR,category) 
        class_num = CATEGORIES.index(category) 

        for img in tqdm(os.listdir(path)): 
            try:
                img_array = cv2.imread(os.path.join(path,img) ,cv2.IMREAD_GRAYSCALE)  # convert to array
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))  # resize to normalize data size

                fv_hu_moments = fd_hu_moments(new_array)
                fv_haralick   = fd_haralick(new_array)

                global_feature = np.hstack([fv_haralick, fv_hu_moments])
                #haralick = mahotas.features.haralick(new_array).mean(axis=0)
                #histogram_features, hist_image= hog(new_array, orientations=8, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=True)
        
                training_data.append([global_feature,class_num]) 
            except Exception as e:  
                pass

create_training_data()

print(len(training_data))

100%|██████████████████████████████████████████████████████████████████████████████| 1341/1341 [02:56<00:00,  7.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3875/3875 [05:33<00:00, 15.01it/s]


5216


In [26]:
testing_data = []
IMG_SIZE =200
CATEGORIES = ["NORMAL","PNEUMONIA"]
DATADIR = "C:/Users/yashh/Data Analytics/Project/chest-xray-pneumonia/chest_xray/test"

def create_testing_data():
    for category in CATEGORIES:  

        path = os.path.join(DATADIR,category) 
        class_num = CATEGORIES.index(category) 

        for img in tqdm(os.listdir(path)): 
            try:
                img_array = cv2.imread(os.path.join(path,img) ,cv2.IMREAD_GRAYSCALE)  # convert to array
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))  # resize to normalize data size

                fv_hu_moments = fd_hu_moments(new_array)
                fv_haralick   = fd_haralick(new_array)

                global_feature = np.hstack([fv_haralick, fv_hu_moments])
                #haralick = mahotas.features.haralick(new_array).mean(axis=0)
                #histogram_features, hist_image= hog(new_array, orientations=8, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=True)
        
                testing_data.append([global_feature,class_num]) 
            except Exception as e:  
                pass

create_testing_data()

print(len(testing_data))

100%|████████████████████████████████████████████████████████████████████████████████| 234/234 [00:17<00:00, 13.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 390/390 [00:19<00:00, 19.97it/s]


624


In [27]:
data = []
IMG_SIZE =200
CATEGORIES = ["NORMAL","PNEUMONIA"]
DATADIR = "C:/Users/yashh/Data Analytics/Project/chest-xray-pneumonia/chest_xray/all"

def create_data():
    for category in CATEGORIES:  

        path = os.path.join(DATADIR,category) 
        class_num = CATEGORIES.index(category) 

        for img in tqdm(os.listdir(path)): 
            try:
                img_array = cv2.imread(os.path.join(path,img) ,cv2.IMREAD_GRAYSCALE)  # convert to array
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))  # resize to normalize data size

                fv_hu_moments = fd_hu_moments(new_array)
                fv_haralick   = fd_haralick(new_array)

                global_feature = np.hstack([fv_haralick, fv_hu_moments])
                #haralick = mahotas.features.haralick(new_array).mean(axis=0)
                #histogram_features, hist_image= hog(new_array, orientations=8, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=True)
        
                data.append([global_feature,class_num]) 
            except Exception as e:  
                pass

create_data()

print(len(data))

100%|██████████████████████████████████████████████████████████████████████████████| 1575/1575 [02:06<00:00, 12.42it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 4265/4265 [01:55<00:00, 37.08it/s]


5840


In [28]:
import random
random.shuffle(training_data)
random.shuffle(testing_data)
random.shuffle(data)

In [29]:
pickle_out = open("SVM_data.pickle","wb")
pickle.dump(data, pickle_out)
pickle_out.close()

In [4]:
training_data= pickle.load(open("SVM_train.pickle","rb"))
testing_data= pickle.load(open("SVM_test.pickle","rb"))

In [5]:
X = []
y = []

for features,label in training_data:
    X.append(features)
    y.append(label)

In [6]:
X_test = []
y_test = []

for features,label in testing_data:
    X_test.append(features)
    y_test.append(label)

In [7]:
pre_process = StandardScaler()
pre_process.fit(X)
X = pre_process.transform(X)

In [8]:
pre_process = StandardScaler()
pre_process.fit(X_test)
X_test = pre_process.transform(X_test)

In [35]:
from sklearn.ensemble import RandomForestClassifier
num_trees=100
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=16)
model = RandomForestClassifier(n_estimators=num_trees, max_features=5, random_state=16)
results = cross_val_score(model, X, y, cv=cv)
print(f"Accuracy: {round(results.mean()*100, 2)}%")

Accuracy: 90.2%


In [36]:
model.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=16, verbose=0,
                       warm_start=False)

In [14]:
pickle_out = open("RF_model.pickle","wb")
pickle.dump(model, pickle_out)
pickle_out.close()

NameError: name 'model' is not defined

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
param_grid = {'n_estimators':list(range(50,500,50)),
             'criterion': ['gini', 'entropy'],
                'max_depth': [5,10,15,20],
             'min_samples_split': list(range(3,10)),
             'min_samples_leaf': [1,5,10,15,20]}

grid = GridSearchCV(rf, param_grid, cv=6, verbose=10, scoring='accuracy')

In [10]:
grid.fit(X,y)

GridSearchCV(cv=6, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,...
                                              random_state=None, verbose=0,
                                   

In [11]:
grid.best_score_

0.9029919447640967

In [12]:
grid.best_params_

{'criterion': 'entropy',
 'max_depth': 15,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 350}

In [None]:
{'criterion': 'entropy',
 'max_depth': 15,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 350}

In [17]:
model = RandomForestClassifier(criterion= 'entropy', max_depth= 15, min_samples_leaf= 1, min_samples_split= 4,n_estimators= 350)

In [18]:
model.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=350,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
pred = model.predict(X_test)

In [20]:
print(f"Accuracy: {accuracy_score(y_test, pred)}")
print(f"Recall: {recall_score(y_test, pred)}")
print(classification_report(y_test, pred))

Accuracy: 0.7756410256410257
Recall: 0.9435897435897436
              precision    recall  f1-score   support

           0       0.84      0.50      0.62       234
           1       0.76      0.94      0.84       390

    accuracy                           0.78       624
   macro avg       0.80      0.72      0.73       624
weighted avg       0.79      0.78      0.76       624



In [None]:
confusion_matrix(y_test,pred)

In [None]:
CATEGORIES = ["NORMAL","PNEUMONIA"]
disp = plot_confusion_matrix(model.fit(X,y), X_test, y_test,
                                 display_labels=CATEGORIES,
                                 cmap=plt.cm.Blues,
                                 normalize=None)
disp.ax_.set_title("Chest X-Ray Images")

In [None]:
def grid_search():
    standardized_data = StandardScaler()
    pca = PCA(n_components=10)
    svc = SVC()
    pipe = Pipeline([('standardized_data', standardized_data),
                     ('pca', pca),
                     ('svc', svc)])
    
    start = timeit.default_timer()
    svm_parameters = [{'svc__C':[0.1,1.,10.,100.], 'svc__kernel':['rbf'], 'svc__gamma':[0.1,1.,10.,100.], 
                       'svc__probability':[True], 'svc__random_state':[155]}]  
    grid_search_svm = GridSearchCV(pipe, svm_parameters, cv=3, n_jobs=4)
    grid_search_svm.fit(train_x, train_y)
    print(f'Runtime for SVM:{timeit.default_timer() - start}')
    return grid_search_svm

In [None]:
svm_model = grid_search()
svm_model.best_params_

In [None]:
model =SVC(C= 1.0,gamma= 0.1, kernel='rbf',probability=True,random_state= 155)

In [None]:
model.fit(train_x_stand,train_y)

In [None]:
svm_results = model.predict(test_x)

In [None]:
print(f"Accuracy: {accuracy_score(test_y, svm_results)}")
print(f"Recall: {recall_score(test_y, svm_results)}")
print(classification_report(test_y, svm_results))

In [None]:
def plot_roc(probs, label):
    fpr = {}
    tpr = {}
    roc_auc = {}
    # results = [n for n in np.ravel(test_y) if n == label]
    for i in range(2):  #  binary classifier
        fpr[i], tpr[i], _ = roc_curve(test_y, probs[:,i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        print(roc_auc[i])
        

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test_y.ravel(), svm_results.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    fig, ax = plt.subplots()
    ax.plot(fpr[label], tpr[label], color='darkorange', lw=2, label='ROC curve {:.2f}'.format(roc_auc[label]))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
predict_prob = svm_model.predict_proba(test_x)

In [None]:
import scikitplot as skplt
skplt.metrics.plot_roc(test_y, predict_prob)
plt.show()

In [None]:
print()