# Pneumonia_Training

Versions

* v001_Initial commit

Imports

In [14]:
# general
import numpy as np
import pandas as pd
import os
import pydicom

# keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

ModuleNotFoundError: No module named 'pydicom'

Constants

In [2]:
DATA_DIR = 'C:/Users/loren/Downloads'
TRAIN_DCM = os.path.join(DATA_DIR, "stage_1_train_images")
TEST_DCM = os.path.join(DATA_DIR, "stage_1_test_images")
TRAIN_IMAGES = os.path.join(DATA_DIR, "train_images")
TEST_IMAGES = os.path.join(DATA_DIR, "test_images")

## Extract features using NN

In [14]:
# general
import numpy as np
import pandas as pd
import os
import pydicom
from scipy import misc

In [17]:
def dcm2np(dcm_path, target_size=(224, 224)):
    """ Transforms a dcm into a np.array
    """
    # --- Open DICOM file
    d = pydicom.read_file(dcm_path)
    im = d.pixel_array

    # --- Convert from single-channel grayscale to 3-channel RGB
    im = np.stack([im] * 3, axis=2)
    
    im = scipy.misc.imresize(im, target_size, interp='bilinear', mode=None)
    
    return im

In [18]:
gna = dcm2np(r'C:\Users\loren\Downloads\stage_1_train_images\00aecb01-a116-45a2-956c-08d2fa55433f.dcm')

gna.shape

`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.


(224, 224, 3)

In [5]:
def _get_feature_values(img_path, model):
    """ Get the feature values for a specific image out of pre-trained model 
    """
    im = dcm2np(img_path)
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x)[0].reshape(-1)

def _create_pretrained_feature_df(class_directory):
    """ Create features dataframe based on pretrained model 
    """
    features = []
    n_max = len(os.listdir(class_directory))
    divisor = 100
    for n, file in enumerate(os.listdir(class_directory)):
        if( n % (n_max/divisor) ==0 ):
            print('Scoring image {0}'.format(n/n_max))
        full_path = os.path.join(class_directory, file)
        features.append(_get_feature_values(full_path, model))
    return pd.DataFrame(np.stack(features))

def _build_basic_rf(df):
    """ Build basic random forest based on extracted pre-trained model 
    """
    x_train, x_test, y_train, y_test = train_test_split(df[df.columns.difference(["Target"])], df["Target"])
    rf = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_leaf=15, n_jobs=-1, class_weight="balanced")
    rf.fit(x_train, y_train)
    train_preds = rf.predict_proba(x_train)
    test_preds = rf.predict_proba(x_test)
    print(f"AUC for train set is {roc_auc_score(y_train, train_preds[:,1])}")
    print(f"AUC for test set is {roc_auc_score(y_test, test_preds[:,1])}")
    return rf

In [None]:
model = InceptionV3(weights='imagenet', include_top=False)

pos_df = _create_pretrained_feature_df(os.path.join(TRAIN_IMAGES, "positive"))
pos_df["Target"] = 1

neg_df = _create_pretrained_feature_df(os.path.join(TRAIN_IMAGES, "negative"))
neg_df["Target"] = 0

full_df = pd.concat([pos_df, neg_df])

In [10]:
pos_df.to_csv(os.path.join(DATA_DIR, 'Inception_v3_positive.csv'))

In [6]:
rf_trained = _build_basic_rf(full_df)

Scoring image 0.0
Scoring image 0.0
Scoring image 4.0
Scoring image 8.0


OSError: image file is truncated

## Classifiers

In [None]:
classifiers = {
    "Nearest Neighbors":KNeighborsClassifier(3),
    "Linear SVM":SVC(kernel="linear", C=0.025),
    "RBF SVM":SVC(gamma=2, C=1),
    "Gaussian Process":GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    "Decision Tree":DecisionTreeClassifier(max_depth=5),
    "Random Forest":RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Neural Net":MLPClassifier(alpha=1),
    "AdaBoost":AdaBoostClassifier(),
    "Naive Bayes":GaussianNB(),
    "QDA":QuadraticDiscriminantAnalysis()
}

In [None]:
figure = plt.figure(figsize=(27, 13))
plt.xlabel('',size = 20)

# iterate over classifiers
for i, (name, clf) in enumerate(classifiers.items()):
    
    # fit
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    # plot
    ax = plt.subplot((len(classifiers) // 4) +1, 4, i+1)
    ax.plot(Y.index, clf.predict(X), marker='o')
    ax.plot(Y.index, Y.values, marker='x')
    ax.set_yticks(())
    for tick in ax.xaxis.get_major_ticks():
                tick.label.set_fontsize(20) 
    ax.set_title(name, fontsize=20)
    ax.text(Y.index[0], 0.5, ('%.3f' % score),
            size=15, horizontalalignment='left')

plt.tight_layout()
plt.show()

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues
                          index=0):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    ax = plt.subplot((len(classifiers) // 4) +1, 4, i+1)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plt.figure(figusize=(27, 13))

for i, (name, clf) in enumerate(classifiers.items()):
    Y2_test = Y2
    Y2_pred = clf.predict(X2)
             
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(Y2_test, Y2_pred)
    np.set_printoptions(precision=2)

    # Plot normalized confusion matrix
    
    plot_confusion_matrix(cnf_matrix, 
                          classes=['normal', 'recession'], 
                          normalize=True,
                          title='Normalized confusion matrix',
                          index=i)

plt.show()