In [None]:
#%matplotlib inline

import os
import numpy as np
import nibabel as nib
import matplotlib.pyplot as plt
import pandas as pd
import glob
from natsort import natsorted

from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import math, getopt, sys

Function to generate n_arr (almost) equal-sized arrays from arr:

In [None]:
def chunks(arr, n_arr):
    """Split ``arr`` into exactly ``n_arr`` contiguous, near-equal slices.

    The first ``len(arr) % n_arr`` slices receive one extra element, so slice
    lengths differ by at most one. Unlike stepping through ``arr`` in
    ceil-sized strides, this always yields ``n_arr`` slices (5 elements into
    4 parts gives sizes 2, 1, 1, 1 instead of only three slices) and it
    tolerates empty input, for which every slice is empty.

    Parameters
    ----------
    arr : sequence
        Anything supporting ``len`` and slicing (list, range, ndarray, ...).
    n_arr : int
        Number of slices to produce; must be at least 1.

    Returns
    -------
    list
        ``n_arr`` slices of ``arr`` in their original order.

    Raises
    ------
    ValueError
        If ``n_arr`` is smaller than 1.
    """
    if n_arr < 1:
        raise ValueError("n_arr must be a positive integer, got {!r}".format(n_arr))

    base, extra = divmod(len(arr), n_arr)
    parts = []
    start = 0
    for idx in range(n_arr):
        # The leading `extra` slices absorb the remainder, one element each.
        stop = start + base + (1 if idx < extra else 0)
        parts.append(arr[start:stop])
        start = stop
    return parts

Function to load an image from a file located in the given directory:

In [None]:
def load_image(path, file):
    """Load a NIfTI file and return its first volume.

    Parameters
    ----------
    path : str
        Directory containing the image.
    file : str
        File name of the image inside ``path``.

    Returns
    -------
    numpy.ndarray
        The data at index 0 of the fourth axis, i.e. a 3-D array. The file
        must therefore hold 4-D data; anything else fails on the indexing.
    """
    filepath = os.path.join(path, file)
    img = nib.load(filepath)
    # `img.get_data()` is deprecated and no longer available in nibabel 5;
    # nibabel documents `np.asanyarray(img.dataobj)` as the replacement that
    # keeps the on-disk dtype (whereas `get_fdata()` always returns floats).
    img_data = np.asanyarray(img.dataobj)
    return img_data[:, :, :, 0]

Function to delete all zero vectors from image:

In [None]:
def preprocess_image(img_data, x_zero, y_zero, z_zero):
    """Return a copy of ``img_data`` without the flagged positions on each axis.

    ``x_zero``, ``y_zero`` and ``z_zero`` mark (with a value equal to 1/True)
    the positions to drop along axis 0, 1 and 2 respectively. ``np.delete``
    never modifies its input, so the caller must use the returned array.
    """
    flagged_per_axis = (x_zero, y_zero, z_zero)
    for axis, flagged in enumerate(flagged_per_axis):
        img_data = np.delete(img_data, np.where(flagged == 1), axis=axis)
    return img_data

In [None]:
def count_histograms(path, file, x_zero, y_zero, z_zero, x_pix, y_pix, z_pix):
    """Compute per-region intensity histograms for one image and concatenate them.

    The image is read with ``load_image`` and indexed exactly as loaded. Every
    combination of one index group from ``x_pix``, ``y_pix`` and ``z_pix``
    selects a sub-volume; its histogram is taken over the range
    ``(1e-5, image maximum)``, so values below ``1e-5`` (such as zero-valued
    voxels) are not counted.

    Parameters
    ----------
    path, file : str
        Directory and file name handed to ``load_image``.
    x_zero, y_zero, z_zero : array-like
        Per-axis masks of positions to ignore. They are accepted for interface
        compatibility but not used here: the earlier ``preprocess_image(...)``
        call discarded its return value (``np.delete`` returns a copy), so it
        never changed the image and only cost three volume copies per file.
        The index groups below are expressed in the coordinates of the image
        as loaded, so any cropping has to be encoded in those groups.
    x_pix, y_pix, z_pix : sequence of index sequences
        Groups of indices along axis 0, 1 and 2 that define the regions.

    Returns
    -------
    numpy.ndarray
        1-D array holding the regions' histogram counts back to back, ordered
        with the x group outermost and the z group innermost. Empty when any
        of the group lists is empty.

    Notes
    -----
    The number of bins is read from the notebook-level variable ``bins``.
    """
    img_data = load_image(path, file)

    # The upper histogram edge is the same for every region of this image.
    upper = img_data.max()

    region_hists = [
        np.histogram(
            img_data[np.ix_(x_range, y_range, z_range)],
            range=(1e-5, upper),
            bins=bins,
        )[0]
        for x_range in x_pix
        for y_range in y_pix
        for z_range in z_pix
    ]

    if not region_hists:
        # No regions requested: return an empty feature vector rather than
        # failing on an unbound local.
        return np.empty(0, dtype=np.intp)

    # Single concatenation instead of growing the array region by region.
    return np.concatenate(region_hists)

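Function to calculate histograms for train and test data. The first training image (train_1.nii) is used to locate the all-zero slices along each axis. Each image is then divided into x_parts × y_parts × z_parts blocks, and a histogram with a fixed number of bins (set by `bins`, the same for every image) is computed for each block: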

In [None]:
def histogram_arrays(x_parts=1, y_parts=1, z_parts=1, bins=10):
    """Build the block-histogram feature matrices for the train and test images.

    The reference image ``train_1.nii`` determines which positions along each
    axis are entirely zero. The remaining positions are split with ``chunks``
    into ``x_parts`` / ``y_parts`` / ``z_parts`` contiguous groups, and
    ``count_histograms`` is called with those groups for every file listed in
    the notebook-level ``train_names`` and ``test_names`` (read from
    ``src_train_path`` and ``src_test_path``).

    Parameters
    ----------
    x_parts, y_parts, z_parts : int
        Number of groups along axis 0, 1 and 2.
    bins : int
        NOTE(review): not forwarded. ``count_histograms`` takes no bins
        argument and reads the notebook-level ``bins`` instead, so keep the
        two values equal.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_array, test_array)``, each 2-D with one row per image.

    Raises
    ------
    ValueError
        If ``train_names`` or ``test_names`` is empty.
    """
    reference = load_image(src_train_path, "train_1.nii")

    # Positions whose whole slice is zero in the reference image.
    x_zero = np.all(reference == 0, axis=(1, 2))
    y_zero = np.all(reference == 0, axis=(0, 2))
    z_zero = np.all(reference == 0, axis=(0, 1))

    # Partition only the indices of the non-zero slices. The result of
    # `preprocess_image` used to be discarded here, so the groups covered the
    # full volume and the zero slices were never removed. These groups index
    # the image as loaded, so selecting with them performs the cropping
    # without copying each volume.
    x_pix = chunks(np.flatnonzero(~x_zero), x_parts)
    y_pix = chunks(np.flatnonzero(~y_zero), y_parts)
    z_pix = chunks(np.flatnonzero(~z_zero), z_parts)

    def _stack(src_path, names):
        # One feature row per image, stacked once instead of inside the loop.
        rows = [
            count_histograms(src_path, name, x_zero, y_zero, z_zero, x_pix, y_pix, z_pix)
            for name in names
        ]
        if not rows:
            raise ValueError("no image files listed for " + str(src_path))
        return np.vstack(rows)

    train_array = _stack(src_train_path, train_names)
    test_array = _stack(src_test_path, test_names)
    return train_array, test_array

Defining paths to .nii files:

In [None]:
# Both image sets live under ./data relative to the current working directory.
src_train_path, src_test_path = (
    os.path.join(os.getcwd(), "data", subset)
    for subset in ("set_train", "set_test")
)

Extracting the names of the .nii files and sorting them in natural order, so that the order is the same as in the targets.csv file:

In [None]:
# Training images: glob pattern -> matching paths -> naturally sorted base names.
train_filepaths = os.path.join(src_train_path, "*.nii")
train_paths = glob.glob(train_filepaths)
train_names = natsorted(map(os.path.basename, train_paths))

# Test images, handled the same way.
test_filepaths = os.path.join(src_test_path, "*.nii")
test_paths = glob.glob(test_filepaths)
test_names = natsorted(map(os.path.basename, test_paths))

Calculating the X (all training samples derived from the given training images), X_test (test samples derived from the given test images) and y (targets for the given training images) matrices. Data from the X matrix are split into training and validation data for local algorithm validation:

In [None]:
# Number of blocks per axis and number of histogram bins per block.
x_parts = 2
y_parts = 2
z_parts = 2
bins = 30

In [None]:
# Feature matrices: one row of block histograms per image.
X, X_test = histogram_arrays(
    x_parts=x_parts,
    y_parts=y_parts,
    z_parts=z_parts,
    bins=bins,
)

In [None]:
# targets.csv has no header row; squeeze the single column into a 1-D array.
y = pd.read_csv("targets.csv", header=None).values.squeeze()

# Hold out 10% of the labelled images for local validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

Normalizing data and PCA decomposition:

In [None]:
n_components = 20

# Fit the normalisation + PCA steps on the training split only.
pipeline = make_pipeline(Normalizer(), PCA(n_components=n_components))
pipeline.fit(X_train.astype(np.float64))

# Apply the fitted pipeline to every split. The tuple of inputs is evaluated
# before any name is rebound, so each split is transformed from its raw values.
X_train, X_valid, X_test = (
    pipeline.transform(features.astype(np.float64))
    for features in (X_train, X_valid, X_test)
)

In [None]:
# Disabled alternative: row normalisation only, without the PCA step.
# NOTE(review): the pipeline cell above already overwrites X_train / X_valid /
# X_test with their transformed versions, so enabling this as-is would
# normalise the transformed data again.
# normalizer = Normalizer().fit(X_train.astype(np.float64))
# X_train = normalizer.transform(X_train.astype(np.float64))
# X_valid = normalizer.transform(X_valid.astype(np.float64))
# X_test = normalizer.transform(X_test.astype(np.float64))

SVR with cross-validation used to fit data:

In [None]:
# Candidate values for the RBF-kernel SVR; C and gamma are searched over the
# same set of magnitudes.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}
svr = SVR(kernel='rbf')
clf = GridSearchCV(svr, param_grid, n_jobs=2)
clf.fit(X_train, y_train)  # fit() returns the estimator itself

# Scoring the whole grid search repeats the search inside every outer fold.
scores = cross_val_score(clf, X_train, y_train)

# Outer-fold scores, score on the training split, and the selected model.
print(scores, clf.score(X_train, y_train), clf.best_estimator_, sep="\n")

Validating algorithm by calculating mean squared error:

In [None]:
# Predictions are rounded to whole numbers before they are scored.
y_train_pred = np.around(clf.predict(X_train)).astype(int)
y_valid_pred = np.around(clf.predict(X_valid)).astype(int)

train_error = mean_squared_error(y_train, y_train_pred)
valid_error = mean_squared_error(y_valid, y_valid_pred)
print(train_error)
print(valid_error)

# Write one metric per line. The two entries used to be written back to back
# with no separator, which fused them into a single unreadable line.
with open("Output.txt", "w") as text_file:
    text_file.write("Train error: " + str(train_error) + "\n")
    text_file.write("Test error: " + str(valid_error) + "\n")

Saving scores obtained from original test images to produce file for submission.

In [None]:
# Predictions for the submission images, rounded to whole numbers.
y_test_pred = np.around(clf.predict(X_test)).astype(int)

# 1-based IDs derived from the number of predictions. The previous hard-coded
# range (1..138) made the DataFrame construction fail for any other number of
# test images.
nr = np.arange(1, len(y_test_pred) + 1)

df = pd.DataFrame({"ID": nr, "Prediction": y_test_pred})
df.to_csv("submission.csv", index=False)