In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import resample

### Columns

#### train.csv and test.csv

- Patient- a unique Id for each patient (also the name of the patient's DICOM folder)
- Weeks- the relative number of weeks pre/post the baseline CT (may be negative)
- FVC - the recorded lung capacity in ml
- Percent- a computed field which approximates the patient's FVC as a percent of the typical FVC for a person of similar characteristics
- Age
- Sex
- SmokingStatus

Percent is the comparison of the patient's measured FVC against expected

In [2]:
DATA_DIR = "../tmp/osic-pulmonary-fibrosis-progression" # Local

In [34]:
# Load test and training data
train_df = pd.read_csv(DATA_DIR + '/train.csv') # Use first
test_df = pd.read_csv(DATA_DIR + '/test.csv') # Save for later

# Load exported autoencoder encoding from local pickle
PICKLE_PATH = "patient_ids_to_encodings_dict.pkl"
with open(PICKLE_PATH, "rb") as f:
    patient_ct_encodings = pickle.load(f)

# Convert to DF with patient ID as rows, encoding as 128 columns
patient_ct_encodings = pd.DataFrame.from_dict(patient_ct_encodings, orient="index")

# Left join encoding to training & test data
train_df = train_df.join(patient_ct_encodings, on="Patient", how="left")
test_df = test_df.join(patient_ct_encodings, on="Patient", how="left")

train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,0,1,2,...,118,119,120,121,122,123,124,125,126,127
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935


In [35]:
train_df.info()
train_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1549 entries, 0 to 1548
Columns: 135 entries, Patient to 127
dtypes: float64(129), int64(3), object(3)
memory usage: 1.6+ MB


Unnamed: 0,Weeks,FVC,Percent,Age,0,1,2,3,4,5,...,118,119,120,121,122,123,124,125,126,127
count,1549.0,1549.0,1549.0,1549.0,1508.0,1508.0,1508.0,1508.0,1508.0,1508.0,...,1508.0,1508.0,1508.0,1508.0,1508.0,1508.0,1508.0,1508.0,1508.0,1508.0
mean,31.861846,2690.479019,77.672654,67.188509,0.077912,0.283909,0.247919,0.0,0.191235,0.021996,...,0.085014,0.0,0.017765,0.218686,0.345038,0.0,0.035827,0.0,0.140537,0.002985
std,23.24755,832.770959,19.823261,7.057395,0.022669,0.047313,0.04694,0.0,0.044747,0.014561,...,0.014866,0.0,0.009292,0.037695,0.053437,0.0,0.010854,0.0,0.019603,0.009326
min,-5.0,827.0,28.877577,49.0,0.014428,0.173586,0.139829,0.0,0.080489,0.0,...,0.05695,0.0,0.0,0.119162,0.219961,0.0,0.012659,0.0,0.102922,0.0
25%,12.0,2109.0,62.8327,63.0,0.072091,0.262751,0.230127,0.0,0.184656,0.013923,...,0.07532,0.0,0.011729,0.198937,0.307978,0.0,0.028957,0.0,0.128444,0.0
50%,28.0,2641.0,75.676937,68.0,0.083772,0.2857,0.2498,0.0,0.196558,0.023145,...,0.084533,0.0,0.019234,0.220777,0.346934,0.0,0.034138,0.0,0.139901,0.0
75%,47.0,3171.0,88.621065,72.0,0.087803,0.303469,0.2706,0.0,0.212214,0.02949,...,0.092409,0.0,0.024061,0.240291,0.368807,0.0,0.041371,0.0,0.14744,0.0
max,133.0,6399.0,153.145378,88.0,0.118162,0.422589,0.400225,0.0,0.317843,0.091417,...,0.133232,0.0,0.044072,0.354603,0.481334,0.0,0.076293,0.0,0.217183,0.055429


In [42]:
# Define custom transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Selects columns from a Pandas DataFrame using attr"""
    def __init__(self, attr: list):
        self.attr = attr
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attr].values

In [43]:
# Make labels
labels_df = train_df[["FVC", "Patient"]]

# Define input classes
# No-op weeks because time progress is significant?
no_op_attrs = ["Weeks"]
num_attrs = ["Percent", "Age"]
cat_attrs = ["SmokingStatus", "Sex"]
# After the join, the encoded CT columns have integer column labels
encoded_attrs = [x for x in list(train_df) if type(x) == int]

# Define no-operation pipeline
no_op_pipeline = Pipeline([
    ('selector', DataFrameSelector(no_op_attrs)),
])

# Define numerical pipeline
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attrs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Define categorical pipeline
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrs)),
    ('one_hot_encoder', OneHotEncoder()),
])

# Encoded CT pipeline
# Impute missing values but do not do additional scaling
encoded_pipeline = Pipeline([
    ('selector', DataFrameSelector(encoded_attrs)),
    ('imputer', SimpleImputer(strategy="median")),
])

cleaning_pipeline = FeatureUnion(transformer_list=[
    ("no_op_pipeline", no_op_pipeline),
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
    ("encoded_pipeline", encoded_pipeline),
])



In [44]:
# Prepare training data & convert to DataFrame
X, y = cleaning_pipeline.fit_transform(train_df), train_df[['FVC']]
X = pd.DataFrame(X.todense())

# Reshape y
n_samples = len(X)
y = y.values.reshape(n_samples, )

X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,126,127,128,129,130,131,132,133,134,135
count,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,...,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0,1549.0
mean,31.861846,-2.064198e-16,8.532017e-16,0.052937,0.67011,0.276953,0.209813,0.790187,0.078067,0.283956,...,0.085002,0.0,0.017804,0.218741,0.345089,0.0,0.035782,0.0,0.14052,0.002906
std,23.24755,1.000323,1.000323,0.223981,0.470325,0.447638,0.407306,0.407306,0.022387,0.046684,...,0.014668,0.0,0.009171,0.037194,0.052725,0.0,0.010713,0.0,0.019342,0.009215
min,-5.0,-2.462301,-2.578059,0.0,0.0,0.0,0.0,0.0,0.014428,0.173586,...,0.05695,0.0,0.0,0.119162,0.219961,0.0,0.012659,0.0,0.102922,0.0
25%,12.0,-0.7488549,-0.5936839,0.0,0.0,0.0,0.0,1.0,0.073561,0.264107,...,0.075624,0.0,0.012214,0.199152,0.311755,0.0,0.029112,0.0,0.129435,0.0
50%,28.0,-0.100708,0.1150217,0.0,1.0,0.0,0.0,1.0,0.083772,0.2857,...,0.084533,0.0,0.019234,0.220777,0.346934,0.0,0.034138,0.0,0.139901,0.0
75%,47.0,0.5524796,0.6819861,0.0,1.0,1.0,0.0,1.0,0.08749,0.302084,...,0.092364,0.0,0.024052,0.239982,0.368589,0.0,0.041206,0.0,0.147277,0.0
max,133.0,3.80851,2.949844,1.0,1.0,1.0,1.0,1.0,0.118162,0.422589,...,0.133232,0.0,0.044072,0.354603,0.481334,0.0,0.076293,0.0,0.217183,0.055429


In [45]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,126,127,128,129,130,131,132,133,134,135
0,-4.0,-0.979923,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
1,5.0,-1.108174,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
2,7.0,-1.302454,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
3,9.0,-1.19706,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
4,11.0,-1.292296,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935


Looking at Bayesian regressors because they have ways to estimate confidence intervals of predictions directly

In [46]:
# Bayesian Ridge Regression
regr = BayesianRidge()
regr.fit(X.values, y)
y_pred = regr.predict(X.values)

print("Bayesian Ridge Regression Training Metrics")
print(f"R2 Score {r2_score(y, y_pred)} \nMAE {mean_absolute_error(y, y_pred)} ")
print(f"First 5 predictions: {y_pred[:4]}")
print(f"First 5 real FVC: {y[:4]}")

Bayesian Ridge Regression Training Metrics
R2 Score 0.9222945407321945 
MAE 181.54737067116542 
First 5 predictions: [2250.42927079 2158.61811321 2030.57005403 2097.10855768]
First 5 real FVC: [2315 2214 2061 2144]


In [47]:
# Decision Tree Regression
tree_regr = DecisionTreeRegressor()
tree_regr.fit(X.values, y)
y_pred = regr.predict(X.values)

print("Decision Tree Regression Training Metrics")
print(f"R2 Score {r2_score(y, y_pred)} \nMAE {mean_absolute_error(y, y_pred)} ")
print(f"First 5 predictions: {y_pred[:4]}")
print(f"First 5 real FVC: {y[:4]}")

Decision Tree Regression Training Metrics
R2 Score 0.9222945407321945 
MAE 181.54737067116542 
First 5 predictions: [2250.42927079 2158.61811321 2030.57005403 2097.10855768]
First 5 real FVC: [2315 2214 2061 2144]


`Todo: hyperparam matrix solving`

In [48]:
# Test DF was previously loaded
# Process and reshape
test_X, test_y = cleaning_pipeline.fit_transform(test_df), test_df[['FVC']]
test_X = pd.DataFrame(test_X.todense())

n_samples = len(test_X)
test_y = test_y.values.reshape(n_samples, )

# Save 5% of test set for validation
test_X, val_X, test_y, val_y = train_test_split(test_X, test_y, test_size=0.05)

In [49]:
# Calculate test accuracy
regr = BayesianRidge()
regr.fit(test_X.values, test_y)
test_y_pred = regr.predict(test_X)

print("Bayesian Ridge Regression Test Metrics")
print(f"R2 Score {r2_score(test_y, test_y_pred)} \nMAE {mean_absolute_error(test_y, test_y_pred)} ")
print(f"First 5 predictions: {test_y_pred[:4]}")
print(f"First 5 real FVC: {test_y[:4]}")

Bayesian Ridge Regression Test Metrics
R2 Score 1.360819400608726e-06 
MAE 361.7500168209243 
First 5 predictions: [2653.50040087 2653.49732214 2653.50003364 2653.50224334]
First 5 real FVC: [3020 2739 1930 2925]
