In [24]:
import numpy as np
import pandas as pd
import os
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer

### Columns

#### train.csv and test.csv

- Patient- a unique Id for each patient (also the name of the patient's DICOM folder)
- Weeks- the relative number of weeks pre/post the baseline CT (may be negative)
- FVC - the recorded lung capacity in ml
- Percent- a computed field which approximates the patient's FVC as a percent of the typical FVC for a person of similar characteristics
- Age
- Sex
- SmokingStatus

Percent is the comparison of the patient's measured FVC against expected

In [25]:
DATA_DIR = "../tmp/osic-pulmonary-fibrosis-progression" # Local

In [26]:
# Load test and training data
train_df = pd.read_csv(DATA_DIR + '/train.csv') # Use first
test_df = pd.read_csv(DATA_DIR + '/test.csv') # Save for later

# Load exported autoencoder encoding from local pickle
PICKLE_PATH = "patient_ids_to_encodings_dict.pkl"
with open(PICKLE_PATH, "rb") as f:
    patient_ct_encodings = pickle.load(f)

# Convert to DF with patient ID as rows, encoding as 128 columns
patient_ct_encodings = pd.DataFrame.from_dict(patient_ct_encodings, orient="index")

# Left join encoding to training & test data
train_df = train_df.join(patient_ct_encodings, on="Patient", how="left")
test_df = test_df.join(patient_ct_encodings, on="Patient", how="left")

train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,0,1,2,...,118,119,120,121,122,123,124,125,126,127
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,0.036863,0.250262,0.214101,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935


In [27]:
test_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,0,1,2,...,118,119,120,121,122,123,124,125,126,127
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker,0.077481,0.273456,0.218755,...,0.060672,0.0,0.006664,0.226692,0.298766,0.0,0.0281,0.0,0.130775,0.0
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker,0.088297,0.298811,0.270244,...,0.090454,0.0,0.019464,0.22045,0.358317,0.0,0.034138,0.0,0.138857,0.0
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker,0.080014,0.270827,0.238446,...,0.062583,0.0,0.022374,0.207912,0.305052,0.0,0.034256,0.0,0.115796,0.0
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker,0.086455,0.2857,0.254137,...,0.089488,0.0,0.024379,0.230229,0.359478,0.0,0.031867,0.0,0.138221,0.0
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked,0.084378,0.284071,0.253392,...,0.088711,0.0,0.026607,0.225449,0.356416,0.0,0.03175,0.0,0.145091,0.0


In [52]:
# Define custom transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Selects columns from a Pandas DataFrame using attr"""
    def __init__(self, attr: list):
        self.attr = attr
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attr]

In [53]:
# Make labels
labels_df = train_df[["FVC", "Patient"]]

# Define input classes
# No-op weeks because time progress is significant?
no_op_attrs = ["Weeks"]
num_attrs = ["Percent", "Age"]
cat_attrs = ["SmokingStatus", "Sex"]
# After the join, the encoded CT columns have integer column labels
encoded_attrs = list(range(0, 128))

# Define no-operation pipeline
no_op_pipeline = Pipeline([
    ('selector', DataFrameSelector(no_op_attrs)),
])

# Define numerical pipeline
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attrs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Define categorical pipeline
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrs)),
    ('one_hot_encoder', OneHotEncoder()),
])

# Define encoded CT pipeline
encoded_pipeline = Pipeline([
    ('selector', DataFrameSelector(encoded_attrs)),
    ('imputer', SimpleImputer(strategy="median")),
])

cleaning_pipeline = FeatureUnion(transformer_list=[
    ("no_op_pipeline", no_op_pipeline),
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
    ("encoded_pipeline", encoded_pipeline),
])

#preprocessor = ColumnTransformer(
    #transformers=[
        #('num', numeric_transformer, numeric_features),
        #('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
#clf = Pipeline(steps=[('preprocessor', preprocessor),
                      #('classifier', LogisticRegression())])


In [54]:
# Prepare training data & convert to DataFrame
X, y = cleaning_pipeline.fit_transform(train_df), train_df[['FVC']]
X = pd.DataFrame(X.todense())

# Reshape y
n_samples = len(X)
y = y.values.reshape(n_samples, )

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,126,127,128,129,130,131,132,133,134,135
0,-4.0,-0.979923,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
1,5.0,-1.108174,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
2,7.0,-1.302454,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
3,9.0,-1.19706,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935
4,11.0,-1.292296,1.674174,0.0,1.0,0.0,0.0,1.0,0.036863,0.250262,...,0.103108,0.0,0.003177,0.183595,0.307978,0.0,0.050988,0.0,0.141652,0.010935


Looking at Bayesian regressors because they have ways to estimate confidence intervals of predictions directly

In [56]:
# Bayesian Ridge Regression
br_regr = BayesianRidge()
br_regr.fit(X.values, y)
y_pred = br_regr.predict(X.values)

print("Bayesian Ridge Regression Training Metrics")
print(f"R2 Score {r2_score(y, y_pred)} \nMAE {mean_absolute_error(y, y_pred)} ")
print(f"First 5 predictions: {y_pred[:4]}")
print(f"First 5 real FVC: {y[:4]}")

Bayesian Ridge Regression Training Metrics
R2 Score 0.9222945407321945 
MAE 181.54737067116542 
First 5 predictions: [2250.42927079 2158.61811321 2030.57005403 2097.10855768]
First 5 real FVC: [2315 2214 2061 2144]


In [57]:
# Decision Tree Regression
tree_regr = DecisionTreeRegressor()
tree_regr.fit(X.values, y)
y_pred = tree_regr.predict(X.values)

print("Decision Tree Regression Training Metrics")
print(f"R2 Score {r2_score(y, y_pred)} \nMAE {mean_absolute_error(y, y_pred)} ")
print(f"First 5 predictions: {y_pred[:4]}")
print(f"First 5 real FVC: {y[:4]}")

Decision Tree Regression Training Metrics
R2 Score 1.0 
MAE 0.0 
First 5 predictions: [2315. 2214. 2061. 2144.]
First 5 real FVC: [2315 2214 2061 2144]


`Todo: hyperparam matrix solving`

In [58]:
test_df.head()
# No change from before

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,0,1,2,...,118,119,120,121,122,123,124,125,126,127
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker,0.077481,0.273456,0.218755,...,0.060672,0.0,0.006664,0.226692,0.298766,0.0,0.0281,0.0,0.130775,0.0
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker,0.088297,0.298811,0.270244,...,0.090454,0.0,0.019464,0.22045,0.358317,0.0,0.034138,0.0,0.138857,0.0
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker,0.080014,0.270827,0.238446,...,0.062583,0.0,0.022374,0.207912,0.305052,0.0,0.034256,0.0,0.115796,0.0
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker,0.086455,0.2857,0.254137,...,0.089488,0.0,0.024379,0.230229,0.359478,0.0,0.031867,0.0,0.138221,0.0
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked,0.084378,0.284071,0.253392,...,0.088711,0.0,0.026607,0.225449,0.356416,0.0,0.03175,0.0,0.145091,0.0


In [59]:
# Prepare test data & convert to DataFrame
X, y = cleaning_pipeline.fit_transform(test_df), test_df[['FVC']]
X = pd.DataFrame(X.todense())

# Reshape y
n_samples = len(X)
y = y.values.reshape(n_samples, )

X.head()
# Except after using the same pipeline, 134 cols instead of 136?

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,124,125,126,127,128,129,130,131,132,133
0,6.0,-1.306936,0.618853,1.0,0.0,1.0,0.077481,0.273456,0.218755,0.0,...,0.060672,0.0,0.006664,0.226692,0.298766,0.0,0.0281,0.0,0.130775,0.0
1,15.0,1.360182,-1.9597,1.0,0.0,1.0,0.088297,0.298811,0.270244,0.0,...,0.090454,0.0,0.019464,0.22045,0.358317,0.0,0.034138,0.0,0.138857,0.0
2,6.0,0.151769,0.618853,1.0,0.0,1.0,0.080014,0.270827,0.238446,0.0,...,0.062583,0.0,0.022374,0.207912,0.305052,0.0,0.034256,0.0,0.115796,0.0
3,17.0,0.733487,0.103142,1.0,0.0,1.0,0.086455,0.2857,0.254137,0.0,...,0.089488,0.0,0.024379,0.230229,0.359478,0.0,0.031867,0.0,0.138221,0.0
4,0.0,-0.938503,0.618853,0.0,1.0,1.0,0.084378,0.284071,0.253392,0.0,...,0.088711,0.0,0.026607,0.225449,0.356416,0.0,0.03175,0.0,0.145091,0.0


In [61]:
# Calculate test accuracy of trained regressors
br_test_y_pred = br_regr.predict(test_X.values)
tree_test_y_pred = tree_regr.predict(test_X.values)

print("Bayesian Ridge Regression Test Metrics")
print(f"R2 Score {r2_score(test_y, br_test_y_pred)} \nMAE {mean_absolute_error(test_y, br_test_y_pred)} ")
print(f"First 5 predictions: {br_test_y_pred[:4]}")
print(f"First 5 real FVC: {test_y[:4]}")

print("Decision Tree Regression Training Metrics")
print(f"R2 Score {r2_score(y, tree_test_y_pred)} \nMAE {mean_absolute_error(y, tree_test_y_pred)} ")
print(f"First 5 predictions: {tree_test_y_pred[:4]}")
print(f"First 5 real FVC: {y[:4]}")

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 136 is different from 134)

**BUG**
When using the same pipeline on both training and test data, the processed output has different number of columns:
  - train: 136
  - test: 134
        
Because of this, predictors fitted to the training data fail on the test set - why?