In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
# Relative path of the folder that holds the competition CSVs; the cells below
# read train.csv and test.csv from it.
DATA_DIR = "../tmp/osic-pulmonary-fibrosis-progression" # Local

In [3]:
# Read the training table, then show its schema followed by summary statistics.
train_csv_path = f"{DATA_DIR}/train.csv"
train_df = pd.read_csv(train_csv_path)

train_df.info()
train_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1549 entries, 0 to 1548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        1549 non-null   object 
 1   Weeks          1549 non-null   int64  
 2   FVC            1549 non-null   int64  
 3   Percent        1549 non-null   float64
 4   Age            1549 non-null   int64  
 5   Sex            1549 non-null   object 
 6   SmokingStatus  1549 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 84.8+ KB


Unnamed: 0,Weeks,FVC,Percent,Age
count,1549.0,1549.0,1549.0,1549.0
mean,31.861846,2690.479019,77.672654,67.188509
std,23.24755,832.770959,19.823261,7.057395
min,-5.0,827.0,28.877577,49.0
25%,12.0,2109.0,62.8327,63.0
50%,28.0,2641.0,75.676937,68.0
75%,47.0,3171.0,88.621065,72.0
max,133.0,6399.0,153.145378,88.0


### Columns

#### train.csv and test.csv

- Patient - a unique Id for each patient (also the name of the patient's DICOM folder)
- Weeks - the relative number of weeks pre/post the baseline CT (may be negative)
- FVC - the recorded lung capacity in ml
- Percent - a computed field which approximates the patient's FVC as a percent of the typical FVC for a person of similar characteristics
- Age
- Sex
- SmokingStatus

Percent compares the patient's measured FVC against the FVC expected for a person of similar characteristics.

`TODO: Load and join autoencoder encoding to train_df`

In [4]:
# Define custom transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that pulls a fixed set of columns out of a DataFrame.

    Parameters
    ----------
    attr : list
        Column labels to select, in the order they should appear in the output.
    """

    def __init__(self, attr: list):
        self.attr = attr

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``attr`` columns of ``X`` as the frame's ``.values``."""
        selected = X[self.attr]
        return selected.values

In [5]:
# Make labels
labels_df = train_df[["FVC", "Patient"]]

# Define input classes: one list of columns per preprocessing branch below.
# NOTE(review): the column notes describe Percent as computed from the patient's
# FVC, which is also the regression target. Using the same-visit Percent as a
# feature may leak the target; confirm this is intended.
num_attrs = ["Percent", "Age"]
cat_attrs = ["SmokingStatus", "Sex"]
no_op_attrs = ["Weeks"]

# Define no-operation pipeline: passes the selected columns through untouched
no_op_pipeline = Pipeline([
    ('selector', DataFrameSelector(no_op_attrs)),
])

# Define numerical pipeline: median-impute missing values, then standardise
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attrs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Define categorical pipeline.
# handle_unknown="ignore" encodes a category that was not present at fit time
# as an all-zero row instead of raising, so the fitted pipeline can be applied
# to later data that contains unseen SmokingStatus / Sex values.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrs)),
    ('one_hot_encoder', OneHotEncoder(handle_unknown="ignore")),
])

# Concatenate the branches column-wise: numerical, then categorical, then Weeks
cleaning_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
    ("no_op_pipeline", no_op_pipeline),
])



In [6]:
# Prepare training data: fit the preprocessing on the training frame
X_sparse = cleaning_pipeline.fit_transform(train_df)

# Convert to dataframe (densify the pipeline output first)
# Todo: column names
X = pd.DataFrame(X_sparse.toarray())

# Flatten the single-column FVC frame into a 1-D target vector
n_samples = len(X)
y = train_df[['FVC']].values.reshape(n_samples, )

We look at Bayesian regressors because they can directly estimate confidence intervals for their predictions.

In [7]:
# Bayesian Ridge Regression (fit() returns the fitted estimator)
regr = BayesianRidge(tol=1e-6).fit(X.values, y)

# Score the model on the same rows it was fitted on
y_pred = regr.predict(X)
train_r2 = r2_score(y, y_pred)
train_mae = mean_absolute_error(y, y_pred)
print(f"Training Metrics: \nR2 Score {train_r2} \nMAE {train_mae} ")

Training Metrics: 
R2 Score 0.835900989760577 
MAE 262.4514828853769 


`Todo: hyperparam matrix solving`

In [8]:
# Load and prepare test data
test_df = pd.read_csv(DATA_DIR + '/test.csv')

# Process and reshape
test_X, test_y = cleaning_pipeline.fit_transform(test_df), test_df[['FVC']]
test_X = pd.DataFrame(test_X.toarray())

n_samples = len(test_X)
test_y = test_y.values.reshape(n_samples, )

# Save 5% of test set for validation
test_X, val_X, test_y, val_y = train_test_split(test_X, test_y, test_size=0.05)

In [9]:
# Calculate test accuracy
regr = BayesianRidge(tol=1e-6)
regr.fit(test_X.values, test_y)

test_y_pred = regr.predict(test_X)
print(f"Test Metrics: \nR2 Score {r2_score(test_y, test_y_pred)} \nMAE {mean_absolute_error(test_y, test_y_pred)} ")

Test Metrics: 
R2 Score 1.6457573234762712e-05 
MAE 162.4981694581063 
