https://www.kaggle.com/zhengyuandonshen/kernelfacb0742df

In [None]:
import pydicom
import os
from os import listdir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import scipy as sp
from functools import partial
from tqdm.notebook import tqdm

%matplotlib inline

In [None]:
# Root directory of the competition dataset. The original literal had a
# stray trailing "t" ("...progressiont/") and did not match the CSV paths;
# the CSV paths are now derived from this single root so they cannot drift.
im_path = "../input/osic-pulmonary-fibrosis-progression/"
train_df = pd.read_csv(im_path + 'train.csv')
test_df = pd.read_csv(im_path + 'test.csv')
print('Training data shape: ', train_df.shape)
train_df.head()

In [None]:
# Construct the training table.
# For every patient, each visit is used in turn as the "base" visit (its
# Weeks/FVC/Percent/Age become base_* columns) and is paired with every visit
# of the same patient (the "predict" visit, which keeps FVC as the target).
train_df['Patient_Week'] = train_df['Patient'].astype(str) + '_' + train_df['Weeks'].astype(str)

# Loop-invariant column maps, hoisted out of the nested loops.
rename_cols = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Percent': 'base_Percent', 'Age': 'base_Age'}
drop_cols = ['Age', 'Sex', 'SmokingStatus', 'Percent']

# Collect the per-(patient, base week) frames and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every
# accumulated row on each iteration.
pair_frames = []
gb = train_df.groupby('Patient')
tk0 = tqdm(gb, total=len(gb))
for _, usr_df in tk0:
    # Target side of the pairing; identical for every base week of this patient.
    targets = usr_df.drop(columns=drop_cols).rename(columns={'Weeks': 'predict_Week'})
    for week, tmp in usr_df.groupby('Weeks'):
        tmp = tmp.drop(columns='Patient_Week').rename(columns=rename_cols)
        _usr_output = targets.merge(tmp, on='Patient')
        _usr_output['Week_passed'] = _usr_output['predict_Week'] - _usr_output['base_Week']
        pair_frames.append(_usr_output)
output = pd.concat(pair_frames)

# Drop the pairs where the predicted week equals the base week.
train_df = output[output['Week_passed']!=0].reset_index(drop=True)
print(train_df.shape)
train_df.head()

In [None]:
# One-hot encode the two categorical columns, then give the generated dummy
# columns shorter names.
dummy_names = {
    "Sex_Female": "Female",
    "Sex_Male": "Male",
    "SmokingStatus_Currently smokes": "CurrentlySmokes",
    "SmokingStatus_Ex-smoker": "ExSmoker",
    "SmokingStatus_Never smoked": "NeverSmoked",
}
train_df = pd.get_dummies(train_df, columns=['Sex', 'SmokingStatus'])
train_df = train_df.rename(columns=dummy_names)
train_df.head()

In [None]:
# Target: the FVC value of the "predict" visit.
y = train_df['FVC']
# Features: everything except the identifiers, the raw week columns and the target.
non_feature_cols = ['Patient', 'FVC', 'base_Week', 'predict_Week', 'Patient_Week']
X = train_df.drop(columns=non_feature_cols)

In [None]:
# Split the data into training and testing parts.
from sklearn import model_selection

# Hold out the last 20% of the rows for testing (no shuffling).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, shuffle=False
)

n_train_rows, n_train_cols = X_train.shape[0], X_train.shape[1]
n_test_rows, n_test_cols = X_test.shape[0], X_test.shape[1]
print(f'training data has {n_train_rows} observation with {n_train_cols} features')
print(f'test data has {n_test_rows} observation with {n_test_cols} features')

In [None]:
# Two common rescalings:
#   standardization: (x - mean) / std
#   normalization:   (x - x_min) / (x_max - x_min)  -> [0, 1]
# Normalization is used here. The scaler is fitted on the training split only
# and then applied to both splits.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor

# Baseline regressor with library-default hyper-parameters.
regr_XGB = xgb.XGBRegressor()

In [None]:
# Fit the default-parameter baseline on the scaled training split.
regr_XGB.fit(X_train, y_train)

In [None]:
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

In [None]:
# Regressor with an explicit, fully spelled-out hyper-parameter set.
# NOTE(review): the argument list looks like it was pasted from the repr of a
# previously fitted estimator - confirm where these values came from.
# NOTE(review): both `eta=0.01` and `learning_rate=0.300000012` are passed;
# the XGBoost parameter docs list `eta` as an alias of `learning_rate`, so
# which value takes effect should be verified against the installed version.
# NOTE(review): `missing=None`, `gpu_id=-1` and `verbosity=None` may be
# rejected or deprecated by newer xgboost releases - TODO confirm.
regr_XGB_opt = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8999999999999999, eta=0.01,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
             monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.7999999999999999,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# Train the tuned model on the training split and predict the held-out split.
fitted_opt = regr_XGB_opt.fit(X_train, y_train)
y_pred = fitted_opt.predict(X_test)

In [None]:
# Scatter of predicted vs. true FVC on the held-out split, with the y = x
# diagonal drawn as a reference for a perfect prediction.
fvc_lo, fvc_hi = min(y_test), max(y_test)
plt.figure(figsize=(5,5))
plt.scatter(y_test, y_pred,color = 'r', alpha = 0.3)
plt.plot([fvc_lo, fvc_hi], [fvc_lo, fvc_hi], color = 'k')
# Raw strings: "\m" is not a valid escape sequence in a normal string literal
# and triggers a warning on recent Python versions. The label text is unchanged.
plt.xlabel(r'FVC$_{\mathrm{test}}$')
plt.ylabel(r'FVC$_{\mathrm{pred}}$')
plt.rcParams.update({'font.size': 22})

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error on the held-out split.
# The square root must be taken exactly once: the previous code applied
# np.sqrt and then raised the result to 0.5 again, so it printed the fourth
# root of the MSE under the label "RMSE".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE: %f" % rmse)

In [None]:
# 10-fold cross-validation with the native xgboost API on the full
# (unscaled) feature table, stopping early when the RMSE has not improved
# for 10 rounds.
params = {
    "objective": "reg:squarederror",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}
data_dmatrix = xgb.DMatrix(data=X, label=y)
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=200,
    nfold=10,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=123,
)
# Mean test RMSE of the last boosting round that was kept.
last_round_rmse = cv_results["test-rmse-mean"].tail(1)
print(last_round_rmse)

In [None]:
# Rank the features by the importance reported by the baseline model
# (regr_XGB), most important first.
importances = regr_XGB.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature importance ranking by XGBoost Model:")
for ind in indices:
    feature_name = X.columns[ind]
    print("%s : %.4f" % (feature_name, importances[ind]))

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations between the feature columns.
fig, ax = plt.subplots(figsize=(15, 13))
feature_corr = X.corr()
sns.heatmap(feature_corr, annot=True, fmt='.2f', ax=ax)

In [None]:
# Rebuild train_df / test_df as evaluation tables: the scaled features of each
# split plus the true FVC and the tuned model's prediction.
# NOTE: this overwrites the train_df / test_df loaded earlier.
FVC_pred_train = regr_XGB_opt.predict(X_train)
train_df = pd.DataFrame(X_train, columns = X.columns)
# Assign positionally (as the test side does). The new frame has a fresh
# 0..n-1 index, so assigning the Series directly would align on y_train's
# original index and would only be correct while the split is unshuffled.
train_df['FVC'] = np.asarray(y_train)
train_df['FVC_pred'] = FVC_pred_train

FVC_pred_test = regr_XGB_opt.predict(X_test)
test_df = pd.DataFrame(X_test, columns = X.columns)
test_df['FVC'] = np.asarray(y_test)
test_df['FVC_pred'] = FVC_pred_test

In [None]:
# baseline score
train_df['Confidence'] = 100
train_df['sigma_clipped'] = train_df['Confidence'].apply(lambda x: max(x, 70))
train_df['diff'] = abs(train_df['FVC'] - train_df['FVC_pred'])
train_df['delta'] = train_df['diff'].apply(lambda x: min(x, 1000))
train_df['score'] = -2**0.5*train_df['delta']/train_df['sigma_clipped'] - np.log(2**0.5*train_df['sigma_clipped'])
score = train_df['score'].mean()
print(score)

In [None]:
def loss_func(weight, row):
    """Return the negated per-row score for a candidate confidence.

    weight: candidate confidence value (floored at 70 before use).
    row: mapping with 'FVC' (true value) and 'FVC_pred' (prediction).

    The score is -sqrt(2) * delta / sigma - ln(sqrt(2) * sigma), with delta
    the absolute error capped at 1000. It is negated so that a minimiser
    maximises the score.
    """
    root_two = 2**0.5
    sigma = max(weight, 70)
    capped_error = min(abs(row['FVC'] - row['FVC_pred']), 1000)
    metric = -root_two * capped_error / sigma - np.log(root_two * sigma)
    return -metric

# For every held-out row, search for the confidence that minimises loss_func
# for that row. The search starts at 100 and is unconstrained (a bounded
# variant with bounds (70, 100) was tried and left disabled).
results = []
tk0 = tqdm(test_df.iterrows(), total=len(test_df))
for _, row in tk0:
    weight = [100]
    loss_partial = partial(loss_func, row=row)
    result = sp.optimize.minimize(loss_partial, weight, method='SLSQP')
    x = result.x
    results.append(x[0])

In [None]:
# Score the held-out split using the per-row optimised confidences.
# Same formula as the baseline: sigma is the confidence floored at 70 and
# delta is the absolute error capped at 1000.
root_two = 2**0.5
test_df['Confidence'] = results
test_df['sigma_clipped'] = test_df['Confidence'].clip(lower=70)
test_df['diff'] = (test_df['FVC'] - test_df['FVC_pred']).abs()
test_df['delta'] = test_df['diff'].clip(upper=1000)
test_df['score'] = (
    -root_two * test_df['delta'] / test_df['sigma_clipped']
    - np.log(root_two * test_df['sigma_clipped'])
)
score = test_df['score'].mean()
print(score)

In [None]:

# Build the feature table for the submission rows: every Patient_Week from the
# sample submission is joined with that patient's single test.csv record,
# whose measurements act as the "base" visit.
base_cols = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Percent': 'base_Percent', 'Age': 'base_Age'}
test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv').rename(columns=base_cols)
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

# Patient_Week has the form "<patient id>_<week>".
patient_week_parts = submission['Patient_Week'].str.split('_')
submission['Patient'] = patient_week_parts.str[0]
submission['predict_Week'] = patient_week_parts.str[1].astype(int)

submission_keys = submission.drop(columns=['FVC', 'Confidence'])
test = submission_keys.merge(test, on='Patient')
test['Week_passed'] = test['predict_Week'] - test['base_Week']
print(test.shape)
test.head()

In [None]:
# One-hot encode Sex and SmokingStatus on the submission feature table and
# shorten the generated column names, using the same names as the training table.
dummy_names = {
    "Sex_Female": "Female",
    "Sex_Male": "Male",
    "SmokingStatus_Currently smokes": "CurrentlySmokes",
    "SmokingStatus_Ex-smoker": "ExSmoker",
    "SmokingStatus_Never smoked": "NeverSmoked",
}
test = pd.get_dummies(test, columns=['Sex', 'SmokingStatus'])
test = test.rename(columns=dummy_names)

In [None]:
# Reload the untouched sample submission and display it for inspection.
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
submission

In [None]:
# Reload the sample submission and split its "<patient id>_<week>" key into a
# patient id column and an integer week column.
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
key_parts = submission['Patient_Week'].str.split('_')
submission['Patient'] = key_parts.str[0]
submission['predict_Week'] = key_parts.str[1].astype(int)
submission.head()

In [None]:
# Attach the engineered test features to every submission row.
sub = submission.drop(columns=['Patient','predict_Week','FVC', 'Confidence']).merge(test, on='Patient_Week')

# No CSV is written at this point: the frame still holds raw feature columns
# rather than the submission format, and 'submission.csv' is written once the
# predictions have been attached.

# get_dummies only creates a column for categories that occur in the data, so
# some one-hot columns can be absent from the test table. Add the absent ones
# as all-zero columns, and leave existing columns untouched so that real
# Female / CurrentlySmokes indicators are not overwritten with 0.
for dummy_col in ['Female', 'Male', 'CurrentlySmokes', 'ExSmoker', 'NeverSmoked']:
    if dummy_col not in sub.columns:
        sub[dummy_col] = 0

# Fix the column order: the four key columns first, then the model features.
sub = sub[['Patient_Week', 'Patient', 'predict_Week', 'base_Week', 'base_FVC', 'base_Percent', 'base_Age', 
           'Week_passed', 'Female', 'Male', 'CurrentlySmokes', 'ExSmoker', 'NeverSmoked']]

In [None]:
# Predict FVC for the submission rows.
# The features must be scaled with the scaler that was fitted on the training
# split, because the model was trained on that scaling. Fitting a fresh
# MinMaxScaler on the submission rows maps them with different min/max values
# and feeds the model inputs on a different scale. It would also overwrite the
# training `scaler`.
# Selecting by X.columns gives the model the same feature set and order it
# was trained on.
X_test_sub = scaler.transform(sub[X.columns])
FVC_pred_sub = regr_XGB_opt.predict(X_test_sub)
sub['FVC_pred'] = FVC_pred_sub
sub

In [None]:
# Draw one predicted-FVC-over-weeks curve per patient, in order of first
# appearance in `sub`.
for pid, temp in sub.groupby('Patient', sort=False):
    plt.plot(temp['predict_Week'], temp['FVC_pred'])

In [None]:
# Final submission: one row per Patient_Week holding the predicted FVC and a
# constant confidence of 350.
merged_sub = submission.merge(sub, on='Patient_Week')
attempt1 = merged_sub[['Patient_Week', 'FVC_pred', 'Confidence']].rename(columns={'FVC_pred': 'FVC'})
attempt1['Confidence'] = 350
attempt1.to_csv('submission.csv', index=False)