<a href="https://colab.research.google.com/github/viveksahukar/kaggle_pulmonary_fibrosis/blob/master/pf_first_basic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch as th
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Set path to the data
import os
os.chdir('/content/gdrive/My Drive/colab_work_DS/kaggle_pf/')

In [None]:
meta = pd.read_csv('meta_data.csv')

In [None]:
meta.Patient.value_counts()

ID00078637202199415319443    1018
ID00202637202249376026949     825
ID00173637202238329754031     602
ID00180637202240177410333     577
ID00035637202182204917484     574
                             ... 
ID00242637202264759739921      18
ID00126637202218610655908      17
ID00229637202260254240583      17
ID00248637202266698862378      16
ID00165637202237320314458      12
Name: Patient, Length: 173, dtype: int64

In [None]:
train = pd.read_csv('train.csv')

In [None]:
train.Patient.value_counts()

ID00167637202237397919352    10
ID00400637202305055099402    10
ID00229637202260254240583    10
ID00421637202311550012437    10
ID00388637202301028491611    10
                             ..
ID00344637202287684217717     7
ID00381637202299644114027     7
ID00186637202242472088675     7
ID00267637202270790561585     6
ID00047637202184938901501     6
Name: Patient, Length: 176, dtype: int64

In [None]:
pid = 'ID00007637202177411956430'
train[train.Patient == pid].Patient.value_counts()

ID00007637202177411956430    9
Name: Patient, dtype: int64

In [None]:
meta[meta.Patient == pid].Patient.value_counts()

ID00007637202177411956430    30
Name: Patient, dtype: int64

In [None]:
df_meta = meta.groupby(['Patient']).agg(
    {
     'img_mean': ['mean', 'std'],
     'img_std':['mean', 'std']
    }
)
df_meta.columns = df_meta.columns.map('_'.join)
df_meta = df_meta.reset_index()

In [None]:
df_patient = pd.merge(left=df_meta, right=train, how='left', on='Patient')

In [None]:
df_patient

Unnamed: 0,Patient,img_mean_mean,img_mean_std,img_std_mean,img_std_std,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,97.884207,74.272910,1175.873703,28.845099,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,97.884207,74.272910,1175.873703,28.845099,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,97.884207,74.272910,1175.873703,28.845099,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,97.884207,74.272910,1175.873703,28.845099,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,97.884207,74.272910,1175.873703,28.845099,11,2069,52.063412,79,Male,Ex-smoker
...,...,...,...,...,...,...,...,...,...,...,...
1519,ID00426637202313170790466,535.176502,60.303555,497.187228,15.569531,13,2712,66.594637,73,Male,Never smoked
1520,ID00426637202313170790466,535.176502,60.303555,497.187228,15.569531,19,2978,73.126412,73,Male,Never smoked
1521,ID00426637202313170790466,535.176502,60.303555,497.187228,15.569531,31,2908,71.407524,73,Male,Never smoked
1522,ID00426637202313170790466,535.176502,60.303555,497.187228,15.569531,43,2975,73.052745,73,Male,Never smoked


In [None]:
df_patient.Sex.value_counts()

Male      1206
Female     318
Name: Sex, dtype: int64

In [None]:
df_patient.SmokingStatus.value_counts()

Ex-smoker           1020
Never smoked         422
Currently smokes      82
Name: SmokingStatus, dtype: int64

In [None]:
sex_dummies = pd.get_dummies(df_patient.Sex)
smoking_dummies = pd.get_dummies(df_patient.SmokingStatus)
df = pd.concat([df_patient, sex_dummies, smoking_dummies], axis=1)
df.drop(columns=['Sex', 'SmokingStatus'], inplace=True)

In [None]:
df.head()

Unnamed: 0,Patient,img_mean_mean,img_mean_std,img_std_mean,img_std_std,Weeks,FVC,Percent,Age,Female,Male,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,97.884207,74.27291,1175.873703,28.845099,-4,2315,58.253649,79,0,1,0,1,0
1,ID00007637202177411956430,97.884207,74.27291,1175.873703,28.845099,5,2214,55.712129,79,0,1,0,1,0
2,ID00007637202177411956430,97.884207,74.27291,1175.873703,28.845099,7,2061,51.862104,79,0,1,0,1,0
3,ID00007637202177411956430,97.884207,74.27291,1175.873703,28.845099,9,2144,53.950679,79,0,1,0,1,0
4,ID00007637202177411956430,97.884207,74.27291,1175.873703,28.845099,11,2069,52.063412,79,0,1,0,1,0


In [None]:
df_rf = df.drop(columns=['Patient'])

In [None]:
df_rf

Unnamed: 0,img_mean_mean,img_mean_std,img_std_mean,img_std_std,Weeks,FVC,Percent,Age,Female,Male,Currently smokes,Ex-smoker,Never smoked
0,97.884207,74.272910,1175.873703,28.845099,-4,2315,58.253649,79,0,1,0,1,0
1,97.884207,74.272910,1175.873703,28.845099,5,2214,55.712129,79,0,1,0,1,0
2,97.884207,74.272910,1175.873703,28.845099,7,2061,51.862104,79,0,1,0,1,0
3,97.884207,74.272910,1175.873703,28.845099,9,2144,53.950679,79,0,1,0,1,0
4,97.884207,74.272910,1175.873703,28.845099,11,2069,52.063412,79,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,535.176502,60.303555,497.187228,15.569531,13,2712,66.594637,73,0,1,0,0,1
1520,535.176502,60.303555,497.187228,15.569531,19,2978,73.126412,73,0,1,0,0,1
1521,535.176502,60.303555,497.187228,15.569531,31,2908,71.407524,73,0,1,0,0,1
1522,535.176502,60.303555,497.187228,15.569531,43,2975,73.052745,73,0,1,0,0,1


In [None]:
X = df_rf.drop(columns=['FVC'])
y = df_rf['FVC']

In [None]:
X.head()

Unnamed: 0,img_mean_mean,img_mean_std,img_std_mean,img_std_std,Weeks,Percent,Age,Female,Male,Currently smokes,Ex-smoker,Never smoked
0,97.884207,74.27291,1175.873703,28.845099,-4,58.253649,79,0,1,0,1,0
1,97.884207,74.27291,1175.873703,28.845099,5,55.712129,79,0,1,0,1,0
2,97.884207,74.27291,1175.873703,28.845099,7,51.862104,79,0,1,0,1,0
3,97.884207,74.27291,1175.873703,28.845099,9,53.950679,79,0,1,0,1,0
4,97.884207,74.27291,1175.873703,28.845099,11,52.063412,79,0,1,0,1,0


In [None]:
y.head()

0    2315
1    2214
2    2061
3    2144
4    2069
Name: FVC, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=10, random_state=0)
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [None]:
y_predict = regr.predict(X_test)

In [None]:
y_predict.shape

(305,)

In [None]:
regr.score(X_test, y_test)

0.9769096544292526

In [None]:
predictions = regr.predict_proba(X_test)

AttributeError: ignored