In [1]:
import tensorflow as tf
import pandas as pd 
import tensorflow_io as tfio 
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import os
from sklearn.cluster import KMeans

In [2]:
# Root directory that is scanned recursively for data files.
main_dir = "/home/tkrsh/osic-main/"

# Every file under main_dir whose full path does not contain ".csv".
files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(main_dir)
    for name in names
    if ".csv" not in os.path.join(folder, name)
]

# Split by substring match on the full path ("train" / "test").
train_images = [str(path) for path in files if "train" in path]
test_images = [str(path) for path in files if "test" in path]

In [3]:
def decode_image(image_path):
    """Read the DICOM file at `image_path` and return its pixels as a NumPy array.

    The file contents are decoded with tensorflow-io as uint16, converted to
    NumPy, and all size-1 axes are squeezed out.
    """
    raw = tf.io.read_file(image_path)
    decoded = tfio.image.decode_dicom_image(raw, dtype=tf.uint16)
    return np.squeeze(decoded.numpy())

def show_scan(image):
    """Display one DICOM slice in greyscale, titled with its patient folder name.

    Parameters
    ----------
    image : str
        Path to a DICOM file. The patient name is taken from the file's parent
        directory (assumes a ``.../<patient>/<slice>`` layout -- TODO confirm
        against the dataset on disk).
    """
    img = decode_image(image)
    # Path component [1] of an absolute path such as "/home/..." is just
    # "home", so use the name of the directory that contains the file instead.
    patient_name = os.path.basename(os.path.dirname(str(image)))
    fig, ax = plt.subplots()
    ax.imshow(img, cmap='Greys')
    plt.axis('off')
    plt.title("Baseline CT Scan of Patient {}".format(patient_name))
    fig.set_size_inches(9, 9)
    plt.show()

In [4]:
# Load the training table; `df` is a separate working copy of it.
train_csv_path = main_dir + "train.csv"
train = pd.read_csv(train_csv_path)
df = train.copy()

In [5]:
# Preview the first rows of the working copy of the training table.
df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [6]:
# Rebind df to a fresh copy of the loaded training table; `train` itself is left unmodified.
df=train.copy()

In [7]:
# One-hot encode Sex: append the indicator columns, then remove the original column.
sex_dummies = pd.get_dummies(df["Sex"])
df = pd.concat([df, sex_dummies], axis=1).drop(columns=["Sex"])


In [8]:
# One-hot encode SmokingStatus: append the indicator columns, then remove the original column.
smoking_dummies = pd.get_dummies(df["SmokingStatus"])
df = pd.concat([df, smoking_dummies], axis=1).drop(columns=["SmokingStatus"])


In [9]:
# Preview df after Sex and SmokingStatus were replaced by indicator columns.
df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Female,Male,Currently smokes,Ex-smoker,Never smoked
0,ID00007637202177411956430,-4,2315,58.253649,79,0,1,0,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,0,1,0,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,0,1,0,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,0,1,0,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,0,1,0,1,0


In [10]:
# Change in FVC since the previous row of the same patient. Differencing
# within each Patient group keeps a patient's first visit from being compared
# with the last visit of the patient listed before them; that first visit is
# NaN at this point.
df['dFVC'] = df.groupby("Patient")["FVC"].diff()

In [11]:
# Change in Percent since the previous row of the same patient. Differencing
# within each Patient group keeps a patient's first visit from being compared
# with the last visit of the patient listed before them; that first visit is
# NaN at this point.
df['d%'] = df.groupby("Patient")["Percent"].diff()

In [12]:
# Weeks elapsed since the previous row of the same patient. Differencing
# within each Patient group keeps a patient's first visit from being compared
# with the last visit of the patient listed before them; that first visit is
# NaN at this point.
df['Gap'] = df.groupby("Patient")["Weeks"].diff()

In [13]:
# Replace every missing value in df with 0.
df = df.fillna(value=0)

In [14]:
# Preview df with the dFVC, d% and Gap columns added and missing values filled with 0.
df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Female,Male,Currently smokes,Ex-smoker,Never smoked,dFVC,d%,Gap
0,ID00007637202177411956430,-4,2315,58.253649,79,0,1,0,1,0,0.0,0.0,0.0
1,ID00007637202177411956430,5,2214,55.712129,79,0,1,0,1,0,-101.0,-2.54152,9.0
2,ID00007637202177411956430,7,2061,51.862104,79,0,1,0,1,0,-153.0,-3.850025,2.0
3,ID00007637202177411956430,9,2144,53.950679,79,0,1,0,1,0,83.0,2.088576,2.0
4,ID00007637202177411956430,11,2069,52.063412,79,0,1,0,1,0,-75.0,-1.887267,2.0


In [15]:
# Subset of df containing only the rows of one patient.
single_patient_mask = df["Patient"].eq("ID00007637202177411956430")
df_1 = df.loc[single_patient_mask]

In [16]:
# Cluster the Age column into three groups. Ages are reshaped to a single-feature
# 2-D array as required by fit(). random_state pins the centroid initialisation
# so the cluster labels do not change from one run to the next.
Means = KMeans(n_clusters=3, random_state=0).fit(df["Age"].values.reshape(-1, 1))

In [17]:
# Attach each row's KMeans cluster label as the Age_Cat column.
df = df.assign(Age_Cat=Means.labels_)

In [18]:
# One-hot encode Age_Cat into Age_Cat_<label> columns, then remove the label column.
age_cat_dummies = pd.get_dummies(df["Age_Cat"], prefix="Age_Cat")
df = pd.concat([df, age_cat_dummies], axis=1).drop(columns=["Age_Cat"])


In [19]:
# Preview df with the Age_Cat_* indicator columns added.
df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Female,Male,Currently smokes,Ex-smoker,Never smoked,dFVC,d%,Gap,Age_Cat_0,Age_Cat_1,Age_Cat_2
0,ID00007637202177411956430,-4,2315,58.253649,79,0,1,0,1,0,0.0,0.0,0.0,0,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,0,1,0,1,0,-101.0,-2.54152,9.0,0,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,0,1,0,1,0,-153.0,-3.850025,2.0,0,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,0,1,0,1,0,83.0,2.088576,2.0,0,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,0,1,0,1,0,-75.0,-1.887267,2.0,0,1,0


In [20]:
# Copy of df without the Patient identifier column.
# NOTE(review): this copy is taken before the later cell casts Gap to int, so
# train_df does not receive that cast -- confirm the intended cell order.
train_df = df.drop(columns=["Patient"])

In [21]:
# Store Gap as integers (fractional parts are truncated toward zero).
df["Gap"] = df["Gap"].astype("int64")

In [22]:
# Preview df after Gap was converted to integers.
df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Female,Male,Currently smokes,Ex-smoker,Never smoked,dFVC,d%,Gap,Age_Cat_0,Age_Cat_1,Age_Cat_2
0,ID00007637202177411956430,-4,2315,58.253649,79,0,1,0,1,0,0.0,0.0,0,0,1,0
1,ID00007637202177411956430,5,2214,55.712129,79,0,1,0,1,0,-101.0,-2.54152,9,0,1,0
2,ID00007637202177411956430,7,2061,51.862104,79,0,1,0,1,0,-153.0,-3.850025,2,0,1,0
3,ID00007637202177411956430,9,2144,53.950679,79,0,1,0,1,0,83.0,2.088576,2,0,1,0
4,ID00007637202177411956430,11,2069,52.063412,79,0,1,0,1,0,-75.0,-1.887267,2,0,1,0


In [23]:
# Load the sample submission table from the data directory.
submission_csv_path = main_dir + "sample_submission.csv"
sub = pd.read_csv(submission_csv_path)

In [24]:
# Preview the first rows of the sample submission table.
sub.head()

Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,2000,100
1,ID00421637202311550012437_-12,2000,100
2,ID00422637202311677017371_-12,2000,100
3,ID00423637202312137826377_-12,2000,100
4,ID00426637202313170790466_-12,2000,100


In [27]:
# Scratch cell removed: it only evaluated the undefined name `a`, which raised
# NameError and stopped a top-to-bottom run of the notebook.

In [25]:
from sklearn.model_selection import KFold
import tensorflow as tf

In [482]:
# PARAMETERS
# Number of cross-validation folds; passed to KFold(n_splits=...) further down.
n_folds=5

In [473]:
def score(y_true, y_pred, sigma_floor=70.0, delta_cap=1000.0):
    """Mean Laplace-style loss for three-column predictions.

    Parameters
    ----------
    y_true : tensor
        Column 0 is read as the true value.
    y_pred : tensor
        Column 1 is used as the point prediction; the spread ``sigma`` is
        column 2 minus column 0.
    sigma_floor : float, optional
        Lower bound applied to ``sigma``. Replaces the global ``C1``, which is
        not defined anywhere in the notebook.
    delta_cap : float, optional
        Upper bound applied to the absolute error. Replaces the global ``C2``,
        which is not defined anywhere in the notebook.
        NOTE(review): the defaults 70 and 1000 are the clipping values of the
        OSIC competition metric -- confirm they match the intended C1 / C2.

    Returns
    -------
    tensor
        Scalar mean of ``sqrt(2) * delta / sigma_clip + log(sqrt(2) * sigma_clip)``.
    """
    # tf.cast returns a new tensor, so the result has to be kept; the casts
    # previously had no effect because their results were discarded.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)

    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, sigma_floor)
    delta = tf.minimum(tf.abs(y_true[:, 0] - fvc_pred), delta_cap)

    sq2 = tf.sqrt(tf.constant(2.0, dtype=tf.float32))
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    # tf.reduce_mean replaces K.mean: no Keras backend alias `K` is imported.
    return tf.reduce_mean(metric)

In [483]:
# Splitter that yields n_folds pairs of train/test row-index arrays.
# NOTE(review): KFold splits by row position and does not shuffle by default,
# while df holds several rows per patient, so one patient's visits can land on
# both sides of a split -- confirm whether a patient-grouped splitter is intended.
fold=KFold(n_splits=n_folds)

In [488]:
# TODO: per-fold training/evaluation has not been written yet.
for train_idx, test_idx in fold.split(df):
    # The index arrays are named *_idx so the loop does not overwrite the
    # `train` DataFrame loaded earlier. `pass` keeps the cell syntactically
    # valid until the loop body is implemented.
    pass