In [84]:
import logging
from luigi import Task
from luigi.parameter import BoolParameter, IntParameter
from luigi.task import ExternalTask
from luigi.target import Target
import luigi
from csci_utils.luigi.dask.target import CSVTarget
from csci_utils.luigi.dask.target import ParquetTarget
from csci_utils.luigi.task import Requirement
from csci_utils.luigi.task import Requires
from csci_utils.luigi.task import TargetOutput
from luigi.contrib.s3 import S3Target
import pandas as pd
import pandas as pd
import numpy as np
import pathlib
from dask import dataframe as dd
from sklearn.metrics.pairwise import cosine_similarity, nan_euclidean_distances
from sklearn.preprocessing import LabelEncoder, normalize
import dask.array as da
import glob
import matplotlib;

In [86]:
%matplotlib inline

DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [2]:
def encode_objects_general(ddf, object_cols):
    LE = LabelEncoder()
    for object_col in object_cols:
        ddf[object_col] = da.from_array(
            LE.fit_transform(ddf[object_col].astype(str)))
    return ddf

def normalize_general(ddf,columns):
    result = ddf.copy()
    for feature_name in columns:
        max_value = ddf[feature_name].max()
        min_value = ddf[feature_name].min()
        result[feature_name] = 2*(ddf[feature_name] - min_value) / (max_value - min_value) - 1
    return result

def normalize_chex(ddf, object_cols):
    ddf = normalize_general(ddf,object_cols)
    ddf =  normalize_general(ddf,['Age'])
    return ddf

In [3]:
import os

In [4]:
import pandas as pd

In [5]:
import dask.dataframe as dd

In [6]:
class ChexpertDataframe(ExternalTask):

    s3_path = 's3://radio-star-csci-e-29/unzipped/'

    output = TargetOutput(
        file_pattern="",
        ext="train.csv",
        target_class=S3Target,
        path=s3_path
    )

In [7]:
os.path.relpath(('C:\\Users\\wmj\\PycharmProjects\\radio-star\\models\\Tasks\\', 'C:\\Users\\wmj\\PycharmProjects\\radio-star\\data\\processed\\')[-1])

'..\\data\\processed'

In [8]:
class ProcessChexpertDfToParquet(Task):
    requires = Requires()
    chexpertdf = Requirement(ChexpertDataframe)

    output = TargetOutput(
        target_class=ParquetTarget,
        path="../data/processed/",
        ext="",
        flag=False,
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        pathCSV = self.input()["chexpertdf"].path
        ddf = dd.read_csv(pathCSV)
        self.output().write_dask(ddf, compression="gzip")


In [9]:
class NormalizeDF(Task):
    """The Dataframe is best normalized before similarity calculations are
    run on it."""


    requires = Requires()
    proc_chexpertdf = Requirement(ProcessChexpertDfToParquet)

    output = TargetOutput(
        target_class=ParquetTarget,
        path="../data/processed/",
        ext="",
        flag=False,
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        ddf = self.input()["proc_chexpertdf"].read_dask()
        ddf_raw = ddf.copy()
        ddf = ddf.drop(columns=['Path'])
        object_cols = ddf.dtypes[(ddf.dtypes == object)].index.values

        ddf = encode_objects_general(ddf, object_cols)

        ddf = normalize_chex(ddf, object_cols)

        self.output().write_dask(ddf, compression="gzip")

In [10]:
class FindSimilar(Task):
    """The Dataframe is best normalized before similarity calculations are
    run on it."""

    requires = Requires()
    proc_chexpertdf = Requirement(ProcessChexpertDfToParquet)
    normalize_df = Requirement(NormalizeDF)
    comparator_index = IntParameter(default = 37959)
    n_images = IntParameter(default = 5)

    output = TargetOutput(
        target_class=ParquetTarget,
        path="../data/processed/",
        ext="",
        flag=False,
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        ddf = self.input()["normalize_df"].read_dask()
        ddf_raw = self.input()["proc_chexpertdf"].read_dask()
        
        object_cols = ddf.dtypes[(ddf.dtypes == object)].index.values

        row_comparator_raw = ddf.loc[self.comparator_index]

        # This compensate for a bug in dask row equality calculations
        row_comparator_na = row_comparator_raw.isna().compute().iloc[0]

        similar_features_idx = (ddf.isna() == row_comparator_na).sum(
            1).compute().nlargest(n=100).index

        argsorted = nan_euclidean_distances(
            row_comparator_raw.compute().values.reshape(1, -1),
            ddf.loc[similar_features_idx.to_list()].compute().values).argsort()
        
        top_n = similar_features_idx[argsorted][0][:self.n_images]
        
        top_n_close_images = ddf_raw.loc[top_n]

        self.output().write_dask(top_n_close_images, compression="gzip")
        

In [52]:
class ChexpertDataBucket(ExternalTask):

    s3_path = 's3://radio-star-csci-e-29/'

    output = TargetOutput(
        file_pattern="",
        ext="",
        target_class=S3Target,
        path=s3_path
    )

In [53]:
ChexpertDataBucket().output().exists()

INFO:root:self.target_kwargs['path'] is s3://radio-star-csci-e-29/


True

In [68]:
class PullSimilarImages(Task):
    """The Dataframe is best normalized before similarity calculations are
    run on it."""

    requires = Requires()
    find_similar = Requirement(FindSimilar)
    chexpert_data_bucket = Requirement(ChexpertDataBucket)
    
    output = TargetOutput(
        target_class=Target,
        path="../data/processed/",
        ext="")
    
    def run(self):
        simil_dir_path = self.input()['find_similar'].path
        simil_path = glob.glob(os.path.join(simil_dir_path,'*.parquet'))[0]
        df = pd.read_parquet(simil_path)
        s3_bucket_path = self.input()['chexpert_data_bucket'].path
        bucket = pathlib.Path(pathlib.Path(s3_bucket_path).parts[1])
        images = S3Images(aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'], aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'], region_name= 'ap-southeast-2')
        
        for index, row in df_simil.iterrows():
            rel_path = pathlib.Path(*pathlib.Path(row['Path']).parts[2:])
            key = os.path.join('unzipped', rel_path)
            image = images.from_s3(bucket=bucket, key=key)
            

In [100]:
key

PurePosixPath('unzipped/patient59195/study1/view1_frontal.jpg')

In [103]:
        for index, row in df_simil.iterrows():

            rel_path = pathlib.PurePosixPath(*pathlib.PurePosixPath(row['Path']).parts[2:])
            key = pathlib.PurePosixPath('unzipped') / rel_path
            image = images.from_s3(bucket=str(bucket), key=str(key))
            image.show()


DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function sse_md5 at 0x000001377FAC4730>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function validate_bucket_name at 0x000001377FAC46A8>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <bound method S3RegionRedirector.redirect_from_cache of <botocore.utils.S3RegionRedirector object at 0x000001370AC35898>>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <bound method S3ArnParamHandler.handle_arn of <botocore.utils.S3ArnParamHandler object at 0x000001370AC35C50>>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function generate_idempotent_uuid at 0x000001377FAC4510>
DEBUG:botocore.hooks:Event before-call.s3.GetObject: calling handler <function add_expect_header at 0x000001377FAC49D8>
DEBUG:botocore.hooks:Event before-call.s3.GetObject: calling handler <bound method S3RegionRed

NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

In [105]:
image

NameError: name 'image' is not defined

In [70]:
key

'unzipped\\patient09313\\study1\\view1_frontal.jpg'

In [66]:
bucket

WindowsPath('radio-star-csci-e-29')

In [15]:
p = pathlib.Path(df_simil.Path.sample().values[0])

In [31]:
pathlib.Path(*p.parts[1:])

WindowsPath('train/patient40586/study11/view1_frontal.jpg')

In [94]:
s3_parent_dir = ChexpertDataImages().output().path
for index, row in df_simil.iterrows():
    rel_path = pathlib.PurePosixPath(*pathlib.PurePosixPath(row['Path']).parts[1:])
    s3_img_path = os.path.join(s3_parent_dir, rel_path)

INFO:root:self.target_kwargs['path'] is s3://radio-star-csci-e-29/unzipped/


In [95]:
rel_path

PurePosixPath('train/patient59195/study1/view1_frontal.jpg')

In [19]:
dir_path = FindSimilar().output().path

INFO:root:self.target_kwargs['path'] is ../data/processed/FindSimilar
INFO:root:BaseDaskTarget path is ../data/processed/FindSimilar/


In [20]:
os.path.join(dir_path,'*.parquet')

'../data/processed/FindSimilar/*.parquet'

In [21]:
import boto3
from PIL import Image
from io import BytesIO
import os

class S3ImagesInvalidExtension(Exception):
    pass

class S3ImagesUploadFailed(Exception):
    pass

class S3Images(object):
    
    """Useage:
    
        images = S3Images(aws_access_key_id='fjrn4uun-my-access-key-589gnmrn90', 
                          aws_secret_access_key='4f4nvu5tvnd-my-secret-access-key-rjfjnubu34un4tu4', 
                          region_name='eu-west-1')
        im = images.from_s3('my-example-bucket-9933668', 'pythonlogo.png')
        im
        images.to_s3(im, 'my-example-bucket-9933668', 'pythonlogo2.png')
    """
    
    def __init__(self, aws_access_key_id, aws_secret_access_key, region_name):
        self.s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, 
                                     aws_secret_access_key=aws_secret_access_key, 
                                     region_name=region_name)
        

    def from_s3(self, bucket, key):
        file_byte_string = self.s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        return Image.open(BytesIO(file_byte_string))
    

    def to_s3(self, img, bucket, key):
        buffer = BytesIO()
        img.save(buffer, self.__get_safe_ext(key))
        buffer.seek(0)
        sent_data = self.s3.put_object(Bucket=bucket, Key=key, Body=buffer)
        if sent_data['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise S3ImagesUploadFailed('Failed to upload image {} to bucket {}'.format(key, bucket))
        
    def __get_safe_ext(self, key):
        ext = os.path.splitext(key)[-1].strip('.').upper()
        if ext in ['JPG', 'JPEG']:
            return 'JPEG' 
        elif ext in ['PNG']:
            return 'PNG' 
        else:
            raise S3ImagesInvalidExtension('Extension is invalid') 

In [40]:
images = S3Images(aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'], aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'], region_name= 'ap-southeast-2')

DEBUG:botocore.hooks:Event choose-service-name: calling handler <function handle_service_name_alias at 0x000001377FAA1C80>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_post at 0x000001377FA5EF28>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function lazy_call.<locals>._handler at 0x0000013709A466A8>
DEBUG:botocore.hooks:Event creating-client-class.s3: calling handler <function add_generate_presigned_url at 0x000001377FA5ED08>
DEBUG:botocore.endpoint:Setting s3 timeout as (60, 60)
DEBUG:botocore.client:Registering retry handlers for service: s3


In [38]:
bucket = 'radio-star-csci-e-29'
key = os.path.join('unzipped', rel_path)

In [42]:
key

'unzipped\\train\\patient59195\\study1\\view1_frontal.jpg'

In [41]:
images.from_s3(bucket=bucket, key=key)

DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function sse_md5 at 0x000001377FAC4730>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function validate_bucket_name at 0x000001377FAC46A8>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <bound method S3RegionRedirector.redirect_from_cache of <botocore.utils.S3RegionRedirector object at 0x000001370AC35898>>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <bound method S3ArnParamHandler.handle_arn of <botocore.utils.S3ArnParamHandler object at 0x000001370AC35C50>>
DEBUG:botocore.hooks:Event before-parameter-build.s3.GetObject: calling handler <function generate_idempotent_uuid at 0x000001377FAC4510>
DEBUG:botocore.hooks:Event before-call.s3.GetObject: calling handler <function add_expect_header at 0x000001377FAC49D8>
DEBUG:botocore.hooks:Event before-call.s3.GetObject: calling handler <bound method S3RegionRed

NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.