In [4]:
import logging
from luigi import Task
from luigi.parameter import BoolParameter, IntParameter
from luigi.task import ExternalTask
from luigi.target import Target
import luigi
from csci_utils.luigi.dask.target import CSVTarget
from csci_utils.luigi.dask.target import ParquetTarget
from csci_utils.luigi.task import Requirement
from csci_utils.luigi.task import Requires
from csci_utils.luigi.task import TargetOutput
from luigi.contrib.s3 import S3Target
import pandas as pd
import pandas as pd
import numpy as np
import pathlib
from dask import dataframe as dd
from sklearn.metrics.pairwise import cosine_similarity, nan_euclidean_distances
from sklearn.preprocessing import LabelEncoder, normalize
import dask.array as da
import glob
import matplotlib;

DEBUG:matplotlib:(private) matplotlib data path: C:\Users\wmj\.virtualenvs\radio-star-X64HrzkT\lib\site-packages\matplotlib\mpl-data
DEBUG:matplotlib:matplotlib data path: C:\Users\wmj\.virtualenvs\radio-star-X64HrzkT\lib\site-packages\matplotlib\mpl-data
DEBUG:matplotlib:CONFIGDIR=C:\Users\wmj\.matplotlib
DEBUG:matplotlib:matplotlib version 3.3.3
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is win32


In [5]:
%matplotlib inline

DEBUG:matplotlib:CACHEDIR=C:\Users\wmj\.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from C:\Users\wmj\.matplotlib\fontlist-v330.json
DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.
DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.
DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [6]:
def encode_objects_general(ddf, object_cols):
    LE = LabelEncoder()
    for object_col in object_cols:
        ddf[object_col] = da.from_array(
            LE.fit_transform(ddf[object_col].astype(str)))
    return ddf

def normalize_general(ddf,columns):
    result = ddf.copy()
    for feature_name in columns:
        max_value = ddf[feature_name].max()
        min_value = ddf[feature_name].min()
        result[feature_name] = 2*(ddf[feature_name] - min_value) / (max_value - min_value) - 1
    return result

def normalize_chex(ddf, object_cols):
    ddf = normalize_general(ddf,object_cols)
    ddf =  normalize_general(ddf,['Age'])
    return ddf

In [7]:
import os

In [8]:
import pandas as pd

In [9]:
import dask.dataframe as dd

In [10]:
class ChexpertDataframe(ExternalTask):

    s3_path = 's3://radio-star-csci-e-29/unzipped/'

    output = TargetOutput(
        file_pattern="",
        ext="train.csv",
        target_class=S3Target,
        path=s3_path
    )

In [11]:
os.path.relpath(('C:\\Users\\wmj\\PycharmProjects\\radio-star\\models\\Tasks\\', 'C:\\Users\\wmj\\PycharmProjects\\radio-star\\data\\processed\\')[-1])

'..\\data\\processed'

In [12]:
class ProcessChexpertDfToParquet(Task):
    requires = Requires()
    chexpertdf = Requirement(ChexpertDataframe)

    output = TargetOutput(
        target_class=ParquetTarget,
        path="../data/processed/",
        ext="",
        flag=False,
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        pathCSV = self.input()["chexpertdf"].path
        ddf = dd.read_csv(pathCSV)
        self.output().write_dask(ddf, compression="gzip")


In [13]:
class NormalizeDF(Task):
    """The Dataframe is best normalized before similarity calculations are
    run on it."""


    requires = Requires()
    proc_chexpertdf = Requirement(ProcessChexpertDfToParquet)

    output = TargetOutput(
        target_class=ParquetTarget,
        path="../data/processed/",
        ext="",
        flag=False,
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        ddf = self.input()["proc_chexpertdf"].read_dask()
        ddf_raw = ddf.copy()
        ddf = ddf.drop(columns=['Path'])
        object_cols = ddf.dtypes[(ddf.dtypes == object)].index.values

        ddf = encode_objects_general(ddf, object_cols)

        ddf = normalize_chex(ddf, object_cols)

        self.output().write_dask(ddf, compression="gzip")

In [14]:
class FindSimilar(Task):
    """The Dataframe is best normalized before similarity calculations are
    run on it."""

    requires = Requires()
    proc_chexpertdf = Requirement(ProcessChexpertDfToParquet)
    normalize_df = Requirement(NormalizeDF)
    comparator_index = IntParameter(default=78414)
    n_images = IntParameter(default=5)

    output = TargetOutput(
        target_class=ParquetTarget,
        path="../data/processed/",
        ext="",
        flag=False,
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        ddf = self.input()["normalize_df"].read_dask()
        ddf_raw = self.input()["proc_chexpertdf"].read_dask()

        object_cols = ddf_raw.dtypes[(ddf_raw.dtypes == object).values]

        row_comparator = ddf.loc[self.comparator_index]

        most_similar_idx = (
            (ddf.values == row_comparator.values)
            .sum(axis=1)
            .astype("int")
            .compute()
            .argsort()[::-1]
        )



        idx_partition_to_view = most_similar_idx[: int(most_similar_idx.size / 5)]
        # Here we set our indices to sample only 1/5 of the dataset,
        # that which is closest to our images

        df = ddf.loc[idx_partition_to_view].compute()

        dist_in_space = nan_euclidean_distances(df.fillna(0), df.loc[
            self.comparator_index].fillna(0).values.reshape(1,-1))

        close_idx = pd.DataFrame(data=dist_in_space, index=df.index)

        print(close_idx[:10])

        row_comparator_raw = ddf_raw.loc[self.comparator_index].compute()

        df_close_rows, mi_df = return_df_close_rows(df, row_comparator,
                                                   close_idx)

        print('original row is: ')
        print(row_comparator_raw)

        print('close image ids are: ')
        print(mi_df)

        
        
        print(list(df_close_rows.index))

        self.output().write_dask(ddf_raw.loc[list(df_close_rows.index)],
                                 compression="gzip")

In [16]:
class ChexpertDataBucket(ExternalTask):

    s3_path = 's3://radio-star-csci-e-29/'

    output = TargetOutput(
        file_pattern="",
        ext="",
        target_class=S3Target,
        path=s3_path
    )

In [17]:
ChexpertDataBucket().output().exists()

INFO:root:self.target_kwargs['path'] is s3://radio-star-csci-e-29/


True

In [18]:
import boto3
from PIL import Image
from io import BytesIO
import os

class S3ImagesInvalidExtension(Exception):
    pass

class S3ImagesUploadFailed(Exception):
    pass

class S3Images(object):
    
    """Useage:
    
        images = S3Images(aws_access_key_id='fjrn4uun-my-access-key-589gnmrn90', 
                          aws_secret_access_key='4f4nvu5tvnd-my-secret-access-key-rjfjnubu34un4tu4', 
                          region_name='eu-west-1')
        im = images.from_s3('my-example-bucket-9933668', 'pythonlogo.png')
        im
        images.to_s3(im, 'my-example-bucket-9933668', 'pythonlogo2.png')
    """
    
    def __init__(self, aws_access_key_id, aws_secret_access_key, region_name):
        self.s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, 
                                     aws_secret_access_key=aws_secret_access_key, 
                                     region_name=region_name)
        

    def from_s3(self, bucket, key):
        file_byte_string = self.s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        return Image.open(BytesIO(file_byte_string))
    

    def to_s3(self, img, bucket, key):
        buffer = BytesIO()
        img.save(buffer, self.__get_safe_ext(key))
        buffer.seek(0)
        sent_data = self.s3.put_object(Bucket=bucket, Key=key, Body=buffer)
        if sent_data['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise S3ImagesUploadFailed('Failed to upload image {} to bucket {}'.format(key, bucket))
        
    def __get_safe_ext(self, key):
        ext = os.path.splitext(key)[-1].strip('.').upper()
        if ext in ['JPG', 'JPEG']:
            return 'JPEG' 
        elif ext in ['PNG']:
            return 'PNG' 
        else:
            raise S3ImagesInvalidExtension('Extension is invalid') 

In [19]:
class PullSimilarImages(Task):
    """The Dataframe is best normalized before similarity calculations are
    run on it."""

    requires = Requires()
    find_similar = Requirement(FindSimilar)
    chexpert_data_bucket = Requirement(ChexpertDataBucket)
    
    output = TargetOutput(
        target_class=Target,
        path="../data/processed/",
        ext="")
    
    def run(self):
        simil_dir_path = self.input()['find_similar'].path
        simil_path = glob.glob(os.path.join(simil_dir_path,'*.parquet'))[0]
        df = pd.read_parquet(simil_path)
        s3_bucket_path = self.input()['chexpert_data_bucket'].path
        bucket = pathlib.Path(pathlib.Path(s3_bucket_path).parts[1])
        images = S3Images(aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'], aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'], region_name= 'ap-southeast-2')
        
        for index, row in df_simil.iterrows():
            rel_path = pathlib.Path(*pathlib.Path(row['Path']).parts[2:])
            key = os.path.join('unzipped', rel_path)
            image = images.from_s3(bucket=bucket, key=key)
            

In [26]:
simil_df = FindSimilar().output().path

INFO:root:self.target_kwargs['path'] is ../data/processed/FindSimilar
INFO:root:BaseDaskTarget path is ../data/processed/FindSimilar/


In [24]:
os.listdir(FindSimilar().output().path)

INFO:root:self.target_kwargs['path'] is ../data/processed/FindSimilar
INFO:root:BaseDaskTarget path is ../data/processed/FindSimilar/


FileNotFoundError: [WinError 3] The system cannot find the path specified: '../data/processed/FindSimilar/'

In [21]:
df_simil = simil_df.compute()

NameError: name 'simil_df' is not defined

In [22]:
df_simil

NameError: name 'df_simil' is not defined

In [None]:
s3_bucket_path = ChexpertDataBucket().output().path
bucket = pathlib.Path(pathlib.Path(s3_bucket_path).parts[1])
images = S3Images(aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'], aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'], region_name= 'ap-southeast-2')

for index, row in df_simil.iterrows():
    rel_path = pathlib.Path(*pathlib.Path(row['Path']).parts[2:])
    key = os.path.join('unzipped', rel_path)
    image = images.from_s3(bucket=bucket, key=key)

In [None]:
def find_close_row(df, close_idx, column, value):
    closestidx = close_idx[df[column] == value] \
        .drop(row_comparator.index.values[0], errors = 'ignore') \
        .idxmin()
    row = df.loc[closestidx]
    return row

def return_df_close_rows(df, row_comparator, close_idx):
    encoded_dict = {1: "POSITIVE ", 0: "UNCERTAIN ", -1: "NEGATIVE ",
                    np.NaN: 'unmentioned'}

    df_case_control = pd.DataFrame(columns=row_comparator.columns)
    cols_hierarchy = []
    for column in row_comparator:
        if row_comparator[column].values in (-1, 0, 1):

            for value in (-1, 1):

                try:
                    row = find_close_row(df, close_idx, column, value)

                    df_case_control = pd.concat((df_case_control, row))

                except:
                    pass
            cols_hierarchy.append(column)

    mi = pd.MultiIndex.from_frame(df_case_control[cols_hierarchy])

    multi_index_images_df = pd.Series(df_case_control.index, index=mi, name = 'Id')

    return df_case_control, multi_index_images_df

In [None]:
simil_path = glob.glob(os.path.join(simil_dir_path,'*.parquet'))

In [None]:
simil_path

In [None]:
df = pd.read_parquet(simil_path)

In [None]:
for index, row in df_simil.iterrows():

    rel_path = pathlib.PurePosixPath(*pathlib.PurePosixPath(row['Path']).parts[2:])
    key = pathlib.PurePosixPath('unzipped') / rel_path
    image = images.from_s3(bucket=str(bucket), key=str(key))
    image.show()


In [None]:
s3_parent_dir = ChexpertDataImages().output().path
for index, row in df_simil.iterrows():
    rel_path = pathlib.PurePosixPath(*pathlib.PurePosixPath(row['Path']).parts[1:])
    s3_img_path = os.path.join(s3_parent_dir, rel_path)

In [None]:
rel_path

In [None]:
dir_path = FindSimilar().output().path

In [None]:
os.path.join(dir_path,'*.parquet')

In [None]:
import boto3
from PIL import Image
from io import BytesIO
import os

class S3ImagesInvalidExtension(Exception):
    pass

class S3ImagesUploadFailed(Exception):
    pass

class S3Images(object):
    
    """Useage:
    
        images = S3Images(aws_access_key_id='fjrn4uun-my-access-key-589gnmrn90', 
                          aws_secret_access_key='4f4nvu5tvnd-my-secret-access-key-rjfjnubu34un4tu4', 
                          region_name='eu-west-1')
        im = images.from_s3('my-example-bucket-9933668', 'pythonlogo.png')
        im
        images.to_s3(im, 'my-example-bucket-9933668', 'pythonlogo2.png')
    """
    
    def __init__(self, aws_access_key_id, aws_secret_access_key, region_name):
        self.s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, 
                                     aws_secret_access_key=aws_secret_access_key, 
                                     region_name=region_name)
        

    def from_s3(self, bucket, key):
        file_byte_string = self.s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        return Image.open(BytesIO(file_byte_string))
    

    def to_s3(self, img, bucket, key):
        buffer = BytesIO()
        img.save(buffer, self.__get_safe_ext(key))
        buffer.seek(0)
        sent_data = self.s3.put_object(Bucket=bucket, Key=key, Body=buffer)
        if sent_data['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise S3ImagesUploadFailed('Failed to upload image {} to bucket {}'.format(key, bucket))
        
    def __get_safe_ext(self, key):
        ext = os.path.splitext(key)[-1].strip('.').upper()
        if ext in ['JPG', 'JPEG']:
            return 'JPEG' 
        elif ext in ['PNG']:
            return 'PNG' 
        else:
            raise S3ImagesInvalidExtension('Extension is invalid') 

In [None]:
images = S3Images(aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'], aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'], region_name= 'ap-southeast-2')

In [None]:
bucket = 'radio-star-csci-e-29'
key = os.path.join('unzipped', rel_path)

In [None]:
key

In [None]:
images.from_s3(bucket=bucket, key=key)