# Downloading dataset

In [2]:
# install kaggle python api
!pip install kaggle --upgrade



In [7]:
import os, json, subprocess

In [30]:
def check_dataset(path=''):
    """Return True when every expected class folder exists under ``path``.

    Args:
        path: Directory expected to contain the sub-folders ``cataract``,
            ``diabetic_retinopathy``, ``glaucoma`` and ``normal``. A trailing
            path separator is optional. When empty, defaults to the
            ``dataset`` folder inside the current working directory.

    Returns:
        bool: True only if all four class folders exist.
    """
    if not path:
        path = os.path.join(os.getcwd(), 'dataset')
    class_folders = ('cataract', 'diabetic_retinopathy', 'glaucoma', 'normal')
    # os.path.join supplies the separator when it is missing, so a path passed
    # without a trailing slash is no longer checked as e.g. "datasetcataract".
    return all(os.path.exists(os.path.join(path, name)) for name in class_folders)
    
def init_on_kaggle(username, api_key):
    """Write Kaggle API credentials to ``~/.kaggle/kaggle.json``.

    The ``.kaggle`` folder is created if needed and the file is restricted to
    owner read/write (mode 0600).

    Args:
        username: Kaggle account name, stored under the ``"username"`` key.
        api_key: Kaggle API token, stored under the ``"key"`` key.
    """
    # expanduser('~') still resolves the home directory when $HOME is unset,
    # whereas expandvars('$HOME') would leave the literal text "$HOME".
    kaggle_config_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
    os.makedirs(kaggle_config_dir, exist_ok=True)
    credentials_path = os.path.join(kaggle_config_dir, 'kaggle.json')
    api_dict = {"username": username, "key": api_key}

    # Create the file with mode 0600 from the start so the token is never
    # readable by other users, not even briefly before a later chmod.
    fd = os.open(credentials_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding='utf-8') as f:
        json.dump(api_dict, f)

    # os.open leaves the mode of an already-existing file untouched, so tighten
    # it explicitly. os.chmod also avoids spawning a shell command whose
    # whitespace-split arguments broke on home paths containing a space.
    os.chmod(credentials_path, 0o600)
    
    
def download_dataset_from_kaggle(username=None, api_key=None):
    """Download the eye-diseases dataset from Kaggle and extract it.

    The archive is downloaded to, and extracted in, the current working
    directory.

    Args:
        username: Optional Kaggle account name.
        api_key: Optional Kaggle API token. When both ``username`` and
            ``api_key`` are given they are written to ``kaggle.json`` through
            ``init_on_kaggle``. Otherwise the ``kaggle`` package is left to
            find credentials on its own (``KAGGLE_USERNAME`` / ``KAGGLE_KEY``
            environment variables or an existing ``kaggle.json``).

    Raises:
        zipfile.BadZipFile: If the downloaded archive cannot be read.
    """
    # SECURITY: an account name and API key used to be hard-coded here. A key
    # that was ever committed must be treated as leaked: revoke it on
    # kaggle.com and supply a fresh one via arguments or environment variables.
    if username and api_key:
        init_on_kaggle(username, api_key)

    # Imported here rather than at module level so that credentials are in
    # place before the package loads (kaggle authenticates on import).
    import kaggle
    import zipfile

    dataset_name = "gunavenkatdoddi/eye-diseases-classification"
    print(kaggle.api.dataset_view(dataset_name))

    # Pass the target directory explicitly so the archive always lands where
    # the extraction step below looks for it.
    download_dir = os.getcwd()
    kaggle.api.dataset_download_files(dataset_name, path=download_dir)

    # zipfile raises on a missing or corrupt archive, whereas the previous
    # os.system('unzip ...') call ignored failures and needed an external binary.
    archive_path = os.path.join(download_dir, 'eye-diseases-classification.zip')
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(download_dir)
    

In [31]:
# Folder expected to hold one sub-folder per class. The trailing slash is
# intentional: the value is handed on as a directory prefix.
dataset_location = f"{os.getcwd()}/dataset/"

In [32]:
# Fetch the images from Kaggle only when the local class folders are missing.
if not check_dataset(dataset_location):
    print("The dataset doesn't exist. Try to load from kaggle")
    download_dataset_from_kaggle()

In [33]:
# List the working directory contents after the dataset check/download step.
!ls

dataset				   eye-diseases-classification.zip
eye_diseases_classification.ipynb  README.md


# Data Preparation

## Import Images

In [37]:
from pyspark.sql import SparkSession

In [39]:
# Local-mode Spark session for this notebook; the Spark UI port is set to 4050.
# NOTE(review): the app name is misspelled ("calssification"); it is kept
# unchanged here because it is a runtime string.
spark = (
    SparkSession.builder
    .master("local")
    .appName("eye_diseases_calssification")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [44]:
# Read every image below dataset_location (sub-folders included) with Spark's
# "image" data source; the dropInvalid option is enabled.
df = (
    spark.read.format('image')
    .option('dropInvalid', True)
    .option("recursiveFileLookup", "true")
    .load(dataset_location)
)

In [46]:
# Bare expression: the notebook displays the DataFrame's repr (column names and types).
df

DataFrame[image: struct<origin:string,height:int,width:int,nChannels:int,mode:int,data:binary>]

In [47]:
# Print the nested schema of the loaded `image` struct column.
df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [52]:
# Preview the metadata fields of the first 10 images (everything except the
# raw pixel bytes), without truncating the file URIs.
df.select(
    'image.origin',
    'image.width',
    'image.height',
    'image.nChannels',
    'image.mode',
).show(10, truncate=False)

+------------------------------------------------------------------+-----+------+---------+----+
|origin                                                            |width|height|nChannels|mode|
+------------------------------------------------------------------+-----+------+---------+----+
|file:///home/jovyan/work/BAN5600/dataset/cataract/cataract_024.png|2464 |1632  |3        |16  |
|file:///home/jovyan/work/BAN5600/dataset/glaucoma/Glaucoma_081.png|2464 |1632  |3        |16  |
|file:///home/jovyan/work/BAN5600/dataset/glaucoma/Glaucoma_072.png|2464 |1632  |3        |16  |
|file:///home/jovyan/work/BAN5600/dataset/glaucoma/Glaucoma_024.png|2464 |1632  |3        |16  |
|file:///home/jovyan/work/BAN5600/dataset/glaucoma/Glaucoma_071.png|2464 |1632  |3        |16  |
|file:///home/jovyan/work/BAN5600/dataset/glaucoma/Glaucoma_082.png|2464 |1632  |3        |16  |
|file:///home/jovyan/work/BAN5600/dataset/glaucoma/Glaucoma_048.png|2464 |1632  |3        |16  |
|file:///home/jovyan/work/BAN5

In [66]:
from pyspark.sql.functions import col, regexp_extract, when

# The class label is taken from the folder that directly contains the image.
# Matching only that folder (instead of searching the whole file URI for a
# substring) keeps a class name that happens to appear elsewhere in the path,
# e.g. in the project directory name, from mislabelling images.
parent_folder = regexp_extract(col('image.origin'), r'([^/]+)/[^/]+$', 1)
disease_classes = ['cataract', 'diabetic_retinopathy', 'glaucoma']

# Anything outside the three disease folders is labelled 'normal', as before.
label = when(parent_folder.isin(disease_classes), parent_folder).otherwise('normal')

df = df.withColumn('type', label)

In [71]:
# Show a single record in vertical layout to confirm the new `type` column.
df.show(n=1, vertical=True)

-RECORD 0---------------------
 image | {file:///home/jov... 
 type  | cataract             
only showing top 1 row



## Feature Extraction