In [None]:
import sys
import os
# !{sys.executable} -m pip install --upgrade pip
# !{sys.executable} -m pip install numpy
# !{sys.executable} -m pip install scikit-image
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install scikit-learn
# !{sys.executable} -m pip install h5py
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install tables
# !{sys.executable} -m pip install requests
# !{sys.executable} -m pip install Glymur
# !{sys.executable} -m pip install tqdm
# !{sys.executable} -m pip install opencv-python
# !{sys.executable} -m pip install keras
# !{sys.executable} -m pip install tensorflow
# !{sys.executable} -m pip install seaborn
# !{sys.executable} -m pip install ipyparallel
# !{sys.executable} -m pip install pyarrow
# !{sys.executable} -m pip install swifter
# !{sys.executable} -m pip install pandarallel
# !{sys.executable} -m pip install dask
# !{sys.executable} -m pip install matplotlib-scalebar

In [None]:
import h5py
import pandas as pd
from pandarallel import pandarallel
import multiprocessing as mp
from glob import glob
from IPython.core.display import HTML
from tqdm import tqdm
import os
import glymur
import cv2
import requests
import json
import hashlib
import random
import itertools
from joblib import Parallel, delayed, parallel_backend
import time
import warnings
# from jupyterthemes import jtplot
import swifter

import dask.dataframe as dd

import numpy as np
from numpy.random import randint

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import Rectangle
from matplotlib_scalebar.scalebar import ScaleBar
import seaborn as sns

from scipy import ndimage as ndi

import sklearn
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier


import skimage
from skimage import data
from skimage.filters import threshold_yen, threshold_triangle, threshold_otsu, threshold_li, sobel
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops
from skimage.morphology import closing, square
from skimage.color import label2rgb
from skimage import exposure
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.measure import label
from skimage import color
from skimage.transform import resize, rescale
from skimage.feature import hog


# Output shape passed to skimage's resize() by process_image for every crop.
rescale_size = (300, 300)
# Stop <pre> blocks from wrapping so wide printed frames stay on one line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))
num_cores = mp.cpu_count()
# Disable matplotlib's "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})
# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')
plt.style.use('dark_background')
# jtplot.style(theme="monokai", context="notebook", ticks=True, grid=False)

In [None]:
# Show the kernel's working directory; relative outputs such as saved figures
# are written here.
current_dir = os.getcwd()
current_dir

In [None]:
# Root of the data drive. NOTE(review): this is a hard-coded absolute path, so
# the notebook only runs where this directory exists.
drive_location = "/export/data/utkarsh"
# Directory where download_file/load_file keep the JP2 images.
data_location = drive_location + '/data'
# Directory holding the catalog CSV files.
P4data_location = drive_location + "/P4data"
# The result of this glob is discarded (it is not the cell's last expression),
# so it neither displays nor stores the list of CSV files.
glob(P4data_location + "/*.csv")

# Catalog tables: observation metadata, one row per tile, and one row per
# fan / blotch marking.
metadata = pd.read_csv(P4data_location + '/P4_catalog_v1.1_metadata.csv')
tiles_coord = pd.read_csv(P4data_location + '/P4_catalog_v1.1_tile_coords_final.csv')
fan = pd.read_csv(P4data_location + "/P4_catalog_v1.1_L1C_cut_0.5_fan.csv")
blotch = pd.read_csv(P4data_location + "/P4_catalog_v1.1_L1C_cut_0.5_blotch.csv")

# Row counts of the three tables used below.
print(len(tiles_coord))
print(len(fan))
print(len(blotch))

# Peek at the first tile row and its observation id.
item = tiles_coord.iloc[0].obsid
print(tiles_coord.iloc[0])
print(item)

# Temporarily stop pandas from wrapping wide frames while printing, then
# restore the default.
pd.set_option('display.expand_frame_repr', False)
print(tiles_coord)
print(fan)
pd.set_option('display.expand_frame_repr', True)

In [None]:

def download_file(filename):
    """Download the RGB NOMAP JP2 of one observation into ``data_location``.

    Nothing is downloaded when the target file already exists.

    Parameters
    ----------
    filename : str
        Observation id whose second ``_``-separated field is the orbit number;
        it selects the ``ORB_<low>_<high>`` directory on the server.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Previously the error page
        was written to disk as if it were the image, and every later call then
        treated that broken file as already downloaded.
    """
    savename = data_location + "/{filename}_RGB.NOMAP.JP2".format(filename=filename)
    if os.path.exists(savename):
        print("{} exists".format(savename))
        return
    components = filename.split("_")
    # Orbit directories group 100 consecutive orbits: [l, l + 99].
    l = 100 * (int(components[1]) // 100)
    h = l + 99
    url="https://hirise-pds.lpl.arizona.edu/download/PDS/EXTRAS/RDR/ESP/ORB_{low:06d}_{high:06d}/{filename}/{filename}_RGB.NOMAP.JP2".format(low=l,high=h,filename=filename)
    print(url)

    # Stream into a temporary file and rename only after a complete, successful
    # transfer, so an interrupted download never leaves a file that looks cached.
    partial = savename + ".part"
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(partial, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
        os.replace(partial, savename)
    finally:
        if os.path.exists(partial):
            os.remove(partial)
    
def load_file(filename):
    """Open the JP2 of an observation, downloading it first if it is missing.

    Parameters
    ----------
    filename : str
        Observation id used to build the file name inside ``data_location``.

    Returns
    -------
    glymur.Jp2k
        Handle on the local JP2 file.
    """
    jp2_name = "/{filename}_RGB.NOMAP.JP2".format(filename=filename)
    jp2_path = data_location + jp2_name
    already_cached = os.path.exists(jp2_path)
    if not already_cached:
        download_file(filename)
    return glymur.Jp2k(jp2_path)


# Width (nx) and height (ny), in pixels, of the window get_image cuts out of
# the JP2 around a tile's (x_hirise, y_hirise).
nx,ny = 840, 648

def cv2_imshow(a, **kwargs):
    """Show an image with matplotlib after swapping OpenCV's channel order.

    Values are clipped to [0, 255] and cast to uint8. Arrays with three
    dimensions are converted BGR -> RGB (or BGRA -> RGBA when they have four
    channels); other arrays are shown unchanged.

    Extra keyword arguments are forwarded to ``plt.imshow``, whose return
    value is returned.
    """
    frame = a.clip(0, 255).astype('uint8')
    if frame.ndim != 3:
        return plt.imshow(frame, **kwargs)

    has_alpha = frame.shape[2] == 4
    conversion = cv2.COLOR_BGRA2RGBA if has_alpha else cv2.COLOR_BGR2RGB
    return plt.imshow(cv2.cvtColor(frame, conversion), **kwargs)

def fan_mask(fan, tile):
    """Build the outline of one fan marking as a polygon.

    The outline is two straight arms starting at the marking position, closed
    by a half circle sampled every 10 degrees, and rotated by ``fan.angle``.

    Parameters
    ----------
    fan : row-like with ``image_x``, ``image_y``, ``distance``, ``spread``, ``angle``
    tile : row-like with ``x_hirise``, ``y_hirise``

    Returns
    -------
    (x, y) : tuple of arrays with the polygon vertices, shifted by the same
        ``x_hirise - nx//2`` / ``y_hirise - ny//2`` offset that get_image uses
        for its crop window.
    """
    origin_x = fan.image_x - (tile.x_hirise-nx//2)
    origin_y = fan.image_y - (tile.y_hirise-ny//2)

    # NOTE(review): `spread//2` is a floor division, so the half angle is
    # truncated to a whole number of degrees -- confirm whether `/2` was meant.
    half_tan = np.tan(np.deg2rad(fan.spread//2))
    # Arm length and cap radius are chosen so that arm + cap_radius == distance.
    arm = fan.distance / (1+half_tan)
    cap_radius = arm * half_tan

    cap_angles = np.deg2rad(np.arange(0,180,10))
    outline = np.vstack([
        np.hstack([0, arm, arm+cap_radius*np.sin(cap_angles), arm, 0]),
        np.hstack([0, cap_radius, cap_radius*np.cos(cap_angles), -cap_radius, 0]),
    ])

    heading = np.deg2rad(fan.angle)
    cos_a, sin_a = np.cos(heading), np.sin(heading)
    rotated = np.dot(np.array([[cos_a,-sin_a],[sin_a,cos_a]]), outline)

    return (origin_x+rotated[0], origin_y+rotated[1])

def blotch_mask(blotch, tile):
    """Build the outline of one blotch marking as a rotated ellipse.

    Parameters
    ----------
    blotch : row-like with ``image_x``, ``image_y``, ``radius_1``, ``radius_2``, ``angle``
    tile : row-like with ``x_hirise``, ``y_hirise``

    Returns
    -------
    (x, y) : tuple of arrays with 22 ellipse vertices, shifted by the same
        ``x_hirise - nx//2`` / ``y_hirise - ny//2`` offset that get_image uses
        for its crop window.
    """
    origin_x = blotch.image_x - (tile.x_hirise-nx//2)
    origin_y = blotch.image_y - (tile.y_hirise-ny//2)

    # Axis-aligned ellipse sampled at 22 parameter values over a full turn.
    theta = np.linspace(0, 2*np.pi, 22)
    outline = np.vstack([blotch.radius_1 * np.cos(theta),
                         blotch.radius_2 * np.sin(theta)])

    heading = np.deg2rad(blotch.angle)
    cos_a, sin_a = np.cos(heading), np.sin(heading)
    rotated = np.dot(np.array([[cos_a,-sin_a],[sin_a,cos_a]]), outline)

    return (origin_x+rotated[0], origin_y+rotated[1])


def get_image(tiles_coord, jp, irow):
    """Cut one tile out of a JP2 image and rescale it to 8 bits.

    Parameters
    ----------
    tiles_coord : DataFrame with ``x_hirise`` / ``y_hirise`` columns
    jp : sliceable image (e.g. the object returned by load_file)
    irow : int
        Positional index (``iloc``) of the tile row.

    Returns
    -------
    uint8 array holding the ``ny`` x ``nx`` window centred on the tile
    position, linearly scaled so that its brightest value becomes 255.

    Notes
    -----
    The earlier version divided by ``max / 256``, which maps the brightest
    pixel to 256.0. That value does not fit in uint8 and typically wraps to 0,
    turning the brightest pixels black. It also divided by zero on an all-zero
    tile. Scaling to 255 keeps every value in range.
    """
    row = tiles_coord.iloc[irow]
    sx = slice(int(row.x_hirise-nx//2),int(row.x_hirise+nx//2))
    sy = slice(int(row.y_hirise-ny//2),int(row.y_hirise+ny//2))
    im16 = np.asarray(jp[sy,sx])
    peak = np.amax(im16)
    if peak <= 0:
        # Nothing to stretch; avoid dividing by zero.
        return np.zeros(im16.shape, dtype='uint8')
    img8 = (im16 * (255.0 / peak)).astype('uint8')
    return img8

def show_image(tiles_coord, fan_or_blotch, jp, irow, isfan=True):
    """Display one tile and, optionally, the outlines of its fan markings.

    Parameters
    ----------
    tiles_coord : DataFrame of tiles
    fan_or_blotch : DataFrame of markings with a ``tile_id`` column
    jp : image handle passed on to get_image
    irow : int
        Positional index of the tile. The row is now read with ``iloc``, the
        same way get_image selects the pixels, so the outlines always belong
        to the displayed image even if the frame's index is not 0..n-1.
    isfan : bool
        True draws the fan outlines. False draws no markings at all (blotch
        outlines are intentionally not drawn).

    Returns
    -------
    The uint8 tile image that was displayed.
    """
    row = tiles_coord.iloc[irow]
    img8 = get_image(tiles_coord, jp, irow)
    cv2_imshow(img8)

    if isfan:
        tile_markings = fan_or_blotch[fan_or_blotch.tile_id == row.tile_id]
        for _, marking in tile_markings.iterrows():
            x,y = fan_mask(marking, row)
            # Pull vertices that fall outside the tile back inside it.
            x = np.where(x<0, 1, x)
            y = np.where(y<0, 1, y)
            x = np.where(x>nx, nx - 1, x)
            y = np.where(y>ny, ny - 1, y)
            plt.plot(x,y,alpha=1.0)

    return img8 # This will return an array as well


# Render one example tile with its fan outlines and save it as fig2.pdf.
irow = 400
# for irow in [1,50,200,400,600,800,1200,1500,2000,2500,3000,3500,4000]:
row = tiles_coord.iloc[irow]
jp = load_file(row.obsid)
print(row.obsid, row.tile_id)
plt.figure(figsize = (16,8))
show_image(tiles_coord, fan, jp, irow, isfan = True)
plt.title(f"{row.tile_id} /w Marked Fans")
# plt.title(f"{row.tile_id} /w Raw")
plt.xlabel("Pixels (nx)")
plt.ylabel("Pixels (ny)")
# NOTE(review): the first argument is ScaleBar's size of one pixel, and 1 is
# passed here, while the original note on this line read "1 pixel = 0.2 meter".
# One of the two is wrong -- confirm the intended pixel size.
scalebar = ScaleBar(1, box_alpha = 0.3)
plt.gca().add_artist(scalebar)
plt.savefig("fig2.pdf")
    
# print(show_image(tiles_coord, fan, jp, 100, isfan = True)) # (NO LONGER) Gives and error

In [None]:
def expand_bbox(minx, miny, maxx, maxy, scale = 1.2):
    """Grow a bounding box around its centre and keep it inside the tile.

    The width and height are multiplied by ``scale``; the extra size is split
    evenly (floor-divided) between both sides. The result is clamped to
    [1, nx - 1] horizontally and [1, ny - 1] vertically and truncated to ints.

    Returns
    -------
    (minx, miny, maxx, maxy) of the enlarged box, as ints.
    """
    span_x = abs(maxx - minx)
    span_y = abs(maxy - miny)

    # Margin added on each side of the box.
    pad_x = (span_x * scale - span_x)//2
    pad_y = (span_y * scale - span_y)//2

    left = max(minx - pad_x, 1)
    right = min(maxx + pad_x, nx - 1)
    top = max(miny - pad_y, 1)
    bottom = min(maxy + pad_y, ny - 1)

    return int(left), int(top), int(right), int(bottom)

In [None]:
def extract_fan_box_marking_id(tiles_coord, jp, irow):
    """List the marking ids of every fan on one tile.

    Parameters
    ----------
    tiles_coord : DataFrame of tiles with a ``tile_id`` column
    jp : unused; kept so existing callers do not have to change
    irow : int
        Positional index of the tile.

    Returns
    -------
    list
        ``marking_id`` values in catalog order, or ``[None]`` when the tile
        has no fans.

    Notes
    -----
    The earlier version reopened the JP2 and decoded the tile twice before
    throwing the pixels away. Only the ``fan`` table is needed, so that work
    (and the implicit download of a missing file) is gone.
    """
    tile_id = tiles_coord.iloc[irow].tile_id
    marking_id_list = fan.loc[fan.tile_id == tile_id, "marking_id"].tolist()
    return marking_id_list if marking_id_list else [None]

def extract_fan_box(tiles_coord, jp, irow, loc_list):
    """Crop the bounding box of every fan on one tile.

    Parameters
    ----------
    tiles_coord : DataFrame of tiles
    jp : image handle passed on to get_image
    irow : int
        Positional index of the tile. The row is read with ``iloc``, matching
        how get_image selects the pixels, so boxes and image always refer to
        the same tile.
    loc_list : list
        Output argument; one ``[(min_x, min_y), width, height]`` entry is
        appended per fan.

    Returns
    -------
    list
        One cropped uint8 array per fan, or ``[None]`` when the tile has none.
    """
    row = tiles_coord.iloc[irow]
    img8 = get_image(tiles_coord, jp, irow)
    cropped = []
    myfans = fan[fan.tile_id == row.tile_id]

    for _, fan0 in myfans.iterrows():
        x,y = fan_mask(fan0, row)
        # Pull vertices that fall outside the tile back inside it.
        x = np.where(x<0, 1, x)
        y = np.where(y<0, 1, y)
        x = np.where(x>nx, nx - 1, x)
        y = np.where(y>ny, ny - 1, y)
        min_x, max_x = int(min(x)), int(max(x))
        min_y, max_y = int(min(y)), int(max(y))
#         min_x, min_y, max_x, max_y = expand_bbox(min_x, min_y, max_x, max_y, scale = 1.2)
        cropped.append(img8[min_y:max_y, min_x:max_x])
        loc_list.append([(min_x, min_y), abs(max_x - min_x), abs(max_y - min_y)])

    return cropped if cropped else [None]

def intersection(arr1, arr2):
    """Return the rows that occur in both inputs.

    Each input is an iterable of arrays. Every element is flattened into one
    DataFrame row, duplicates are dropped, and the two frames are inner-merged
    on their shared columns.

    Returns
    -------
    DataFrame with one row per common row (empty when there is none).
    """
    def _distinct_rows(rows):
        flattened = [r.flatten() for r in rows]
        return pd.DataFrame(flattened).drop_duplicates()

    left = _distinct_rows(arr1)
    right = _distinct_rows(arr2)
    return left.merge(right)

def random_square_edges(threshold_size = 25):
    """Draw a random axis-aligned box inside the tile.

    Two corner points are drawn with ``randint`` and redrawn until both sides
    of the box are at least ``threshold_size`` pixels long (and at least 1, so
    the box is never empty).

    Returns
    -------
    (x_low, x_high, y_low, y_high)

    Raises
    ------
    ValueError
        If no box of the requested size fits in the tile. The earlier version
        retried by recursion and ended in a RecursionError in that case; it
        also left the result unbound when two coordinates were equal.
    """
    min_side = max(threshold_size, 1)
    # randint(0, n - 1) yields values in [0, n - 2], so n - 2 is the longest
    # side that can ever be drawn.
    if min_side > min(nx, ny) - 2:
        raise ValueError(
            "threshold_size {} does not fit in a {}x{} tile".format(threshold_size, nx, ny))

    while True:
        x1 = randint(0, nx - 1)
        y1 = randint(0, ny - 1)
        x2 = randint(0, nx - 1)
        y2 = randint(0, ny - 1)
        if abs(y2-y1) >= min_side and abs(x2-x1) >= min_side:
            break

    return min(x1, x2), max(x1, x2), min(y1, y2), max(y1, y2)

def square_generator(xmin, xmax, ymin, ymax):
    """List every grid point of a rectangle, column by column.

    For each x in ``arange(xmin, xmax, 1)`` all y in ``arange(ymin, ymax, 1)``
    are emitted, so x changes slowest.

    Returns
    -------
    (x, y) : two float arrays of equal length, one entry per grid point
        (both empty when either range is empty).

    Notes
    -----
    Built with ``repeat``/``tile`` instead of growing the arrays with
    ``np.append`` inside a loop, which copied the whole result on every
    iteration. The values and dtype are unchanged.
    """
    xrun = np.arange(xmin, xmax, 1)
    yrun = np.arange(ymin, ymax, 1)
    x = np.repeat(xrun, len(yrun)).astype(float)
    y = np.tile(yrun, len(xrun)).astype(float)
    return x,y

def extract_random_box(tiles_coord, jp, irow, loc_list ,check_intersection = False):
    """Crop a random box from a tile, optionally avoiding every fan.

    Parameters
    ----------
    tiles_coord : DataFrame of tiles
    jp : image handle for the tile's observation; when None it is opened with
        load_file
    irow : int
        Positional index of the tile.
    loc_list : list
        Output argument; extended in place with ``(minx, miny)``, ``width``,
        ``height`` (three items, not one nested entry).
    check_intersection : bool
        When True the box is redrawn until it shares no pixel with the
        bounding box of any fan on the tile.

    Returns
    -------
    uint8 array with the cropped pixels.

    Notes
    -----
    Fixes relative to the earlier version:
    * the retry was a recursive call that omitted ``loc_list`` and therefore
      raised TypeError on the first rejected box; it is now a loop;
    * a tile without fans no longer breaks the intersection check;
    * the tile is decoded once instead of twice, and the supplied ``jp`` is
      used instead of reopening the file.
    """
    row = tiles_coord.iloc[irow]
    if jp is None:
        jp = load_file(row.obsid)
    img8 = get_image(tiles_coord, jp, irow)

    if not check_intersection:
        x_low, x_high, y_low, y_high = random_square_edges(threshold_size = 20)
        maxx, minx = int(x_high), int(x_low)
        maxy, miny = int(y_high), int(y_low)
    else:
        # Pixels covered by the fan bounding boxes do not change between
        # attempts, so collect them once.
        myfans = fan[fan.tile_id == row.tile_id]
        fan_pixels = []
        for _, fan0 in myfans.iterrows():
            x,y = fan_mask(fan0, row)
            x = np.where(x<0, 1, x)
            y = np.where(y<0, 1, y)
            x = np.where(x>nx, nx - 1, x)
            y = np.where(y>ny, ny - 1, y)
            xtemp, ytemp = square_generator(min(x), max(x), min(y), max(y))
            fan_pixels.append(np.vstack((np.round(xtemp), np.round(ytemp))).T)
        arr1 = np.vstack(fan_pixels) if fan_pixels else np.empty((0, 2))

        while True:
            x_low, x_high, y_low, y_high = random_square_edges(threshold_size = 20)
            X,Y = square_generator(x_low, x_high, y_low, y_high)
            if len(arr1) == 0:
                break  # no fan pixels to avoid
            arr2 = np.vstack((X,Y)).T
            if intersection(arr1, arr2).size == 0:
                break
            # Box overlaps a fan: draw another one.

        # Same bounds as before: taken from the generated grid, whose last
        # coordinate is one below x_high / y_high.
        maxx, minx = int(max(X)), int(min(X))
        maxy, miny = int(max(Y)), int(min(Y))

    width = abs(maxx - minx)
    height = abs(maxy - miny)
    loc_list += [(minx, miny), width, height]

    return img8[miny:maxy, minx:maxx]

def plot_bar(y, loc='left', relative=True):
    """Draw a bar chart of label frequencies on the current matplotlib axes.

    Parameters
    ----------
    y : array-like of labels
    loc : {'left', 'right'}
        Side of each tick the bars are shifted to, so two calls (e.g. train
        and test) can share one plot.
    relative : bool
        True plots percentages of ``len(y)``, False plots raw counts.

    Raises
    ------
    ValueError
        If ``loc`` is neither 'left' nor 'right'. Previously the bar offset
        was simply left undefined for other values.
    """
    offsets = {'left': -0.5, 'right': 0.5}
    if loc not in offsets:
        raise ValueError("loc must be 'left' or 'right', got {!r}".format(loc))
    n = offsets[loc]
    width = 0.35

    # np.unique returns the labels already sorted, with matching counts.
    unique, counts = np.unique(y, return_counts=True)

    if relative:
        # plot as a percentage
        counts = 100*counts/len(y)
        ylabel_text = '% count'
    else:
        # plot counts
        ylabel_text = 'count'

    xtemp = np.arange(len(unique))

    plt.bar(xtemp + n*width, counts, align='center', alpha=.7, width=width)
    plt.xticks(xtemp, unique)
    plt.xlabel('Label Type')
    plt.ylabel(ylabel_text)

In [None]:
# Re-reads the same blotch CSV that the catalog-loading cell above already
# loaded into `blotch`; kept so this cell can be run on its own.
blotch = pd.read_csv(P4data_location + "/P4_catalog_v1.1_L1C_cut_0.5_blotch.csv")

def extract_blotch_box_marking_id(tiles_coord, jp, irow):
    """List the marking ids of every blotch on one tile.

    Parameters
    ----------
    tiles_coord : DataFrame of tiles with a ``tile_id`` column
    jp : unused; kept so existing callers do not have to change
    irow : int
        Positional index of the tile.

    Returns
    -------
    list
        ``marking_id`` values in catalog order, or ``[None]`` when the tile
        has no blotches.

    Notes
    -----
    The earlier version reopened the JP2 and decoded the tile twice before
    throwing the pixels away. Only the ``blotch`` table is needed, so that
    work (and the implicit download of a missing file) is gone.
    """
    tile_id = tiles_coord.iloc[irow].tile_id
    marking_id_list = blotch.loc[blotch.tile_id == tile_id, "marking_id"].tolist()
    return marking_id_list if marking_id_list else [None]

def extract_blotch_box(tiles_coord, jp, irow, loc_list):
    """Crop the bounding box of every blotch on one tile.

    Parameters
    ----------
    tiles_coord : DataFrame of tiles
    jp : image handle passed on to get_image
    irow : int
        Positional index of the tile. The row is read with ``iloc``, matching
        how get_image selects the pixels, so boxes and image always refer to
        the same tile.
    loc_list : list
        Output argument; one ``[(min_x, min_y), width, height]`` entry is
        appended per blotch.

    Returns
    -------
    list
        One cropped uint8 array per blotch, or ``[None]`` when the tile has
        none.
    """
    row = tiles_coord.iloc[irow]
    img8 = get_image(tiles_coord, jp, irow)
    cropped = []
    myblotches = blotch[blotch.tile_id == row.tile_id]

    for _, blotch0 in myblotches.iterrows():
        x,y = blotch_mask(blotch0, row)
        # Pull vertices that fall outside the tile back inside it.
        x = np.where(x<0, 1, x)
        y = np.where(y<0, 1, y)
        x = np.where(x>nx, nx - 1, x)
        y = np.where(y>ny, ny - 1, y)
        min_x, max_x = int(min(x)), int(max(x))
        min_y, max_y = int(min(y)), int(max(y))
#         min_x, min_y, max_x, max_y = expand_bbox(min_x, min_y, max_x, max_y, scale = 1.2)
        cropped.append(img8[min_y:max_y, min_x:max_x])
        loc_list.append([(min_x, min_y), abs(max_x - min_x), abs(max_y - min_y)])

    return cropped if cropped else [None]
    
# for i in tqdm([1,2,5,20,29,100,899, 1000, 1001, 1002, 1003, 1004, 1005]):
#     irow = i
#     row = tiles_coord.iloc[irow]
#     jp = load_file(row.obsid)
#     img = get_image(tiles_coord, jp, irow)
#     marking_id_list = extract_blotch_box_marking_id(tiles_coord, jp, irow)
#     loc_list = []
#     blotch_boxes = extract_blotch_box(tiles_coord, jp, irow, loc_list)
# #     print(loc_list)
#     count = 0
#     for image in blotch_boxes:
#         if image is None:
#             continue
#         plt.figure()
#         plt.title(marking_id_list[count])    
#         count += 1
#         plt.imshow(image)
# #         cv2_imshow(image)

In [None]:
def process_image(img0):
    """Convert a colour crop to grayscale and resize it to ``rescale_size``.

    Parameters
    ----------
    img0 : colour image array accepted by ``cv2.cvtColor``

    Returns
    -------
    The array produced by skimage's ``resize`` with anti-aliasing enabled.
    """
    # NOTE(review): the conversion assumes BGR channel order -- confirm that
    # the crops coming from the JP2 reader are not RGB.
    grey = cv2.cvtColor(img0, cv2.COLOR_BGR2GRAY)
    return resize(image = grey, output_shape = rescale_size, anti_aliasing=True)

# irow = 8
# row = tiles_coord.iloc[irow]
# jp = load_file(row.obsid)

# plt.figure(figsize = (16,9))
# loc_list = []
# random_img = extract_fan_box(tiles_coord, jp, irow, loc_list)[0]
# plt.imshow(process_image(random_img))
# plt.figure()
# plt.imshow(random_img)
# plt.show()

In [None]:
# Quick summary of the machine and the catalog size.
print("Number of CPU processors:", num_cores)

print("No. Tiles:", len(tiles_coord))
# Counts the rows of the fan table only; blotch rows are not included.
print("Total Sample Size:", len(fan))


class HogTransformer(BaseEstimator, TransformerMixin):
    """
    Expects an array of 2d arrays (1 channel images)
    Calculates hog features for each img

    The constructor arguments are stored unchanged and passed to
    ``skimage.feature.hog``. ``y`` is accepted and stored but never used.
    """

    def __init__(self, y=None, orientations=9,
                 pixels_per_cell=(8, 8),
                 cells_per_block=(3, 3), block_norm='L2-Hys'):
        self.y = y
        self.orientations = orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block
        self.block_norm = block_norm

    def fit(self, X, y=None):
        """Nothing is learned; returns self so the transformer fits in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return an array with one HOG feature vector per image in ``X``."""
        # The earlier try / bare-except ran exactly the same code in both
        # branches: it could not recover from anything, doubled the work before
        # failing and swallowed KeyboardInterrupt on the first pass.
        return np.array([
            hog(img,
                orientations=self.orientations,
                pixels_per_cell=self.pixels_per_cell,
                cells_per_block=self.cells_per_block,
                block_norm=self.block_norm)
            for img in X
        ])
        
        
# HOG feature extractor and feature scaler instances.
hogify = HogTransformer(
            pixels_per_cell=(15, 15),
            cells_per_block=(3,3),
            orientations=9,
            block_norm='L2-Hys')

scalify = StandardScaler()

# Empty frame with the columns of the extracted data set.
# Note: this rebinds `data`, which the import cell bound to skimage's `data`
# module.
data = pd.DataFrame({'tile_id' : [], "marking_id": [], "label": [], "item_loc": [],"img": []})

# data_size = 2500

# skip_factor = len(tiles_coord) // data_size

# # print(f"Extracting a tile every {skip_factor} tiles")

# def parallel_extract(i, rescale_size, tiles_coord):
#     data = pd.DataFrame({'tile_id' : [], "marking_id": [], "label": [], "item_loc": [],"img": []})
#     irow = i  + 2500 + 2500 + 2500 + 2500 + 2500 + 2500 + 2500
#     # A + B + C + D + E + F + G => H
#     row = tiles_coord.iloc[irow]
#     jp = load_file(row.obsid)
    
#     marking_id_list = extract_fan_box_marking_id(tiles_coord, jp, irow)
#     marking_id_list_blotch = extract_blotch_box_marking_id(tiles_coord, jp, irow)
#     fan_locations = []
#     blotch_locations = []
#     fan_boxes = extract_fan_box(tiles_coord, jp, irow, fan_locations)
#     blotch_boxes = extract_blotch_box(tiles_coord, jp, irow, blotch_locations)
    
#     if marking_id_list[0] is None:
#         marking_id_list.pop(0)
        
#     if marking_id_list_blotch[0] is None:
#         marking_id_list_blotch.pop(0)
    
# #     assert len(marking_id_list) == len(fan_boxes)
    
#     for k in range(len(marking_id_list_blotch)):
        
#         blotch_box = process_image(blotch_boxes[k])
#         blotch_box = hog(blotch_box,
#                          orientations= 9,
#                          pixels_per_cell= (8, 8),
#                          cells_per_block= (3, 3),
#                          block_norm = 'L2-Hys')
        
#         boxed_blotch = pd.DataFrame({'tile_id' : [tiles_coord.iloc[irow].tile_id],
#                             "marking_id": [marking_id_list_blotch[k]],
#                             "label": ["notfan"], 
#                             "item_loc": [blotch_locations[k]],
#                             "img": [blotch_box]})
#         data = pd.concat([data, boxed_blotch])
    
#     for j in range(len(marking_id_list)):
        
#         fan_box = process_image(fan_boxes[j])
        
#         fan_box = hog(fan_box,
#                         orientations= 9,
#                         pixels_per_cell= (8, 8),
#                         cells_per_block= (3, 3),
#                         block_norm = 'L2-Hys')
        
# #         random_box_loc = []
# #         random_box = extract_random_box(tiles_coord, load_file(tiles_coord.iloc[irow].obsid), irow, random_box_loc)
# #         random_box = process_image(random_box)
        
# #         random_box2_loc = []
# #         random_box2 = extract_random_box(tiles_coord, load_file(tiles_coord.iloc[irow].obsid), irow, random_box2_loc)
# #         random_box2 = process_image(random_box2)

# #         random_box3_loc = []
# #         random_box3 = extract_random_box(tiles_coord, load_file(tiles_coord.iloc[irow].obsid), irow, random_box3_loc)
# #         random_box3 = cv2.cvtColor(random_box3, cv2.COLOR_BGR2GRAY)
# #         random_box3 = resize(image = random_box3, output_shape = rescale_size, anti_aliasing=True)        
        
#         boxed_fan = pd.DataFrame({'tile_id' : [tiles_coord.iloc[irow].tile_id],
#                             "marking_id": [marking_id_list[j]],
#                             "label": ["fan"], 
#                             "item_loc": [fan_locations[j]],
#                             "img": [fan_box]})
#         data = pd.concat([data, boxed_fan])
        
# #         random = pd.DataFrame({'tile_id' : [tiles_coord.iloc[irow].tile_id],
# #                             "marking_id": ["R" + marking_id_list[j][1:]],
# #                             "label": ["notfan"], 
# #                             "item_loc": [random_box_loc],
# #                             "img": [random_box]})
# #         data = pd.concat([data, random])
        
        
# #         random2 = pd.DataFrame({'tile_id' : [tiles_coord.iloc[irow].tile_id],
# #                             "marking_id": ["E" + marking_id_list[j][1:]],
# #                             "label": ["notfan"], 
# #                             "item_loc": [random_box2_loc],
# #                             "img": [random_box2]})
# #         data = pd.concat([data, random2])

# #         random3 = pd.DataFrame({'tile_id' : [tiles_coord.iloc[irow].tile_id],
# #                             "marking_id": ["Q" + marking_id_list[j][1:]],
# #                             "label": ["notfan"], 
# #                             "item_loc": [random_box3_loc],
# #                             "img": [random_box3]})
# #         data = pd.concat([data, random3])
#     return data

# # with parallel_backend('threading', n_jobs=4):
# current = Parallel(n_jobs = 50, verbose=8, prefer = 'threads')(delayed(parallel_extract)\
#                                                   (i, rescale_size, tiles_coord) for i in range(data_size))
# all_data = pd.concat(current)
# data = pd.concat([data, all_data])
# data.reset_index(drop=True, inplace=True)
# print("Current Sample Size:" , data.shape[0])

# t = time.time()
# path_to_data = drive_location + f"/data{data_size}H.p"
# # path_to_data = drive_location + f"/test1025.p"
# data.to_pickle(path_to_data, protocol = 3)
# print(f"Write time: {round(time.time() - t, 2)}s")

# data

In [None]:
# Load a previously extracted data set from a pickle on the data drive.
# Only used to build the file name below ("data2000A.p").
data_size_to_extract = 2000
t = time.time()
path_to_data = drive_location + f"/data{data_size_to_extract}A.p"
# NOTE(review): unpickling can execute arbitrary code -- only read pickle
# files that were produced by this project.
data = pd.read_pickle(path_to_data)
print(f"Read time: {round(time.time() - t, 2)}s")
print("Total Sample Size:" , data.shape[0])
# data = data.head(8333)
# With the subsetting line above commented out, this prints the same number as
# "Total Sample Size".
print("Current Sample Size:" , data.shape[0])
data

In [None]:
# Model inputs (the 'img' column) and targets (the 'label' column).
X = data['img']
y = data['label']

In [None]:
# Random state shared by the split below and the classifiers defined later.
seed = 42

# X is a Series, so X[0] looks up the entry with index label 0.
print(X[0].shape)
print(X.shape)

# 80/20 shuffled split. NOTE(review): no `stratify=` is passed, so the class
# proportions of train and test may differ.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=seed,
)

img = X[0]
 
# scale down the image to one third
img = rescale(img, 1/3, mode='reflect') 

# calculate the hog and return a visual representation.
img_hog, img_hog_img = hog(img,
                           pixels_per_cell=(12, 12),
                           cells_per_block=(2,2),
                           orientations=8,
                           visualize=True,
                           block_norm='L2-Hys')
 
# Side by side: the rescaled sample and the rendering of its HOG descriptor.
fig, ax = plt.subplots(1,2)
fig.set_size_inches(16,12)
# # remove ticks and their labels
# [a.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
#     for a in ax]
 
ax[0].imshow(img)
ax[0].set_title('img')
ax[1].imshow(img_hog_img, cmap='gray')
ax[1].set_title('img hog')
print('\nNumber of pixels: ', img.shape[0] * img.shape[1])
print('Number of hog features: ', img_hog.shape[0])

# Label distribution of the train and test sets, drawn as paired bars.
plt.figure(figsize = (12,6))
plt.suptitle("Photo's per type")
plot_bar(y_train, loc='left')
plot_bar(y_test, loc='right')
plt.legend([
    'train ({0} photos)'.format(len(y_train)),
    'test ({0} photos)'.format(len(y_test))
]);

print(len(X_test))
print(len(y_test))

In [None]:
class HogTransformer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that turns images into HOG feature vectors.

    Expects an iterable of 2d arrays (1 channel images) and calculates the
    HOG descriptor of each image with the configured parameters.

    Parameters
    ----------
    y : object, optional
        Stored on the instance for API compatibility; not used by ``fit``
        or ``transform``.
    orientations, pixels_per_cell, cells_per_block, block_norm
        Forwarded unchanged to ``hog`` for every image.
    """

    def __init__(self, y=None, orientations=9,
                 pixels_per_cell=(8, 8),
                 cells_per_block=(3, 3), block_norm='L2-Hys'):
        # Attribute names must mirror the constructor arguments so that
        # BaseEstimator.get_params / set_params (and GridSearchCV) work.
        self.y = y
        self.orientations = orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block
        self.block_norm = block_norm

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data. Returns ``self``."""
        return self

    def _hog_features(self, img):
        """Return the HOG descriptor of a single image."""
        return hog(img,
                   orientations=self.orientations,
                   pixels_per_cell=self.pixels_per_cell,
                   cells_per_block=self.cells_per_block,
                   block_norm=self.block_norm)

    def transform(self, X, y=None):
        """
        Compute one HOG descriptor per image in ``X``.

        Returns a numpy array built from the per-image descriptors, in input
        order. Any exception raised by ``hog`` propagates to the caller.
        """
        # The previous implementation wrapped this in a bare ``except`` whose
        # handler re-ran the exact same expression. That doubled the work on
        # failure and swallowed KeyboardInterrupt on the first pass, without
        # ever recovering from anything.
        return np.array([self._hog_features(img) for img in X])


# Stand-alone preprocessing stages: HOG extraction followed by per-feature
# standardisation. They are only exercised by the commented lines below.
hogify = HogTransformer(orientations=9,
                        pixels_per_cell=(15, 15),
                        cells_per_block=(3, 3),
                        block_norm='L2-Hys')
scalify = StandardScaler()

# X_train_hog = hogify.fit_transform(X_train)
# X_train_processed = scalify.fit_transform(X_train_hog)

In [None]:
def _make_sgd(loss):
    """Return an SGDClassifier; every variant below differs only in `loss`."""
    return SGDClassifier(loss=loss, max_iter=1000, tol=1e-2,
                         random_state=seed, warm_start=True,
                         n_iter_no_change=100, verbose=0)


def _make_forest(n_estimators):
    """Return a seeded RandomForestClassifier with the given number of trees."""
    return RandomForestClassifier(n_estimators=n_estimators, random_state=seed)


# Candidate estimators, all seeded with the notebook-wide `seed`.
SGD = _make_sgd("hinge")

LR = LogisticRegression(solver='saga', random_state=seed, verbose=0)

SVCl = SVC(kernel='rbf', random_state=seed)

# Random forests of increasing / decreasing size for comparison.
RF, RF1, RF2, RF3 = (_make_forest(n) for n in (100, 1000, 10, 10000))

KNN = KNeighborsClassifier()

# NOTE: the variable name below contains the digit '1' ("SGD1og"), not the
# letter 'l'; it is kept as-is because other cells may reference it.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" --
# confirm against the installed version before upgrading.
SGD1og = _make_sgd("log")

SGDmodified_huber = _make_sgd("modified_huber")

SGDsquared_hinge = _make_sgd("squared_hinge")

SGDperceptron = _make_sgd("perceptron")

MLP = MLPClassifier(max_iter=200, random_state=seed)

# models = []
# models.append(("SGD", SGD))
# models.append(('LR', LR))
# models.append(('NB', GaussianNB()))
# models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('KNN', KNN))
# models.append(('CART', DecisionTreeClassifier(random_state = seed)))
# models.append(('RF', RF))
# models.append(('SVM', SVCl))
# models.append(("MLP", MLP))


# # variables to hold the results and names
# results0 = []
# names   = []

# # 10-fold cross validation
# def ml_comparison(name, model, seed):
#     scoring = "accuracy"
#     t = time.time()
#     kfold = KFold(n_splits=10, random_state = seed)
#     cv_results = cross_val_score(model, X_train_processed, y_train, cv=kfold, scoring=scoring)
#     run_time = round(time.time() - t)
#     return cv_results, name, run_time, cv_results.mean()


# jobs = 1

# if num_cores//2 < len(models):
#     jobs = num_cores//2
# else:
#     jobs = len(models)

# print(f"Using {jobs} cores")
    
# current = Parallel(n_jobs = jobs, verbose=12)(delayed(ml_comparison)(name, model, seed) for name, model in models)

# while current[0] == []:
#     current.pop(0)

# temp = list(zip(*current))
# results0 = temp[0]
# names = temp[1]
# run_times = temp[2]    
# accuracies = temp[3]    

# for i in range(len(run_times)):
#     left_aligned = f"{names[i]}:"
#     center = f"{round(accuracies[i], 5)}"
#     right_aligned = f"({run_times[i]}s)"
#     print(f"{left_aligned:<10}{center:^10}{right_aligned:>10}")


In [None]:
# plotdata = []
# plotdata.append(("SGD", 0.75203, 938))
# plotdata.append(("LR", 0.75968, 654))
# plotdata.append(("NB", 0.68227, 23))
# plotdata.append(("LDA", 0.68017, 688))
# plotdata.append(("KNN", 0.65467, 901))
# plotdata.append(("CART", 0.67747, 742))
# plotdata.append(("RF", 0.79823, 453))
# plotdata.append(("SVM", 0.82613, 3481))
# plotdata.append(("MLP", 0.80288, 404))
# df = list(zip(*plotdata))
# import matplotlib.cm as cm
# plt.figure(figsize = (16,8))
# plt.scatter(x=df[2], y=df[1], s = 100, color = plt.get_cmap('plasma')(np.linspace(0, 1, len(df[1]))))
# plt.xscale('log')
# plt.xlabel("Log Runtime (s)")
# plt.ylabel("Accuracy (F1-Score)")
# plt.xlim(0,10500)
# plt.ylim(0.64,0.86)


# fontsize = 12
# weight = 'bold'

# i = 0
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(0,10), # distance from text to points (x,y)
#              ha='center',
#              weight = weight, fontsize = fontsize)

# i = 1
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(0,10), # distance from text to points (x,y)
#              ha='center',
#              weight = weight, fontsize = fontsize)

# i = 2
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(0,10), # distance from text to points (x,y)
#              ha='center',
#              weight = weight, fontsize = fontsize)

# i = 3
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(0,10), # distance from text to points (x,y)
#              ha='right',
#              weight = weight, fontsize = fontsize)

# i = 4
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(10,-10), # distance from text to points (x,y)
#              ha='left',
#              weight = weight, fontsize = fontsize)

# i = 5
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(10,-10), # distance from text to points (x,y)
#              ha='left',
#              weight = weight, fontsize = fontsize)

# i = 6
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(10,-10), # distance from text to points (x,y)
#              ha='left',
#              weight = weight, fontsize = fontsize)

# i = 7
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(10,-20), # distance from text to points (x,y)
#              ha='left',
#              weight = weight, fontsize = fontsize)

# i = 8
# x = df[2][i]
# y = df[1][i]
# z = df[0][i]
# label = f"{z}\n{x}s"

# # this method is called for each point
# plt.annotate(label, # this is the text
#              (x,y), # this is the point to label
#              textcoords="offset points", # how to position the text
#              xytext=(0,10), # distance from text to points (x,y)
#              ha='center',
#              weight = weight, fontsize = fontsize)
# plt.title("Accuracy vs Runtime of different ML Models")
# plt.savefig('fig12.pdf')
# plt.show()

In [None]:
# import seaborn as sns
# import matplotlib.patches as patches
# custom_style = {'axes.labelcolor': 'white',
#                 'xtick.color': 'white',
#                 'ytick.color': 'white'}

# # boxplot algorithm comparison
# fig = plt.figure(figsize = (15,6))
# fig.suptitle('ML Algorithm Comparison (1% Full Data)')
# sns.set(style="ticks", rc = custom_style, palette="pastel")
# ax = sns.boxplot(data = results0)
# ax.set_xticklabels(names)
# ax.patch.set_facecolor('black')
# ax.set_ylabel("Accuracy")
# ax.set_xlabel("Model")

# from matplotlib.legend_handler import HandlerBase
# from matplotlib.text import Text
# from matplotlib.legend import Legend


# class TextHandlerB(HandlerBase):
#     def create_artists(self, legend, text ,xdescent, ydescent,
#                         width, height, fontsize, trans):
#         tx = Text(width/2.,height/2, text, fontsize=fontsize,
#                   ha="center", va="center", fontweight="bold", color = 'white')
#         return [tx]

# Legend.update_default_handler_map({str : TextHandlerB()})
    
    
# handles = ["SGD", "LR", "NB", "LDA", "KNN", "CART", "RF", "SVM","MLP"]
# labels = ["Stochastic Gradient Descent", "Logistic Regression", "Gaussian Naive Bayes", "Linear Discriminant Analysis",
#           "K-Nearest Neighbours", "Decision Tree Classifier", "Random Forest Classifier",
#           "Support Vector Machines","Multi-Layer Perceptron"]

# l = ax.legend(handles=handles, labels=labels, framealpha = 0.1, borderpad = 0.8)

# for text in l.get_texts():
#     text.set_color("white")

# plt.savefig("fig8.pdf")
# plt.show()

In [None]:
# End-to-end model: HOG features -> standardisation -> classifier.
# Other final steps that were tried in place of MLP: SGD, SVCl, LR.
HOG_pipeline = Pipeline(steps=[
    ('hogify', HogTransformer(orientations=20,
                              pixels_per_cell=(15, 15),
                              cells_per_block=(3, 3),
                              block_norm='L2-Hys')),
    ('scalify', StandardScaler()),
    ('classify', MLP),
])

# Search space for the optional grid search; the 'classify' entry swaps the
# final pipeline step between several estimators.
param_grid = [
    {
        'hogify__orientations': [9, 18],
        'hogify__cells_per_block': [(3, 3), (2, 2)],
        'hogify__pixels_per_cell': [(15, 15), (9,9), (18,18)],
        'classify': [SGD, LR, RF, KNN],
    },
]

# Built but not fitted by default (see the commented fit below).
grid_search = GridSearchCV(estimator=HOG_pipeline,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           n_jobs=40,
                           verbose=10,
                           return_train_score=True)

t1 = time.time()
# clf = grid_search.fit(X_train, y_train) # Usually 4% more than HOG_pipeline direct
clf = HOG_pipeline.fit(X_train, y_train)
t2 = time.time()
print(f"Total Runtime: {round(t2-t1, 2)}s\n")

In [None]:
# print("Best Config:", clf.best_params_)
# print(clf.best_estimator_)
# print(f'Train Data Best Score: {clf.best_score_ * 100}')

# Score the fitted pipeline on the held-out split.
y_pred = clf.predict(X_test)
index = X_test.index[0]
# Predictions keyed by the original row index of each test sample.
data_predictions = pd.DataFrame(y_pred, index=X_test.index)
cmx = confusion_matrix(y_test, y_pred)

print('X_test Percentage correct: ', accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", cmx)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Annotated 2x2 confusion-matrix heatmap for the held-out test split.
# Each cell shows its name, raw count and share of all test samples.
plt.figure(figsize = (10,7))

# Names are listed in row-major order of `cmx` (rows = true label).
group_names = [
    'True Positive',
    'False Negative',
    'False Positive',
    'True Negative',
]
group_counts = list(map("{0:0.0f}".format, cmx.flatten()))
group_percentages = list(map("{0:.2%}".format, cmx.flatten()/np.sum(cmx)))

labels = []
for v1, v2, v3 in zip(group_names, group_counts, group_percentages):
    labels.append(f"{v1}\n{v2}\n{v3}")
labels = np.asarray(labels).reshape(2,2)

# fmt='' because the annotations are already formatted strings.
sns.heatmap(cmx, annot=labels, fmt='', cmap='Blues')
plt.title("Percentage Confusion Matrix of Test Data")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.savefig("cmx1.pdf")
plt.show()

In [None]:
# Draw the predicted label of (at most) the first 200 test samples, ordered
# by tile, on top of the tile image they were cut from; one figure and one
# "<tile_id>.pdf" file per tile.
#
# NOTE: this cell reads the global `y_test`; a later cell rebinds `y_test`
# to the labels of `predicted_fans`, so run this before that cell.
max_iters = min(len(X_test), 200)
results = pd.DataFrame({'tile_id': data.tile_id, 'item_loc': data.item_loc, 'prediction': y_pred,
                        'actual': y_test}, index = X_test.index)
results = results.sort_values(by=['tile_id']).head(max_iters)

fontsize = 12
h_offset = 0.3

# Iterating per tile (instead of per box) loads each tile image once and
# writes each PDF once; it is also a no-op when `results` is empty.
# sort=False keeps the order established by sort_values above.
for tile_id, tile_results in tqdm(results.groupby('tile_id', sort=False),
                                  total=results['tile_id'].nunique()):
    row = tiles_coord[tiles_coord.tile_id == tile_id].squeeze()
    irow = row.name
    jp = load_file(row.obsid)
    img = get_image(tiles_coord, jp, irow)

    fig, ax = plt.subplots(figsize=(12,12))
    ax.imshow(img)
    ax.set_title(f'{tile_id}')

    for index, result in tile_results.iterrows():
        # item_loc is laid out as [(minx, miny), width, height].
        rect = Rectangle(result.item_loc[0], width = result.item_loc[1],
                         height = result.item_loc[2],
                         fill=False, edgecolor='blue', linewidth=1.5)
        ax.add_patch(rect)

        # Only the prediction is drawn; rendering of the ground-truth
        # label (result.actual) above the box is intentionally disabled.
        label_ = f"{result.prediction}"
        ax.text(result.item_loc[0][0] + result.item_loc[1] - 9 * len(label_),
                result.item_loc[0][1] - h_offset * fontsize, label_,
                fontsize= fontsize, weight='bold', color = "darkred")

    # Save once, after every box of this tile has been drawn.
    fig.savefig(f"{tile_id}.pdf")
plt.show()
results

In [None]:
# def extract_image(tile_id, item_loc):
#     row = tiles_coord[tiles_coord.tile_id == tile_id].squeeze()
#     irow = row.name
#     jp = load_file(row.obsid)
#     img8 = get_image(tiles_coord, jp, irow)
#     minx = item_loc[0][0]
#     miny = item_loc[0][1]
#     maxx = minx + item_loc[1]
#     maxy = miny + item_loc[2]
#     minx, miny, maxx, maxy = expand_bbox(minx, miny, maxx, maxy, scale = 1.2)
#     processing0 = img8[miny:maxy, minx:maxx]
#     processing1 = cv2.cvtColor(processing0, cv2.COLOR_BGR2GRAY)
#     processing2 = resize(image = processing1, output_shape = rescale_size, anti_aliasing=True)
#     processing3 = hog(processing2,
#                     orientations= 9,
#                     pixels_per_cell= (8, 8),
#                     cells_per_block= (3, 3),
#                     block_norm = 'L2-Hys')
        
        
#     extracted_image = processing3
#     row['img'] = extracted_image
#     return extracted_image
# # USE combined_classify_false_negative.json
# # file_str = "combined_classify_false_negative.json"
# # file_str = "classify_false_negative.json"
# file_str = "fan_classify_false_negative.json"
# with open(file_str, "r") as f:
#     load = json.load(f)
#     f.close()

# predicted_fans = pd.DataFrame(load)#.sort_values(by=['tile_id'])
# predicted_fans["isfan"].replace({0: "notfan", 1: "fan"}, inplace=True)

# # [(minx, miny), width, height]
# item_loc = [(predicted_fans.bbox[0][0], predicted_fans.bbox[0][1]),
#             predicted_fans.bbox[0][2],
#             predicted_fans.bbox[0][3]]

# predicted_fans.loc[:, 'item_loc'] = predicted_fans.bbox.map(lambda x: [(int(x[0]), int(x[1])),
#                                                                         int(x[2]), int(x[3])])
# predicted_fans = predicted_fans.drop(columns = ['bbox']).rename(columns={'isfan':'label'})

# predicted_fans['img'] = 1

# predicted_fans = predicted_fans.loc[~predicted_fans['tile_id'].str.contains(r'_{1}')]  
# predicted_fans.reset_index(drop=True, inplace=True)

# print(f"Tasks: {predicted_fans.shape[0]}" )

# extracted_list = Parallel(n_jobs = 50, verbose=6, prefer = 'threads')\
#                     (delayed(extract_image)(predicted_fans.iloc[i].tile_id,
#                                             predicted_fans.iloc[i].item_loc) for i in range(predicted_fans.shape[0]))

# predicted_fans['img'] = pd.Series(extracted_list)

# t = time.time()
# path_to_class = drive_location + "/classify_false_negative_all_D.p"
# predicted_fans.to_pickle(path_to_class, protocol = 3)
# print(f"Write time: {round(time.time() - t, 2)}s")

# predicted_fans

In [None]:
# Load the externally classified detections and keep the first 1000 rows.
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only
# load pickles written by this notebook.
t = time.time()
path_to_class = "".join([drive_location, "/classify_false_negative_all_C.p"])
predicted_fans = pd.read_pickle(path_to_class)
# The timing covers the read only, not the truncation below.
print(f"Read time: {round(time.time() - t, 2)}s")
predicted_fans = predicted_fans.iloc[:1000]
predicted_fans

In [None]:
# Run the fitted pipeline on the externally classified detections and
# compare its output with their stored labels.
t = time.time()
predictions = clf.predict(predicted_fans['img'])

# Store the predictions as the third column (position 2) of the frame:
# append, detach, then re-insert at the wanted position.
predicted_fans['prediction'] = predictions
pred = predicted_fans['prediction']
predicted_fans.drop(columns=['prediction'], inplace=True)
predicted_fans.insert(loc=2, column='prediction', value=pred)

# NOTE: this rebinds the global y_test (previously the held-out split
# labels) to the labels of predicted_fans.
y_test = predicted_fans['label']
cmx = confusion_matrix(y_test, predictions)

print(f"Computing time: {round(time.time() - t, 2)}s")
print('Prediction Percentage Overlap: ', accuracy_score(predictions, y_test))
print("Confusion Matrix:\n", cmx)
# print("\nClassification Report:\n", classification_report(y_test, predictions))

In [None]:
# Annotated 2x2 heatmap comparing this model's predictions with the stored
# labels of `predicted_fans`. Cell names are in row-major order of `cmx`
# (rows = true label, columns = predicted label).
plt.figure(figsize = (10,7))

group_names = [
    'We all agree these are fan',
    'Aslesha and ground truth say fan \n I say not fan',
    'Aslesha and I say fan \n Ground truth say not fan',
    'Ground truth and I say not fans \n Aslesha says fans',
]
# group_names = ["ground truth says blotch",
#                "ground truth says blotch",
#                "ground truth says blotch",
#                "ground truth says neither",
#                "ground truth says neither",
#                "ground truth says neither",
#                "ground truth says fan",
#                "ground truth says fan",
#                "ground truth says fan"]
group_counts = list(map("{0:0.0f}".format, cmx.flatten()))
group_percentages = list(map("{0:.2%}".format, cmx.flatten()/np.sum(cmx)))

labels = []
for v1, v2, v3 in zip(group_names, group_counts, group_percentages):
    labels.append(f"{v1}\n{v2}\n{v3}")
labels = np.asarray(labels).reshape(2,2)

# fmt='' because the annotations are already formatted strings.
sns.heatmap(cmx, annot=labels, fmt='', cmap='Blues')
plt.title("Percentage Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
# max_iters  = min(len(predicted_fans['img']), 50)
# results = pd.DataFrame({'tile_id': predicted_fans.tile_id, 'item_loc': predicted_fans.item_loc,
#                         'actual': predicted_fans['label'], 
#                         'prediction': predictions}, index = predicted_fans['img'].index)

# results = results.sort_values(by=['tile_id'])
# results = results.tail(max_iters)

# fig, ax = plt.subplots(figsize=(12,12)) 
# row = tiles_coord[tiles_coord.tile_id == results.iloc[0].tile_id].squeeze()
# irow = row.name
# jp = load_file(row.obsid)
# img = get_image(tiles_coord, jp, irow)
# ax.imshow(img)
# tile_id = results.loc[results.index[0]].tile_id

# for index, result in tqdm(results.iterrows(), total=results.shape[0]):
#     row = tiles_coord[tiles_coord.tile_id == result.tile_id].squeeze()
#     irow = row.name
#     jp = load_file(row.obsid)
#     img = get_image(tiles_coord, jp, irow)
    
#     if not tile_id == result.tile_id:
#         fig, ax = plt.subplots(figsize=(12,12)) 
#         ax.imshow(img)
    
#     rect = Rectangle(result.item_loc[0], width = result.item_loc[1] , 
#                      height = result.item_loc[2],
#                      fill=False, edgecolor='blue', linewidth=1.5)
#     ax.add_patch(rect)

#     fontsize = 12
#     h_offset = 0.32
#     label = f"{result.actual}"
#     label_ = f"{result.prediction}"
#     ax.text(result.item_loc[0][0], 
#             result.item_loc[0][1] + result.item_loc[2] + 8 + h_offset * fontsize, label_, 
#             fontsize= fontsize, weight='bold', color = "blue")

#     ax.text(result.item_loc[0][0], result.item_loc[0][1] - h_offset * fontsize, label, 
#             fontsize= fontsize, weight='bold', color = "darkred")
    
#     tile_id = result.tile_id
#     ax.set_title(f'{tile_id}')
#     red_patch = mpatches.Patch(color='darkred', label='Ground truth')
#     blue_patch = mpatches.Patch(color='blue', label='Prediction')
#     ax.legend(handles=[red_patch, blue_patch], fancybox=True, framealpha=0.5, shadow=True, borderpad=1)
# plt.show()
# results

In [None]:
# Draw ground truth (dark red, above the box) and this model's prediction
# (blue, below the box) for the last 20 detections, ordered by tile, on top
# of their tile image; one figure per tile.
# NOTE: the filter that restricts the plot to detections predicted as fan
# while the ground truth says notfan is currently disabled (see the
# commented lines in the inner loop), so every selected row is drawn.
max_iters = min(len(predicted_fans['img']), 20)
results = pd.DataFrame({'tile_id': predicted_fans.tile_id, 'item_loc': predicted_fans.item_loc,
                        'actual': predicted_fans['label'],
                        'predictions': predictions}, index = predicted_fans['img'].index)
results = results.sort_values(by=['tile_id']).tail(max_iters)

fontsize = 12
h_offset = 0.32

# Legend entries are identical for every figure, so build them once.
red_patch = mpatches.Patch(color='darkred', label='Ground truth')
blue_patch = mpatches.Patch(color='blue', label='Prediction')

# Iterating per tile (instead of per box) loads each tile image once and
# is a no-op when `results` is empty. sort=False keeps the order
# established by sort_values above.
for tile_id, tile_results in tqdm(results.groupby('tile_id', sort=False),
                                  total=results['tile_id'].nunique()):
    row = tiles_coord[tiles_coord.tile_id == tile_id].squeeze()
    irow = row.name
    jp = load_file(row.obsid)
    img = get_image(tiles_coord, jp, irow)

    fig, ax = plt.subplots(figsize=(12,12))
    ax.imshow(img)
    ax.set_title(f'{tile_id}')

    for index, result in tile_results.iterrows():
#         if result.actual == "fan" or result.predictions == "notfan":
#             continue

        # item_loc is laid out as [(minx, miny), width, height].
        rect = Rectangle(result.item_loc[0], width = result.item_loc[1],
                         height = result.item_loc[2],
                         fill=False, edgecolor='blue', linewidth=1.5)
        ax.add_patch(rect)

        # Prediction goes just below the box ...
        ax.text(result.item_loc[0][0],
                result.item_loc[0][1] + result.item_loc[2] + 8 + h_offset * fontsize, f"{result.predictions}",
                fontsize= fontsize, weight='bold', color = "blue")

        # ... and the ground truth just above it.
        ax.text(result.item_loc[0][0], result.item_loc[0][1] - h_offset * fontsize, f"{result.actual}",
                fontsize= fontsize, weight='bold', color = "darkred")

    ax.legend(handles=[red_patch, blue_patch], fancybox=True, framealpha=0.5, shadow=True, borderpad=1)
plt.show()
results

In [None]:
# lab = predicted_fans['label']
# predicted_fans.drop(labels=['label'], axis=1,inplace = True)
# del predicted_fans['img']
# predicted_fans.insert(2, 'ground_truth', lab)
# t1 = time.time()
# predicted_fans.to_json("processed_classify_false_negative.json")
# t2 = time.time()
# print(f"Write Time: {t2-t1}")
# predicted_fans