In [1]:
import h5py
import pandas as pd
from pandarallel import pandarallel
import multiprocessing as mp
from glob import glob
from IPython.core.display import HTML
from tqdm import tqdm
import os
import glymur
import cv2
import requests
import json
import hashlib
import random
import itertools
from joblib import Parallel, delayed, parallel_backend
import time
import warnings
# from jupyterthemes import jtplot
import swifter

import dask.dataframe as dd

import numpy as np
from numpy.random import randint

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import Rectangle
import seaborn as sns

from scipy import ndimage as ndi

import sklearn
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier


import skimage
from skimage import data
from skimage.filters import threshold_yen, threshold_triangle, threshold_otsu, threshold_li, sobel
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops
from skimage.morphology import closing, square
from skimage.color import label2rgb
from skimage import exposure
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.measure import label
from skimage import color
from skimage.transform import resize, rescale
from skimage.feature import hog

# --- Notebook-wide configuration -------------------------------------------
# Dark matplotlib theme for every figure produced below.
plt.style.use('dark_background')
# NOTE(review): `rescale_size` is not referenced by any visible cell;
# presumably a target image size for resizing — confirm or remove.
rescale_size = (150, 270)
# Inject CSS so <pre> output keeps its whitespace instead of wrapping.
# `display` is the IPython builtin that notebooks put in scope automatically.
display(HTML("<style>pre { white-space: pre !important; }</style>"))
# Number of CPU cores reported by multiprocessing (not used by the visible cells).
num_cores = mp.cpu_count()
# Disable matplotlib's "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})
# NOTE: this silences *every* Python warning for the rest of the session,
# including deprecation and convergence warnings from scikit-learn.
warnings.filterwarnings('ignore')
# jtplot.style(theme="monokai", context="notebook", ticks=True, grid=False)

In [2]:
# Root directory of the data drive; every other path below is derived from it.
drive_location = "/export/data/utkarsh"
# Directory where download_file()/load_file() store and look up the JP2 images.
data_location = drive_location + '/data'
# Directory holding the P4 catalog CSV files read below.
P4data_location = drive_location + "/P4data"
# NOTE: the result of this glob() is discarded (it is not the last expression
# of the cell, so it is not displayed either).
glob(P4data_location + "/*.csv")

# Catalog tables: metadata, tile coordinates, and the fan / blotch tables.
metadata = pd.read_csv(P4data_location + '/P4_catalog_v1.1_metadata.csv')
tiles_coord = pd.read_csv(P4data_location + '/P4_catalog_v1.1_tile_coords_final.csv')
fan = pd.read_csv(P4data_location + "/P4_catalog_v1.1_L1C_cut_0.5_fan.csv")
blotch = pd.read_csv(P4data_location + "/P4_catalog_v1.1_L1C_cut_0.5_blotch.csv")

# obsid of the first tile row (not referenced again in the visible cells).
item = tiles_coord.iloc[0].obsid

In [3]:
# Shared random_state passed to the SGD and MLP classifiers defined below.
seed = 42

In [4]:
class HogTransformer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that turns images into HOG feature vectors.

    Expects an array of 2d arrays (1 channel images)
    Calculates hog features for each img

    Parameters
    ----------
    y : ignored
        Stored on the instance but never read; kept so existing constructor
        calls keep working.
    orientations, pixels_per_cell, cells_per_block, block_norm
        Forwarded unchanged to ``skimage.feature.hog``.
    """

    def __init__(self, y=None, orientations=9,
                 pixels_per_cell=(8, 8),
                 cells_per_block=(3, 3), block_norm='L2-Hys'):
        self.y = y
        self.orientations = orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block
        self.block_norm = block_norm

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an array holding one HOG descriptor per image in ``X``.

        Any exception raised by ``hog`` propagates to the caller. The previous
        implementation wrapped this in a bare ``except`` that simply re-ran the
        identical computation, which only hid interrupts and doubled the work
        before failing the same way.
        """
        return np.array([
            hog(img,
                orientations=self.orientations,
                pixels_per_cell=self.pixels_per_cell,
                cells_per_block=self.cells_per_block,
                block_norm=self.block_norm)
            for img in X
        ])


# Stand-alone HOG transformer instance (not referenced by the visible cells;
# the pipeline below builds its own HogTransformer).
hogify = HogTransformer(
            pixels_per_cell=(15, 15),
            cells_per_block=(3,3),
            orientations=9,
            block_norm='L2-Hys')

# Shared StandardScaler. Later cells call scalify.fit_transform(...) on each
# dataset, so it is re-fitted every time it is used.
scalify = StandardScaler()

In [5]:
# Linear classifier with hinge loss and elastic-net penalty. warm_start=True
# keeps learned coefficients between fit calls. In the visible cells it is
# only used inside HOG_pipeline and in commented-out training lines.
SGD =  SGDClassifier(max_iter=10, tol=1e-2, loss = "hinge", 
                     random_state = seed, warm_start = True, 
                     n_iter_no_change= 10, verbose = 0, penalty="elasticnet")


# Multi-layer perceptron; this is the estimator actually trained via
# partial_fit in the cells below.
MLP = MLPClassifier(random_state=seed, max_iter=200, warm_start = True)

# HOG features -> standardisation -> SGD classifier.
# NOTE(review): HOG_pipeline is not fitted or used in the visible cells.
HOG_pipeline = Pipeline([
    ('hogify', HogTransformer(
        pixels_per_cell=(15, 15),
        cells_per_block=(3,3),
        orientations=20,
        block_norm='L2-Hys')
    ),
    ('scalify', StandardScaler()),
    ('classify', SGD)
])

In [6]:
# First training batch: read it, standardise it, and run the first partial_fit.
# start_time is also read by the evaluation cell to report the total runtime.
start_time = time.time()
path_to_data = drive_location + f"/data25A.p"
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
data = pd.read_pickle(path_to_data)
print(f"Read time: {round(time.time() - start_time, 2)}s")

# 'img' holds the model inputs and 'label' the targets.
# NOTE(review): presumably each 'img' entry is already a flat feature vector,
# since it is fed straight into StandardScaler — confirm.
X_train = data['img']
y_train = data['label']

t0 = time.time()
# Fits the shared scaler on this batch and transforms it in one step.
X_train_processed = scalify.fit_transform(list(X_train))
print(f"Preprocessing time: {round(time.time() - t0, 2)}s")

t1 = time.time()
# First incremental fit; `classes` declares the full label set for later calls.
clf = MLP.partial_fit(X_train_processed, y_train, classes=np.unique(y_train))
# clf = SGD.partial_fit(X_train_processed, y_train, classes=np.unique(y_train))
# clf = RandomForestClassifier(n_estimators=50, random_state = seed, max_features = 10 , warm_start = True)
# clf = clf.fit(X_train_processed, y_train)
print(f"Train time: {round(time.time() - t1, 2)}s")

Read time: 0.05s
Preprocessing time: 0.58s
Train time: 0.81s


In [None]:
def my_partial_fit(path_to_data, clf, to_precict, aslesha_predict, live_predict = False, n_tree = 0):
    """Incrementally train ``clf`` on one pickled batch and optionally score it.

    Parameters
    ----------
    path_to_data : str
        Path of a pickle readable by ``pd.read_pickle`` that provides 'img'
        and 'label' columns.
    clf : estimator
        Object exposing ``partial_fit`` and ``predict``.
    to_precict : array-like
        Pre-processed test features scored when ``live_predict`` is true.
        (The misspelt name is kept so existing callers are unaffected.)
    aslesha_predict : array-like
        Pre-processed features of the ``predicted_fans`` items, also scored
        when ``live_predict`` is true.
    live_predict : bool
        When true, print accuracy against the notebook globals ``y_test`` and
        ``predicted_fans['label']`` after training; both must already exist.
    n_tree : int
        Unused; only the removed random-forest variant read it. Kept for
        call compatibility.

    Returns
    -------
    The estimator returned by ``clf.partial_fit``.
    """
    t = time.time()
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(path_to_data)
    print(f"Read time: {round(time.time() - t, 2)}s")

    X_train = data['img']
    y_train = data['label']

    t0 = time.time()
    # NOTE(review): the shared scaler is re-fitted on every batch, so each
    # batch is standardised with its own statistics — confirm this is intended.
    X_train_processed = scalify.fit_transform(list(X_train))
    print(f"Preprocessing time: {round(time.time() - t0, 2)}s")

    t1 = time.time()
    # An already-trained estimator must be given the same class list on every
    # partial_fit call. Deriving it from the current batch raises as soon as a
    # batch happens to miss a label, so reuse the estimator's own list if set.
    known_classes = getattr(clf, "classes_", None)
    if known_classes is None:
        known_classes = np.unique(y_train)
    updated_clf = clf.partial_fit(X_train_processed, y_train, classes=known_classes)
    print(f"Train time: {round(time.time() - t1, 2)}s")

    if live_predict:
        # Score the features passed in by the caller. The original body read a
        # global named `to_predict` here and silently ignored this argument.
        y_pred = updated_clf.predict(to_precict)
        print('X_test Accuracy score: ', accuracy_score(y_test, y_pred))
        predictions = updated_clf.predict(aslesha_predict)
        y_test0 = predicted_fans['label']
        print('Prediction Percentage Overlap: ', accuracy_score(predictions, y_test0))
    return updated_clf

# Load the held-out test set. X_test / y_test are notebook globals that
# my_partial_fit(live_predict=True) and the evaluation cell read.
path_to_data = drive_location + f"/test1025.p"
t2 = time.time()
data = pd.read_pickle(path_to_data) # This is our test data
print(f"Read time: {round(time.time() - t2, 2)}s")

X_test = data['img']
y_test = data['label']

In [None]:
# Load the externally classified items and standardise both evaluation sets.
t = time.time()
path_to_class = drive_location + "/classify_false_negative_all_D.p"
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
predicted_fans = pd.read_pickle(path_to_class)
# predicted_fans = predicted_fans.head(5000)
print(f"Read time: {round(time.time() - t, 2)}s")

# Time the preprocessing on its own clock; the original reused `t`, so the
# printed "Preprocessing time" also included the pickle read above.
t_pre = time.time()
# NOTE(review): each call re-fits the shared scaler on the data it transforms,
# so both sets are standardised with their own statistics — confirm intended.
to_predict = scalify.fit_transform(list(X_test))
aslesha_predict = scalify.fit_transform(list(predicted_fans['img']))
print(f"Preprocessing time: {round(time.time() - t_pre, 2)}s")
predicted_fans

In [None]:
# trees = 50

# path_to_data = drive_location + f"/data100.p"
# clf = my_partial_fit(path_to_data, clf, to_predict, aslesha_predict, live_predict = True, n_tree = trees)
# trees += 50
# print("")

# path_to_data = drive_location + f"/data100.p"
# clf = my_partial_fit(path_to_data, clf, to_predict, aslesha_predict, live_predict = True, n_tree = trees)
# trees += 50
# print("")

In [None]:
# Feed the remaining pickled batches to the classifier one at a time,
# printing live accuracy after each batch (live_predict=True).
# `trees` is passed as n_tree, which my_partial_fit only uses in
# commented-out code, so it currently has no effect.
trees = 50
# D Skipped due to image reading issues.
# NOTE(review): the comment above does not match the list below, which
# includes 'D' and omits 'E' — confirm which batch is meant to be skipped.
for letter in ['A', 'B', 'C', 'D', 'F', 'G']:
    path_to_data = drive_location + f"/data2500{letter}.p"
    clf = my_partial_fit(path_to_data, clf, to_predict, aslesha_predict, live_predict = True, n_tree = trees)
    trees += 50
    print(f"Finished {letter}\n")

In [None]:
# Final evaluation on the held-out test set.
# This re-reads the same pickle an earlier cell already loaded.
path_to_data = drive_location + f"/test1025.p"
t2 = time.time()
data = pd.read_pickle(path_to_data) # This is our test data
print(f"Read time: {round(time.time() - t2, 2)}s")

X_test = data['img']
y_test = data['label']

t1 = time.time()
# The scaler is re-fitted on the test set here before predicting.
y_pred = clf.predict(scalify.fit_transform(list(X_test)))
cmx = confusion_matrix(y_test, y_pred)
# Predictions indexed like X_test (not used by the later visible cells).
data_predictions = pd.DataFrame(data=y_pred, index= X_test.index)
print(f"Predict time: {round(time.time() - t1, 2)}s")
# start_time was set in the first training cell, so this spans the whole run.
print(f"Total Runtime: {round(time.time()-start_time, 2)}s\n")
print('X_test Accuracy score: ', accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", cmx)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Classify the predicted_fans items and compare against their 'label' column.
t = time.time()
predictions = clf.predict(aslesha_predict)
# Add the column, then drop and re-insert it so that 'prediction' ends up at
# column position 2 instead of at the end of the frame.
predicted_fans['prediction'] = predictions
pred = predicted_fans['prediction']
predicted_fans.drop(labels=['prediction'], axis=1,inplace = True)
predicted_fans.insert(2, 'prediction', pred)
y_test0 = predicted_fans['label']
# Overwrites the `cmx` from the evaluation cell; the heatmap cell plots this one.
cmx = confusion_matrix(y_test0, predictions)
print(f"Computing time: {round(time.time() - t, 2)}s")
print('Prediction Percentage Overlap: ', accuracy_score(predictions, y_test0))
print("Confusion Matrix:\n", cmx)

In [None]:
# Annotated heatmap of the most recent `cmx` (true labels on rows, predicted
# labels on columns, matching the axis labels set below).
plt.figure(figsize = (10,7))
# One caption per cell, in the row-major order produced by cmx.flatten().
# Only four captions exist, so this cell requires a 2x2 confusion matrix.
group_names = ['We all agree these are fan',
               'Aslesha and ground truth say fan \n I say not fan',
               'Aslesha and I say fan \n Ground truth say not fan',
               'Ground truth and I say not fans \n Aslesha says fans']
# group_names = ["ground truth says blotch",
#                "ground truth says blotch",
#                "ground truth says blotch",
#                "ground truth says neither",
#                "ground truth says neither",
#                "ground truth says neither",
#                "ground truth says fan",
#                "ground truth says fan",
#                "ground truth says fan"]
# Raw count for each cell, formatted without decimals.
group_counts = ["{0:0.0f}".format(value) for value in
                cmx.flatten()]
# Share of all samples that falls into each cell.
group_percentages = ["{0:.2%}".format(value) for value in
                     cmx.flatten()/np.sum(cmx)]
# Caption, count and percentage stacked on three lines per cell.
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
# Hard-coded 2x2 shape: fails if cmx has any other size.
labels = np.asarray(labels).reshape(2,2)
# fmt='' because the annotations are pre-formatted strings, not numbers.
sns.heatmap(cmx, annot=labels, fmt='', cmap='Blues')
plt.title("Percentage Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
# Class order according to the author:
#1 is notfan
#0 is fan

In [None]:
# plt.figure(figsize = (10,7))
# group_names = ['True Positive',
#                'False Negative',
#                'False Positive',
#                'True Negative']
# group_counts = ["{0:0.0f}".format(value) for value in
#                 cmx.flatten()]
# group_percentages = ["{0:.2%}".format(value) for value in
#                      cmx.flatten()/np.sum(cmx)]
# labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
#           zip(group_names,group_counts,group_percentages)]
# labels = np.asarray(labels).reshape(2,2)
# sns.heatmap(cmx, annot=labels, fmt='', cmap='Blues')
# plt.title("Percentage Confusion Matrix (Detectron fan Predictions)")
# plt.xlabel("Predicted Label")
# plt.ylabel("True Label")
# plt.savefig('cmx2.pdf')
# #1 is notfan
# #0 is fan

In [None]:
def download_file(filename):
    """Download ``<filename>_RGB.NOMAP.JP2`` from the HiRISE server into ``data_location``.

    Does nothing except print a message when the target file already exists.
    The orbit number (second ``_``-separated component of ``filename``) selects
    the ``ORB_<low>_<high>`` directory, which groups orbits in blocks of 100.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. The original code wrote the
        error page to disk as if it were the image, and the bogus file was then
        treated as a valid cached download on every later call.
    requests.RequestException
        On connection problems or timeouts.
    """
    savename = data_location + "/{filename}_RGB.NOMAP.JP2".format(filename=filename)
    if os.path.exists(savename):
        print("{} exists".format(savename))
        return
    components = filename.split("_")
    orbit = int(components[1])
    # Integer arithmetic instead of float division followed by truncation.
    low = 100 * (orbit // 100)
    high = low + 99
    url="https://hirise-pds.lpl.arizona.edu/download/PDS/EXTRAS/RDR/ESP/ORB_{low:06d}_{high:06d}/{filename}/{filename}_RGB.NOMAP.JP2".format(low=low,high=high,filename=filename)
    print(url)

    # Write to a temporary name and rename only after a complete download, so
    # an interrupted transfer never leaves a truncated file under `savename`.
    partial_name = savename + ".part"
    try:
        # stream=True avoids holding the whole image in memory;
        # timeout is (connect, read) in seconds.
        with requests.get(url, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()
            with open(partial_name, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
        os.replace(partial_name, savename)
    finally:
        # Only still present when the download failed part-way through.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    
def load_file(filename):
    """Open the JP2 image for ``filename``, downloading it first when missing.

    Returns a ``glymur.Jp2k`` object for ``<data_location>/<filename>_RGB.NOMAP.JP2``.
    """
    jp2_path = data_location + "/{filename}_RGB.NOMAP.JP2".format(filename=filename)
    if os.path.exists(jp2_path):
        return glymur.Jp2k(jp2_path)
    # Not on disk yet: fetch it, then open the file that was just written.
    download_file(filename)
    return glymur.Jp2k(jp2_path)


# Tile size in pixels: get_image() cuts a window nx wide (x axis) and ny high
# (y axis) centred on a tile's x_hirise / y_hirise coordinates.
nx,ny = 840, 648

def cv2_imshow(a, **kwargs):
    """Show an OpenCV-ordered image with matplotlib and return the ``imshow`` result.

    Values are clipped to 0..255 and cast to uint8. Three-dimensional arrays
    have their channels reordered from BGR(A) to RGB(A); two-dimensional
    arrays are shown unchanged. Extra keyword arguments go to ``plt.imshow``.
    """
    frame = a.clip(0, 255).astype('uint8')
    if frame.ndim == 3:
        # Four channels means an alpha plane is present.
        has_alpha = frame.shape[2] == 4
        conversion = cv2.COLOR_BGRA2RGBA if has_alpha else cv2.COLOR_BGR2RGB
        frame = cv2.cvtColor(frame, conversion)
    return plt.imshow(frame, **kwargs)

def get_image(tiles_coord, jp, irow):
    """Cut one tile out of ``jp`` and rescale it to 8 bits.

    Parameters
    ----------
    tiles_coord : pandas.DataFrame
        Must provide ``x_hirise`` and ``y_hirise`` columns (tile centre).
    jp : array-like
        Image supporting ``jp[y_slice, x_slice]`` indexing (e.g. ``glymur.Jp2k``).
    irow : int
        Positional row (``iloc``) of the tile in ``tiles_coord``.

    Returns
    -------
    numpy.ndarray of dtype uint8, scaled so the tile's maximum maps to 255.
    """
    row = tiles_coord.iloc[irow]
    # Window of nx by ny pixels centred on the tile coordinates.
    # NOTE(review): a centre closer than half a tile to the image origin gives
    # a negative slice start, which wraps around — confirm this cannot occur.
    sx = slice(int(row.x_hirise-nx//2),int(row.x_hirise+nx//2))
    sy = slice(int(row.y_hirise-ny//2),int(row.y_hirise+ny//2))
    im16 = np.copy(jp[sy,sx])
    peak = np.amax(im16)
    if peak == 0:
        # All-black tile: avoid the 0/0 division below.
        return np.zeros(im16.shape, dtype='uint8')
    ratio = peak / 256
    # The brightest pixel scales to exactly 256, which wrapped to 0 when cast
    # to uint8; clip so it saturates at 255 instead.
    img8 = np.clip(im16 / ratio, 0, 255).astype('uint8')
    return img8

In [None]:
# Draw every classified item on its tile image: one figure (and one PDF) per tile.
# The filter that restricted this to particular agreement cases is currently
# disabled, so all of the selected rows are plotted.
max_iters  = min(len(predicted_fans['img']), 200)
# max_iters = len(predicted_fans['img'])
results = pd.DataFrame({'tile_id': predicted_fans.tile_id, 'item_loc': predicted_fans.item_loc,
                        'actual': predicted_fans['label'],
                        'predictions': predictions}, index = predicted_fans['img'].index)

# Sorting by tile keeps all rows of one tile adjacent, which the loop relies on.
results = results.sort_values(by=['tile_id'])
results = results.tail(max_iters)

fontsize = 12
h_offset = 0.32
# Legend handles are identical for every figure, so build them once.
red_patch = mpatches.Patch(color='darkred', label='Ground truth')
blue_patch = mpatches.Patch(color='blue', label='Prediction')

fig, ax, tile_id = None, None, None

for index, result in tqdm(results.iterrows(), total=results.shape[0]):
#     if result.actual == "fan" or result.predictions == "fan":
#     if result.predictions == "notfan" or result.actual == "fan":
#         continue

    if fig is None or result.tile_id != tile_id:
        # A new tile starts: write out the finished figure, then load and show
        # the new tile image once (previously it was re-read for every box).
        if fig is not None:
            fig.savefig(f"{tile_id}.pdf")
        tile_id = result.tile_id
        row = tiles_coord[tiles_coord.tile_id == tile_id].iloc[0]
        irow = row.name
        jp = load_file(row.obsid)
        img = get_image(tiles_coord, jp, irow)
        fig, ax = plt.subplots(figsize=(12,12))
        ax.imshow(img)
        ax.set_title(f'{tile_id}')
        ax.legend(handles=[red_patch, blue_patch], fancybox=True, framealpha=0.5, shadow=True, borderpad=1)

    # item_loc is used as (anchor point, width, height) of the bounding box.
    rect = Rectangle(result.item_loc[0], width = result.item_loc[1] ,
                     height = result.item_loc[2],
                     fill=False, edgecolor='blue', linewidth=1.5)
    ax.add_patch(rect)

    # Prediction text is offset past the anchor + height edge of the box.
    ax.text(result.item_loc[0][0],
            result.item_loc[0][1] + result.item_loc[2] + 8 + h_offset * fontsize, f"{result.predictions}",
            fontsize= fontsize, weight='bold', color = "blue")

    # Ground-truth text is offset the other way from the anchor edge.
    ax.text(result.item_loc[0][0], result.item_loc[0][1] - h_offset * fontsize, f"{result.actual}",
            fontsize= fontsize, weight='bold', color = "darkred")

# Save the last tile's figure (nothing to save when `results` is empty).
if fig is not None:
    fig.savefig(f"{tile_id}.pdf")
plt.show()
results

In [None]:
# Export the classified items to JSON without the image arrays.
# NOTE: this cell mutates predicted_fans in place and is not re-runnable:
# after one run the 'label' and 'img' columns are gone, so a second run
# fails on the first line.
lab = predicted_fans['label']
predicted_fans.drop(labels=['label'], axis=1,inplace = True)
del predicted_fans['img']
# Re-insert the labels at column position 2 under the name 'ground_truth'.
predicted_fans.insert(2, 'ground_truth', lab)
t1 = time.time()
# Written relative to the notebook's current working directory.
predicted_fans.to_json("processed_classify_false_negative.json")
t2 = time.time()
print(f"Write Time: {t2-t1}")
predicted_fans