# Handling Blobs
Sample code to query for, and optionally delete, blobs. Also shows how to load QR code images as blobs.

In [1]:
import os
import glob
import json
import io
import sys
import datetime
import pandas
import lxml.etree as ET
import importlib
import hashlib
from cdcs import CDCS

In [2]:
import pyxb
pyxb.RequireValidWhenGenerating(False);

In [3]:
from myconfig import *

In [4]:
# Use USER / PASS if they are already defined (e.g. by the star-import above);
# otherwise prompt for them interactively.
import getpass

try:
    USER
except NameError:
    # only an undefined name should trigger the prompt
    USER = input('username: ')
try:
    PASS
except NameError:
    PASS = getpass.getpass('enter password ')
# (username, password) tuple handed to the auth= argument of the calls below
AUTH = (USER, PASS)

In [5]:
# Put the ./py directory (relative to the current working directory) at the
# front of the import path so the two local modules below can be imported.
pyUTILS_path=f"{os.getcwd()}/py"
sys.path.insert(0, pyUTILS_path)
import cdcs_utils
import amdoc

In [6]:
importlib.reload(amdoc);

In [7]:
importlib.reload(cdcs_utils)
from cdcs_utils import *

In [8]:
# Create the AMBench2022 client used by every cell below.
# TEMPLATE and AMBENCH_URL are presumably provided by myconfig - TODO confirm.
ambench2022=AMBench2022(TEMPLATE,AMBENCH_URL,auth=AUTH)
# schema validator obtained from the client; kept in a module-level name
VALIDATOR = ambench2022.create_schema_validator(); 

# find existing blobs

In [12]:
# For every AMBlob record: download the file behind its handle and print the
# handle together with the MD5 of the downloaded bytes.
blobs=ambench2022.query_all_amblobs()
for t in blobs.itertuples():
    # parse the record's XML so the handle can be read as an attribute
    blob=amdoc.CreateFromDocument(t.xml_content)
    # verify=False: TLS certificate verification is switched off for this request
    r=requests.get(blob.handle,auth=AUTH,verify=False)
    checksum=hashlib.md5(r.content).hexdigest()
    print(blob.handle,checksum)
    

https://test-ambench2022.nist.gov/rest/blob/download/61953462ef2ff2d7523d693b/ 337971b4f1dc1fb600c7fe03fb67f936
https://test-ambench2022.nist.gov/rest/blob/download/61953463eabc81389ad8aae8/ 59060b86d1d162a0650051c7fbda022e
https://test-ambench2022.nist.gov/rest/blob/download/6195346408ff6bab6ab2097c/ 2e0434883853fa27a5ebcdcfddae5bb7
https://test-ambench2022.nist.gov/rest/blob/download/61953465b39255f726bacd89/ ea5bc65435387a3ec8d82c3950eb979d
https://test-ambench2022.nist.gov/rest/blob/download/61953465ef2ff2d7523d6943/ c8c8dd330dc3e93bdff04810bdcf814a
https://test-ambench2022.nist.gov/rest/blob/download/619534660ae39d09d47244c3/ 18b883adfbeb1e57747c8fc0559f6ae9
https://test-ambench2022.nist.gov/rest/blob/download/61953467ee1985f7165bb237/ 0269a323dfc4fe7f260b60287ea488d8
https://test-ambench2022.nist.gov/rest/blob/download/619534680ae39d09d47244cb/ 5692952162eaa6cb2d74cfebea4c88b7
https://test-ambench2022.nist.gov/rest/blob/download/61953469ef2ff2d7523d694e/ 79fab789d960ef03da6b84739

In [15]:
# Consistency check of the AMBlob records: the checksum stored in each record
# is compared with the checksum of the bytes downloaded from its handle.
amblobs=ambench2022.query_all_amblobs()
ch={}  # NOTE(review): not used anywhere in this cell
for t in amblobs.itertuples():
    # first <AMBlob> element of the record, then its handle/format/checksum text
    el=xpath(t.xml_content,'//AMBlob')[0]
    handle=el.xpath("handle/text()")[0]
    format1=el.xpath("format/text()")[0]
    checksum1=el.xpath("checksum/text()")[0]
    r=requests.get(handle,auth=AUTH,verify=False)
    # values recomputed from the downloaded content
    checksum2=checksum4bytes(r.content)
    format2=imageFormatForBytes(r.content)
    # only the checksums are compared; both formats are printed for information
    if checksum1 != checksum2:
        print("problem with",handle,format1,format2)


In [16]:
spec=ambench2022.query_docs_by_type('AMSpecimen')

In [19]:
# For every AMSpecimen document, download each processing illustration and
# report those whose stored checksum differs from the MD5 of the download.
handles={}  # NOTE(review): not used anywhere in this cell
for _,row in spec.iterrows():
    s=amdoc.CreateFromDocument(row['xml_content'])
    # specimens without processing steps are skipped
    if s.AMSpecimen.processingSteps is not None:
        for ps in s.AMSpecimen.processingSteps.ProcessingStep:
            if ps.processingIllustration is not None:
                for i in ps.processingIllustration:
                    r=requests.get(i.handle,auth=AUTH,verify=False)
                    checksum=hashlib.md5(r.content).hexdigest()
                    if i.checksum != checksum:
                        print("PROBLEM:",i.handle,i.checksum,checksum)


## add AMBlob records for existing blobs

In [None]:
# For every blob: download it, open it as an image, and upload an AMBlob
# record holding the blob's handle, image format and checksum.
blobs=ambench2022.get_blobs()
for blob in blobs.itertuples():
    amblob=amdoc.AMBlob()
    amblob.handle=blob.handle
    r=requests.get(blob.handle,auth=AUTH,verify=False)
    try:
        image = Image.open(io.BytesIO(r.content))
        frmt=image.format
        # NOTE(review): the checksum is taken from the PIL image via checksum4image,
        # whereas other cells in this notebook hash the raw download
        # (hashlib.md5(r.content)) - confirm the two agree for these files
        checksum=checksum4image(image)
        amblob.checksum=checksum
        amblob.format=frmt
        xml_content=amblob.toxml("utf-8")#.decode('utf-8')
        # the record title is the checksum; r is rebound to the upload result here
        r = ambench2022.upload_data(xml_content=xml_content,title=amblob.checksum)
        print("UPLOADED",blob.handle,frmt,checksum)
    except Exception as e:
        # any failure for this blob is printed and the loop moves on to the next one
        print(blob.handle,e)
         

In [None]:
amblobs=ambench2022.query_all_amblobs()
amblobs

In [None]:
# Print and parse the XML of the first AMBlob record only (note the break).
for t in amblobs.itertuples():
    print(t.xml_content)
    # NOTE(review): bytes(...) without an encoding raises TypeError when given a str;
    # this only works if xml_content is already bytes-like - TODO confirm
    root=ET.fromstring(bytes(t.xml_content))
    break

## retrieve images from excel sheet

In [None]:
import string
import itertools
def read_excel(EXCEL_FILE, sheet_name):
    '''
    read one sheet of an excel workbook into a pandas DataFrame

    EXCEL_FILE: workbook location, passed straight to pandas.ExcelFile
    sheet_name: name of the sheet to parse
    return the parsed sheet with all rows that are entirely empty removed
    '''
    # the context manager closes the workbook handle as soon as parsing is done
    with pandas.ExcelFile(EXCEL_FILE) as xl:
        return xl.parse(sheet_name).dropna(how='all')

def checksum4image(image):
    '''
    checksum of a PIL image: the image is serialised with pil2bytes and the
    resulting bytes are passed to checksum4bytes
    '''
    return checksum4bytes(pil2bytes(image))

def pil2bytes(image):
    '''
    serialise a PIL image to bytes, encoded in the image's own format
    '''
    with io.BytesIO() as stream:
        image.save(stream, format=image.format)
        # read the buffer before the with-block closes it
        return stream.getvalue()

def retrieveAndLoadProcessingStepImages(ambench2022,excel,image_column,sheetname='Specimens'):
    '''
    retrieve the images embedded in one column of an excel sheet and pair each
    of them with the matching blob that is already registered in the database

    ambench2022:  client object; only its query_amblob_refs() method is used here
    excel:        location of the excel workbook
    image_column: header (as read by pandas) of the column holding the images
    sheetname:    name of the sheet to scan
    return dict mapping excel cell name (column letters + row number) to an
           amdoc.AMBlobReference carrying the image checksum and the blob handle
    raise KeyError when image_column is not one of the sheet's first 702 headers
    '''
    pandas_sheet=read_excel(excel,sheetname)
    pxl_doc = openpyxl.load_workbook(excel)
    # open the sheet that was asked for rather than relying on a global SHEET
    pyxl_sheet = pxl_doc[sheetname]
    image_loader = SheetImageLoader(pyxl_sheet)

    ####################################################################################
    # openpyxl addresses cells by column letter, pandas by header name.
    # build the letters A..Z followed by AA..ZZ and map the pandas headers onto them by position
    alphabet = list(string.ascii_uppercase)
    excel_header = alphabet + [''.join(pair) for pair in itertools.product(alphabet, repeat=2)]
    colsmap = dict(zip(pandas_sheet.columns, excel_header))
    ####################################################################################

    header=colsmap[image_column]
    # blobs already known to the database, keyed by checksum with the handle as value
    all_images=ambench2022.query_amblob_refs()
    blobRefs={}
    for i in pandas_sheet.index:
        # cell possibly containing an image for the current row;
        # +2 because pandas index 0 is excel row 2 (row 1 holds the headers)
        cell=f'{header}{i+2}'
        try:
            image = image_loader.get(cell)
        except Exception as e:
            # the loader reports an empty cell through its message; anything else is a real error
            if str(e).endswith("doesn't contain an image"):
                continue
            raise
        checksum=checksum4image(image)

        if checksum not in all_images:
            # NOTE: uploading from here is disabled, so there is no handle to reference yet.
            # report the cell and leave it out of the result.
            print("MUST LOAD",cell,checksum)
            continue

        blobRef=amdoc.AMBlobReference()
        blobRef.checksum=checksum
        blobRef.handle=all_images[checksum]
        blobRefs[cell]=blobRef
    return blobRefs


In [None]:
from myconfig import *
EXCEL_FILE=SAMPLES_EXCEL_FILE

In [None]:
SHEET='Specimens'
sheet=read_excel(EXCEL_FILE,SHEET)
pxl_doc = openpyxl.load_workbook(EXCEL_FILE)
_sheet = pxl_doc[SHEET]

In [None]:
# look for images embedded in the 'Processing_diagrams_and_photos' column of the 'Specimens' sheet
# the result is a dict keyed by excel cell name whose values are AMBlobReference objects (checksum + handle)
images = retrieveAndLoadProcessingStepImages(ambench2022,EXCEL_FILE,'Processing_diagrams_and_photos','Specimens')

In [None]:
# all_images: presumably checksum -> handle for every AMBlob reference - TODO confirm
all_images=ambench2022.query_amblob_refs()
# inverted copy (value -> key) so lookups can also start from the other side
inv_map = {v: k for k, v in all_images.items()}

In [None]:
# For every blob print: handle, MD5 of the downloaded bytes, whether that MD5
# is a key of all_images, and whether the handle is a key of inv_map.
blobs=ambench2022.get_blobs()
for blob in blobs.itertuples():
    r=requests.get(blob.handle,auth=AUTH,verify=False)
    checksum=hashlib.md5(r.content).hexdigest()
    print(blob.handle,checksum,checksum in all_images,blob.handle in inv_map)

In [None]:
s='https://test-ambench2022.nist.gov/rest/blob/download/619534690ae39d09d47244ce/'
r = requests.get(s, stream=True,verify=False,auth=AUTH).raw

In [None]:
image = np.asarray(bytearray(r.read()), dtype="uint8")
print(checksum4bytes(image),len(image))

In [None]:
with open("a.jpeg","wb") as f:
    f.write(image)

In [None]:
a=Image.open("a.jpeg")
a.save("b.jpeg")

In [None]:
# im=Image.open(io.BytesIO(image))
# NOTE(review): with the line above commented out, `im` is not assigned in this cell;
# it has to be bound by another cell already, otherwise the print raises NameError
print(checksum4image(a),im.format)

In [None]:
imgByteArr = io.BytesIO()
im.save(imgByteArr, format=im.format)
imgByteArr = imgByteArr.getvalue()
print(checksum4bytes(imgByteArr),len(imgByteArr))

In [None]:
im=Image.open(io.BytesIO(imgByteArr))
print(checksum4image(im))
imgByteArr = io.BytesIO()
im.save(imgByteArr, format=im.format)
imgByteArr = imgByteArr.getvalue()
print(checksum4bytes(imgByteArr),len(imgByteArr))
print(r.content==imgByteArr)

In [None]:
%pip install opencv-python

In [None]:
import cv2
import numpy as np

In [None]:

resp = requests.get(s, stream=True,verify=False,auth=AUTH).raw
image = np.asarray(bytearray(resp.read()), dtype="uint8")
image = cv2.imdecode(image, cv2.IMREAD_COLOR)

In [None]:
checksum4bytes(io.BytesIO(r.content).getvalue())

In [None]:
# Print every position at which the downloaded bytes differ from the re-encoded image bytes.
# zip() stops at the shorter of the two, so a length difference no longer ends in an
# IndexError; it is reported once after the loop instead.
downloaded = r.content
for i, (left, right) in enumerate(zip(downloaded, imgByteArr)):
    if left != right:
        print(i, left, right)
if len(downloaded) != len(imgByteArr):
    print("length mismatch", len(downloaded), len(imgByteArr))

In [None]:
# Download every blob, open it as a PIL image and collect (handle, image) pairs in ims.
blobs=ambench2022.get_blobs()
ims=[]
for blob in blobs.itertuples():
    r=requests.get(blob.handle,auth=AUTH,verify=False)
    im=Image.open(io.BytesIO(r.content))
    ims.append((blob.handle,im))

In [None]:
specimens=ambench2022.docs_by_name_AMDOC('AMSpecimen')

In [None]:
# For every specimen, list each processing illustration with its stored checksum and
# whether that checksum equals checksum4bytes of the freshly downloaded content.
# NOTE(review): the loop variable s rebinds the module-level name s used for a URL in an earlier cell.
for n,s in specimens.items():
    name=s.AMSpecimen.name
    pss=s.AMSpecimen.processingSteps
    # specimens without processing steps are skipped
    if pss is not None:
        for ps in pss.ProcessingStep:
            for i in  ps.processingIllustration:
                r=requests.get(i.handle,auth=AUTH,verify=False)
                print(name,ps.id, i.handle, i.checksum,i.checksum==checksum4bytes(r.content))
                

# delete the (AM)Blobs

In [None]:
amblobs = ambench2022.query_all_amblobs()
amblobs

In [None]:
# DESTRUCTIVE: calls delete_record for every row of amblobs.
# Each row (the whole named tuple) is printed first and then passed to delete_record.
for row in amblobs.itertuples():
    print(row)
    ambench2022.delete_record(row)

In [None]:
blobs=ambench2022.get_blobs()
blobs

In [None]:
# DESTRUCTIVE: calls delete_blob for every row of blobs, printing each id first.
for blob in blobs.itertuples():
    print(blob.id)
    ambench2022.delete_blob(id=blob.id)
   

In [None]:
ambench2022.query_all_amblobs()

In [None]:
# DESTRUCTIVE: calls delete_amblob_and_blob with the handle of every row of blobs.
# NOTE(review): blobs is whatever an earlier cell last assigned; re-query first if it may be stale
for t in blobs.itertuples():
    ambench2022.delete_amblob_and_blob(handle=t.handle)


# load all QR images as AMBlobs

In [None]:
def checksum4bytes(b):
    '''
    hexadecimal MD5 digest of a bytes-like object
    '''
    digest = hashlib.md5()
    digest.update(b)
    return digest.hexdigest()

def checksum4image(image):
    '''
    checksum of a PIL image, computed over the bytes produced by pil2bytes
    '''
    return checksum4bytes(pil2bytes(image))

def pil2bytes(image):
    '''
    encode a PIL image in its own format and return the encoded bytes
    '''
    encoded = io.BytesIO()
    image.save(encoded, format=image.format)
    return encoded.getvalue()


In [None]:
# find QR images
blobs_to_be_loaded=glob.glob('/home/idies/workspace/AMBench/DATA/CDCS/test-ambench2022/AMDocs/records/*/*.QR.png')
blobs_to_be_loaded[:3]

In [None]:
# For every QR file print length and checksum of three byte representations:
#   1. the file content as stored on disk
#   2. the image re-encoded through pil2bytes
#   3. the output of the image's tobytes()
for f in blobs_to_be_loaded:
    with open(f,'rb') as bf:
        byte=bf.read()
        print(" FILE",f,len(byte),checksum4bytes(byte))
    im=Image.open(f)
    byte=pil2bytes(im)
    print("IMAGE",f,len(byte),checksum4bytes(byte))
    byte=im.tobytes()
    print("IMAGE",f,len(byte),checksum4bytes(byte))


In [None]:
handle0=ambench2022.upload_blob(filename=blobs_to_be_loaded[0])
ambench2022.get_blob(id=handle0.split("/")[-2])

In [None]:
# Download the blob behind handle0 and open it as an image.
response = requests.get(handle0,auth=AUTH,verify=False)
img = Image.open(io.BytesIO(response.content))
# report on the image opened just above (img1 is only created by a later cell)
print('filename=',img.filename,'\nformat=',img.format,"\nheaders=",response.headers)
img

In [None]:
with open(blobs_to_be_loaded[1],"rb") as blob:
    blobbytes=blob.read()
handle1=ambench2022.upload_blob(filename=blobs_to_be_loaded[1],blobbytes=blobbytes)

In [None]:
ambench2022.get_blob(id=handle1.split("/")[-2])

In [None]:
response = requests.get(handle1,auth=AUTH,verify=False)
img1 = Image.open(io.BytesIO(response.content))
img1

In [None]:
response.headers

In [None]:
# Upload from an open binary file object, under a filename that differs from the one on disk.
# The with-block closes the file once the upload call has returned.
with open(blobs_to_be_loaded[2], 'rb') as blobbytes:
    handle2=ambench2022.upload_blob(filename="SomeQRImage2.png",blobbytes=blobbytes)

In [None]:
ambench2022.get_blob(id=handle2.split("/")[-2])

In [None]:
response = requests.get(handle2,auth=AUTH,verify=False)
img2 = Image.open(io.BytesIO(response.content))
img2

In [None]:
# Remove the three test uploads again.
for h in [handle0,handle1,handle2]:
    # the id is the second-to-last '/'-separated piece of the handle (handles end with '/')
    _id=h.split("/")[-2]
    ambench2022.delete_blob(id=_id)

In [None]:
def read_excel(EXCEL_FILE):
    '''
    read every sheet of an excel workbook

    EXCEL_FILE: workbook location, passed straight to pandas.ExcelFile
    return dict mapping each sheet name to its DataFrame, with all rows that
           are entirely empty removed

    NOTE: this one-argument version replaces any read_excel(EXCEL_FILE, sheet_name)
    defined earlier in the notebook.
    '''
    # the context manager closes the workbook handle once all sheets are parsed
    with pandas.ExcelFile(EXCEL_FILE) as xl:
        return {name: xl.parse(name).dropna(how='all') for name in xl.sheet_names}

def pil2bytes(pilimage):
    '''
    return the PIL image encoded as JPEG bytes
    '''
    with io.BytesIO() as stream:
        pilimage.save(stream, format='JPEG')
        # read the buffer before the with-block closes it
        return stream.getvalue()

In [None]:
pxl_doc = openpyxl.load_workbook(SAMPLES_EXCEL_FILE)
_sheet = pxl_doc['Specimens']
image_loader = SheetImageLoader(_sheet)

In [None]:
cell='S19'
image = image_loader.get(cell)
print("filename=",image.filename,"\nformat=",image.format)
image