In [1]:
import sys, os, json, importlib, zipfile, shutil, datetime
import rasterio, geohash
import geopandas as gpd
import pandas as pd
import numpy as np
import shapely.wkt

from shapely.geometry import shape, Polygon

#Get reference to imagery object
sys.path.append('../src')

from ImageryObjects import imageryExtents

In [2]:
# Load the Admin0 boundary polygons; this GeoDataFrame is handed to every imagery
# object created below and is also used for the ISO3 lookup by intersection.
globalBoundariesFile = r"R:\GLOBAL\ADMIN\Official Bank Borders\Polygons\Admin0\Admin0_Polys.shp"
globalBoundaries = gpd.read_file(globalBoundariesFile)
# Re-project to WGS84. The {'init': 'epsg:4326'} dictionary form is deprecated
# (it triggers a pyproj FutureWarning); the authority string is the supported spelling.
globalBoundaries = globalBoundaries.to_crs("EPSG:4326")

  return _prepare_from_string(" ".join(pjargs))


# Extracting Imagery Metadata
This script is designed to extract metadata from our imagery repositories. How the information is to be processed is yet to be determined, but we need to extract the following metadata:

### Metadata extracted from imagery
1. Title
2. Country ISO3
3. Storage location
4. Size zipped
5. Resolution
6. Number of bands

### Metadata extracted from deliverable
1. Vendor
2. Sensor
3. Date of Capture

### Manually entered information
1. WB project number
2. Security classification

# Process and Folder structure
Robert Harrison manages the process for the Geocenter; Xiuzhu Yang does the development.

## Folder Definitions
<b>Ingest</b> - folder where GOST deposits JSON and zipfiles for upload to platform.  
<b>ImageryWithErrors</b> - where problematic images are moved from Ingest when errors are encountered.  
<b>Errorreport</b> - logging information, with daily logs of status. Daily logs only indicate that an error occurred; you need to go to the server to get the complete log.  


## Generate information to process

In [4]:
# The folder of imagery to process
sourceFolder = r"R:\IMAGERY"

# The shared-drive locations below should not be changed - if you do not have
# access to the I drive, contact Robert Mansour
outFolder = r"I:\ddhfiles\internal\imagerysource\Ingest"
processedFolder = r"I:\ddhfiles\internal\imagerysource\Processed"
processedErrorFolder = r"I:\ddhfiles\internal\imagerysource\ImageryWithErrors"
log_Folder = r"I:\ddhfiles\internal\imagerysource\GOST_Ingest_Log"

# Log files kept inside log_Folder
processed_file, bad_meta_file, error_file = (
    os.path.join(log_Folder, log_name)
    for log_name in ("already_processed.txt", "bad_meta_folders.txt", "error_folders.txt")
)

In [5]:
# Read the log of footprints that were already processed so they can be ignored.
# processed_files ends up holding the values of the 'Footprint_ID' column.
processed_log = os.path.join(log_Folder, "footgeo.csv")
in_log = pd.read_csv(processed_log)
processed_files = in_log.loc[:, 'Footprint_ID'].values

In [6]:
# Preview the first five footprint IDs read from the processed log
processed_files[:5]

array(['NGA_s617ksgb4b9b_4_0.5_20201111',
       'IDN_qxg0gwyh9vvg_4_0.34_20180618',
       'NGA_s6173jpbzdt8_4_0.3_20201029',
       'NGA_s617dbxuwg3n_4_0.5_20201016',
       'GMB_edkh5vr8ydgr_1_0.5_20200209'], dtype=object)

In [7]:
# Generating lists of zipFiles and tif files
#   zipFiles   - full path of every .zip / .rar file found under sourceFolder
#   imgFolders - every folder containing at least one .tif / .TIF / .JP2 file,
#                in discovery order, without duplicates
zipFiles = []
imgFolders = []

# A folder is skipped when its path contains ANY of these markers.
excludedMarkers = ['spfeas', 'MappyFeatures', 'Spatial_features', 'LandScan_2012']
seenFolders = set()  # O(1) duplicate check; imgFolders keeps the order

for root, dirs, files in os.walk(sourceFolder):
    # The exclusion depends only on the folder path, so decide once per folder and
    # check every marker BEFORE the folder can be added (previously the append ran
    # inside the marker loop, so only the first marker was ever honoured).
    skipFolder = any(marker in root for marker in excludedMarkers)
    for f in files:
        if f.endswith((".zip", ".rar")):
            zipFiles.append(os.path.join(root, f))
        if f.endswith((".tif", ".TIF", ".JP2")) and not skipFolder and root not in seenFolders:
            seenFolders.add(root)
            imgFolders.append(root)

In [8]:
# Number of archives found, then number of imagery folders found (one per line)
print(len(zipFiles), len(imgFolders), sep="\n")

120
906


In [None]:
# Disabled debugging snippet kept as a bare string literal: nothing inside it is executed.
# It builds an imagery object for the first discovered folder and shows its jsonFile.
''' DEBUGGING 
importlib.reload(imageryExtents)
inFolder = imgFolders[0]
imgObj = imageryExtents.deliveredImageryFolder(inFolder, outFolder, globalBoundaries, "") 
imgObj.jsonFile
'''

In [None]:
# Processing new folder of imagery
importlib.reload(imageryExtents)

badData = []           # metadata tables that failed imgObj.valid_metadata()
errorData = []         # folders that raised an exception while being processed
newFolders = []        # folders not seen before (whether or not the metadata was valid)
processedFolders = []  # folders skipped because they were already handled

# processed_files holds bare footprint IDs (e.g. 'NGA_s617ksgb4b9b_4_0.5_20201111'),
# whereas imgObj.jsonFile is a full path ending in '.json'. Compare on the file name
# without directory or extension, otherwise the check can never match.
processedIds = set(processed_files)

for inFolder in imgFolders:
    try:
        imgObj = imageryExtents.deliveredImageryFolder(inFolder, outFolder, globalBoundaries, "")

        #Check if this imgObj has already been processed
        jsonName = os.path.basename(imgObj.jsonFile)
        footprintId = os.path.splitext(jsonName)[0]
        processedFile = os.path.join(processedFolder, jsonName)
        processedErrorFile = os.path.join(processedErrorFolder, jsonName)
        alreadyHandled = (
            os.path.exists(imgObj.jsonFile)          # waiting in the Ingest folder
            or os.path.exists(processedFile)         # present in the Processed folder
            or os.path.exists(processedErrorFile)    # present in the ImageryWithErrors folder
            or footprintId in processedIds           # listed in the footprint log
        )

        if not alreadyHandled:
            metaData = imgObj.getMetadata()
            print(imgObj.jsonFile)
            if imgObj.valid_metadata(metaData):
                thumbnail = imgObj.generateThumbnails()
                zipFile = imgObj.zipData()
                imgJSON = imgObj.createJSON(pNumber="NA", securityClassification="Official Use Only")
            else:
                badData.append(metaData)
            newFolders.append(inFolder)
        else:
            processedFolders.append(inFolder)
    except Exception as err:
        # Keep going on a per-folder failure, but report why it failed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop this long loop.
        errorData.append(inFolder)
        print(f"ERROR processing {inFolder}: {err!r}")

I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63gfv3xm1fj_8_2.0_20150414.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63uh1894sx9_8_2.0_20181221.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63u4dndhgh6_8_2.0_20190128.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63u5fqen3n2_1_0.5_20180408.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63u4bvfpvb1_8_2.0_20171128.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63u4bvfpvb1_1_0.5_20171128.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63uh1894sx9_1_0.5_20181221.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63gfv3xm1fj_1_0.5_20150414.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63u5fqen3n2_8_2.0_20180408.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w63u4dndhgh6_1_0.5_20190128.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w3cs7swtzn6u_1_0.5_20181203.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w3cs7tyex1b1_8_2.0_20170531.json
I:\ddhfiles\internal\imagerysource\Ingest\KHM_w3cs7t



I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcd59tt72nr_4_6.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcd59tt72nr_4_6.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svccn0xpsmy6_1_2.008_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svccjfxkdv8v_3_6.025_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcspk5wht7y_4_6.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcspme1b3h0_3_1.993_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svccgj068bsd_3_6.025_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svccgj068bsd_3_6.025_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdw9dt640t_4_6.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdqpzefk79_3_2.008_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcs5389teyy_4_6.335_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcs53916h0f_4_6.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdq1ptdk7s_4_0.5_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdq4jstkpp_4_6.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdq4thnyfj_1_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdq1ptdk7s_3_6.025_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svceh4n1ybb0_4_12.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svceh4n1ybb0_4_12.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcey4964he4_4_6.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcey4964he4_4_6.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcccm4d34pg_3_6.025_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svccck9r4rcq_1_5.98_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdpz7jsq15_3_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcdpz7415wp_1_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcc60vmpyuj_3_6.025_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcc62vb6tx6_1_6.025_YYYYMMDD.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\SYR_svc9tc8sb8xw_3_0.5_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svc9td0mwwuk_1_0.5_YYYYMMDD.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\SYR_svc9tcd78vfc_3_6.025_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcfg0ecg1fh_3_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcfffn2vrwp_1_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcfg60xgsfx_1_2.008_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcfepbk8y8h_3_6.025_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcfbtz1783n_3_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcfbtz0527q_1_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svccen278bhx_3_6.025_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcceq0wewxp_1_6.025_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcube0887db_3_5.999_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcub3msvgxz_3_5.98_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcf64t2pzuj_3_6.025_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcs53916h0f_4_6.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcest64nqbk_4_6.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcest64nqbk_4_6.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcsmsw4xbpq_4_6.0_YYYYMMDD.json




I:\ddhfiles\internal\imagerysource\Ingest\SYR_svcsmsw4xbpq_4_6.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\ZWE_ksy1nysmyft0_3_0.5_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwx649tnp8_1_0.5_20180804.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwjypbwh27_4_2.0_20181025.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwnk8zmq72_4_2.0_20160525.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwq9ku8uve_4_2.0_20121208.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwjypbwh27_1_0.5_20181025.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwnk8zmq72_1_0.5_20160525.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwx649tnp8_4_2.0_20180804.json
I:\ddhfiles\internal\imagerysource\Ingest\KAZ_txwwq9ku8uve_1_0.5_20121208.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5nk54z25jh1_4_0.5_20171224.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5j8jsbe3nd4_4_0.5_20180512.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\NER_s42cmh0dbyfz_4_0.5_20160408.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5ng0fjw3y1d_4_0.5_20191226.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43ju9sgk1cz_4_0.5_20200817.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43j7mqp9yzy_4_0.5_20200817.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s78rzce9xxw1_4_0.5_PHR.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43jd50kskm8_4_0.5_20200817.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5ng3477ehr8_4_0.5_20190505.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43hhq6qgpgj_4_0.5_20140504.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43j7mw11px8_4_0.5_20170421.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5m3bxqp8pr7_4_0.5_20160129.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5nexj0fcmme_4_0.5_20161122.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5ng347mmn7u_4_0.5_20151228.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43hhq6qgpg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\NER_s42uhs2dfv0c_4_0.5_20140504.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5j9kz4es0hw_4_0.5_20160102.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5m3bxqp8pr7_4_0.5_20180228.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5y4xn4376ug_4_0.5_20111022.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\NER_s42cmh06gb48_4_0.5_20120407.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s48b0ngn9c1y_4_0.5_20190211.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\NER_s42cmh06gb48_4_0.5_20190523.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s4885pfgy2hv_4_0.5_20121225.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5y7y1169v6h_4_0.5_20190209.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\NER_s42uhs2dfv0c_4_0.5_20190523.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5nk54z0w9tv_4_0.5_20190929.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5y4xn4376ug_4_0.5_20151230.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s4885pfgy2hv_4_0.5_20190126.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5y7y1169v6h_4_0.5_20160428.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43892mmhtxs_4_0.5_20171103.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5y7y1169v6h_4_0.5_20130321.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\NER_s42uhs2dfv0c_4_0.5_20160429.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43892mmhtxs_4_0.5_20141223.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43892mm4f8h_4_0.5_20181002.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5y4xn4376ug_4_0.5_20181220.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43j7mw11pqg_4_0.6_20111125.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5nexj0g9837_4_0.5_20191207.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43ju9fcztg4_4_0.5_20170421.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5j9kz4es0hw_4_0.5_20171223.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s48b15xn495d_4_0.5_20160413.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s42xppn1t05t_4_0.5_20160329.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s43ju9fcztg4_4_0.5_20111125.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5j9kz4s4h76_4_0.5_20191216.json
I:\ddhfiles\internal\imagerysource\Ingest\NER_s5ng0f

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\COD_kr4yfwgyh663_4_0.5_20140403.json
I:\ddhfiles\internal\imagerysource\Ingest\COD_kr4yvnnxt86m_4_0.5_20140403.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


I:\ddhfiles\internal\imagerysource\Ingest\COD_kr4yc9jm1r8j_4_0.5_20150801.json
I:\ddhfiles\internal\imagerysource\Ingest\COD_kr5nbpypkj1j_4_0.5_20140403.json
I:\ddhfiles\internal\imagerysource\Ingest\COD_kx6rjxs5rngt_4_0.3_20150117.json
I:\ddhfiles\internal\imagerysource\Ingest\ARG_69ydd4c9vb8f_4_0.5_20200929.json
I:\ddhfiles\internal\imagerysource\Ingest\ARG_69y6h8kpurwc_4_0.5_20200921.json
I:\ddhfiles\internal\imagerysource\Ingest\ARG_69yd1wrm7fm0_4_0.5_20200929.json
I:\ddhfiles\internal\imagerysource\Ingest\ARG_69y7fu77csgn_4_0.5_20201012.json
I:\ddhfiles\internal\imagerysource\Ingest\IDN_qxv4zhpwjkv4_8_3.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\IDN_qxv703027r31_8_3.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\IDN_qxv70kyu5pxt_8_3.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\IDN_qxvnyeznsdd5_8_3.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\IDN_qxv722w411j5_8_3.0_YYYYMMDD.json
I:\ddhfiles\internal\imagerysource\Ingest\IDN_qxv63k

In [None]:
# Sizes of: all imagery folders, bad-metadata tables, errored folders,
# newly seen folders and already-processed folders (one count per line)
for tally in (imgFolders, badData, errorData, newFolders, processedFolders):
    print(len(tally))

# Log processed data

In [None]:
# Copy each existing log to a sibling file stamped with today's date (YYYYMMDD)
backupSuffix = "_%s.txt" % datetime.date.today().strftime("%Y%m%d")
for logPath in (processed_file, bad_meta_file, error_file):
    if not os.path.exists(logPath):
        continue
    shutil.copy(logPath, logPath.replace(".txt", backupSuffix))

In [None]:
# Overwrite the processed and error logs, one folder per line
for logPath, loggedFolders in ((processed_file, processedFolders), (error_file, errorData)):
    with open(logPath, 'w') as out_file:
        out_file.writelines(f"{folder}\n" for folder in loggedFolders)

In [None]:
# Combine every metadata table that failed validation into a single table and
# write it to bad_meta_file.
# pd.concat replaces the old DataFrame.append loop: append was removed in pandas 2.0,
# and the bare `except` around it silently replaced the accumulated table with the
# current one whenever append failed.
if badData:
    badFinal = pd.concat(badData)
    badFinal.to_csv(bad_meta_file)
else:
    # Nothing failed validation; previously this case raised a NameError on badFinal.
    badFinal = pd.DataFrame()
    print("No bad metadata to write")

In [None]:
# The bad data needs to be manually corrected - once it is, it can be read back in and re-written.
# The corrected table is read from the bad-metadata path with '.txt' swapped for '.csv'.
fixed_bad_data = pd.read_csv(bad_meta_file.replace(".txt", ".csv"), index_col=0)
# Folder of each image = directory part of its 'file' path
fixed_bad_data['folder'] = fixed_bad_data['file'].apply(os.path.dirname)
# The CSV stores geometries as WKT text; parse them back into shapely objects
geoms = fixed_bad_data['geometry'].apply(shapely.wkt.loads)
# "EPSG:4326" replaces the deprecated {'init': 'epsg:4326'} dictionary form
fixed_bad_data = gpd.GeoDataFrame(fixed_bad_data, geometry=geoms, crs="EPSG:4326")

def roundit(x):
    """Round ``x`` to two decimals when its string form is longer than four characters.

    Values whose ``str()`` is four characters or shorter (e.g. ``0.5``, ``12.0``)
    are returned unchanged.
    """
    needs_rounding = len(str(x)) > 4
    return round(x, 2) if needs_rounding else x
# Normalise the resolution column with roundit, then list the distinct values
fixed_bad_data['Res'] = fixed_bad_data['Res'].apply(roundit)
pd.unique(fixed_bad_data['Res'])

In [None]:
importlib.reload(imageryExtents)

fixed = []      # folders whose corrected metadata now passes validation
still_bad = []  # metadata tables that still fail validation
fubar = []      # folders whose JSON already exists in the Ingest or Processed folder

for folder in fixed_bad_data['folder'].unique():
    metadata = fixed_bad_data.loc[fixed_bad_data['folder'] == folder]
    imgObj = imageryExtents.deliveredImageryFolder(folder, outFolder, globalBoundaries, "", metadata)

    #Check if this imgObj has already been processed
    processedFile = os.path.join(processedFolder, os.path.basename(imgObj.jsonFile))
    if os.path.exists(processedFile) or os.path.exists(imgObj.jsonFile):
        fubar.append(folder)
    elif imgObj.valid_metadata(metadata):
        print(imgObj.jsonFile)
        # NOTE(review): output generation is disabled below, yet the folder is still
        # logged as processed - confirm this is intended before relying on the log.
        #thumbnail = imgObj.generateThumbnails()
        #imgJSON = imgObj.createJSON(pNumber="NA", securityClassification="Official Use Only")
        #zipFile = imgObj.zipData()
        # Only folders that validate count as fixed; previously folders that were
        # still bad were appended here too and ended up in the processed log.
        fixed.append(folder)
    else:
        still_bad.append(metadata)

# Write the correctly processed imagery files to the processed folders list
with open(processed_file, 'a') as out_file:
    out_file.writelines(f'{fixedFolder}\n' for fixedFolder in fixed)

# What to do with error data


In [None]:
from shapely.wkt import loads
import json

In [None]:
# For these broken SPOT files, run the following command in arcpy - for some reason
#  the actual .tif files are not spatially referenced.
# NOTE(review): arcpy is not imported anywhere in the visible notebook, so this cell
#  presumably has to be run where arcpy is already available - confirm.
# It opens the SPOT DIMAP .XML file as a raster and prints its extent polygon (WKT),
#  band count and mean cell height.
original_location = r'S:\COUNTRY\TJK\IMAGERY\3020003_HEIN_01800_071401_Tajik_Hazard_SO17014201-8-01_DS_SPOT6_201310200533280_FR1_FR1_SE1_SE1_E073N38_01952\PROD_SPOT6_001\VOL_SPOT6_001_A\IMG_SPOT6_MS_001_A\DIM_SPOT6_MS_201310200533280_SEN_2406078101.XML'
xx = arcpy.Raster(original_location)
print(xx.extent.polygon.WKT)
print(xx.bandCount)
print(xx.meanCellHeight)

In [None]:

# Manually assemble the footprint file name for the SPOT scene described above.
# Raw string: the plain literal contained invalid escape sequences (\d, \i, \I),
# which newer Python versions warn about.
zip_file_base = r"I:\ddhfiles\internal\imagerysource\Ingest"
bbox = "MULTIPOLYGON (((73.083069374010321 38.361483576899239, 73.529317605137848 38.361483576899239, 73.529317605137848 38.682390625431829, 73.083069374010321 38.682390625431829, 73.083069374010321 38.361483576899239)))"
# shapely.wkt is imported in the first cell, so this no longer relies on the
# separate `from shapely.wkt import loads` cell having been run.
bbox_shp = shapely.wkt.loads(bbox)
# ISO3 codes of every boundary polygon intersecting the box, joined with ';'
iso3 = ";".join(list(globalBoundaries[globalBoundaries.intersects(bbox_shp)]['ISO3']))
# Geohash of the box centroid (latitude first, then longitude)
g_hash = geohash.encode(bbox_shp.centroid.y, bbox_shp.centroid.x)
band_count = 4
resolution = 5
vendor = "SPOT"
date = "20131020"
filename = f"{iso3}_{g_hash}_{band_count}_{resolution}_{date}.json"
filename

In [None]:
# Metadata record for the manually described scene; every value except
# zipped_size is stored as text.
zipLocation = os.path.join(zip_file_base, filename.replace(".json", ".zip"))
vals = {
    "title": f"Satellite imagery for {iso3}",
    'iso3': str(iso3),
    'location': str(zipLocation),
    'zipped_size': 563444000,
    'resolution': str(resolution),
    'nBands': str(band_count),
    'vendor': str(vendor),
    'capture_date': str(date),
    'pNumber': 'NA',
    "securityClassification": "Official Use Only",
    "ImageExtent": str(bbox_shp),
    'originalLocation': original_location,
}

# Write the record next to where the zip file is expected
jsonPath = os.path.join(zip_file_base, filename)
with open(jsonPath, 'w') as j:
    json.dump(vals, j)

In [None]:
# Show the geohash computed from the bounding-box centroid
g_hash

# Debugging

In [None]:
# Folder used for the debugging cells below
in_folder = r"R:\IMAGERY\10 cities Africa\SPOT\KigaliRwanda\kigali_2011"

In [None]:
# Reload the module so edits to imageryExtents are picked up, then build the
# imagery object for the debugging folder and show its jsonFile attribute.
importlib.reload(imageryExtents)
imgObj = imageryExtents.deliveredImageryFolder(in_folder, outFolder, globalBoundaries, "") 
imgObj.jsonFile
        

In [None]:
# Union of all geometries in the imagery object's metadata
inputExtent = imgObj.allMetadata.unary_union
# For each geometry in imgObj.country: share of inputExtent's area that it covers
imgObj.country['overlap'] = imgObj.country['geometry'].apply(lambda x: x.intersection(inputExtent).area/inputExtent.area)
# The cell previously ended with a dangling `imgObj.` (a SyntaxError); display the
# table that was just updated instead.
imgObj.country

In [None]:
# Build an imagery object for every discovered folder whose path mentions SIIS_Imagery
for folder in (candidate for candidate in imgFolders if "SIIS_Imagery" in candidate):
    imgObj = imageryExtents.deliveredImageryFolder(folder, outFolder, globalBoundaries, "")
    print(folder)
    #print(f'{imgObj.jsonFile}: {os.path.exists(imgObj.jsonFile)}')

In [None]:
# Reload imageryExtents, build the imagery object for one SIIS folder and
# display the metadata returned by getMetadata().
importlib.reload(imageryExtents)
folder = r"R:\IMAGERY\SIIS_Imagery\K5_20200825170351_000010_38493_D_ES04_HH_GEC_B_L1C"
imgObj = imageryExtents.deliveredImageryFolder(folder, outFolder, globalBoundaries, "")
metaData = imgObj.getMetadata()
metaData

In [None]:
# Run the imagery object's valid_metadata check on the metadata fetched above
imgObj.valid_metadata(metaData)

In [None]:
# Display the jsonFile attribute of the current imagery object
imgObj.jsonFile

# Convert imagery to smaller file size

In [None]:
# The folder of imagery to process
sourceFolder = r"R:\IMAGERY\ZIMSTAT"

# Full path of every file under sourceFolder whose name ends in ".TIF" (case-sensitive)
all_tiffs = [
    os.path.join(root, f)
    for root, dirs, files in os.walk(sourceFolder)
    for f in files
    if f.endswith(".TIF")
]

all_tiffs

In [None]:
# Destination root for the compressed copies.
# Raw string: the plain literal contained invalid escape sequences (\I, \Z).
out_folder = r"R:\IMAGERY\ZIMSTAT_SMALL"
# exist_ok avoids the separate existence check
os.makedirs(out_folder, exist_ok=True)

In [None]:
# Work on the first discovered TIF
cur_tif = all_tiffs[0]

# Mirror the source path under out_folder and make sure its directory exists
new_tif = cur_tif.replace(sourceFolder, out_folder)
cur_base = os.path.dirname(new_tif)
os.makedirs(cur_base, exist_ok=True)

# Print the gdal_translate command to run. Both paths are quoted so the command
# still works when a path contains spaces.
print(f'gdal_translate -of GTiff -co "COMPRESS=LZW" -co "PREDICTOR=2" -co "TILED=YES" "{cur_tif}" "{new_tif}"')

In [None]:
# Size of the original TIF in MiB (1024 * 1024 bytes; the divisor was mistyped as 1028)
os.path.getsize(cur_tif) / 1024 / 1024

In [None]:
# Size of the converted TIF in MiB (1024 * 1024 bytes; the divisor was mistyped as 1028)
os.path.getsize(new_tif) / 1024 / 1024

In [None]:
# Open the original TIF; xx is left open because the following cell inspects it too.
xx = rasterio.open(cur_tif)
# rasterio dataset objects expose `dtypes` (plural); `xx.dtype` raises AttributeError.
xx.dtypes

In [None]:
# Display the dtypes attribute of the opened raster
# (presumably one entry per band - confirm against the rasterio docs)
xx.dtypes