In [1]:
import os
import argparse
import asyncio
import csv
from pathlib import Path, PurePath
from typing import List
from uuid import uuid4

import pandas as pd
from aiohttp import ClientSession


async def http_get(
    session: ClientSession,
    url: str,
    outfilename: str,
    sas: str,
    headers: dict = None,
    proxy: str = None,
    timeout: int = 10,
) -> dict:
    """Download ``url + sas`` and write the response body to ``outfilename``.

    Args:
        session: Open aiohttp session used to issue the GET request.
        url: Resource URL without the SAS query string.
        outfilename: Destination path; missing parent directories are created.
        sas: Query string appended verbatim to ``url`` for the request.
        headers: Optional request headers; ``None`` means no extra headers.
        proxy: Optional proxy URL forwarded to ``session.get``.
        timeout: Request timeout forwarded to ``session.get``.

    Returns:
        ``{"url": url, "filename": outfilename}`` on success. ``filename`` is
        ``None`` when the server answers with an error status or the body
        cannot be read or written.

    Raises:
        Any exception raised while opening the request itself (for example a
        connection failure) propagates to the caller, as before.
    """
    out = {"url": url, "filename": None}
    # Never share one dict object between calls through the default argument.
    request_headers = {} if headers is None else headers

    # The context manager releases the connection on every exit path.
    async with session.get(
        url=url + sas, headers=request_headers, proxy=proxy, timeout=timeout
    ) as response:
        try:
            # Do not store an HTTP error page (e.g. expired SAS) as the file.
            response.raise_for_status()
            content = await response.read()
            Path(outfilename).parent.mkdir(parents=True, exist_ok=True)
            with open(outfilename, "wb") as f:
                f.write(content)
            out["filename"] = outfilename
        except Exception:
            # Best effort: the failure is reported through "filename": None.
            # Cancellation and KeyboardInterrupt are deliberately not caught.
            pass

    return out


async def http_get_parallel(
    session: ClientSession,
    indf: pd.DataFrame,
    sas: str,
    headers: dict = None,
    proxy: str = None,
    timeout: int = 10,
) -> List[dict]:
    """Download every row of ``indf`` concurrently with ``http_get``.

    Args:
        session: Open aiohttp session shared by all requests.
        indf: Table with a ``url`` column (source) and an ``outfile`` column
            (destination path) for each download.
        sas: Query string appended to every URL.
        headers: Optional request headers; ``None`` means no extra headers.
        proxy: Optional proxy URL forwarded to each request.
        timeout: Per-request timeout forwarded to each request.

    Returns:
        One result dict per row, in row order (``asyncio.gather`` preserves
        the order of its arguments). An empty table yields an empty list.
    """
    # Never share one dict object between calls through the default argument.
    request_headers = {} if headers is None else headers

    # Only two columns are needed, so pair them directly instead of building
    # a Series per row with iterrows().
    downloads = [
        http_get(session, url, outfile, sas, request_headers, proxy, timeout)
        for url, outfile in zip(indf["url"], indf["outfile"])
    ]
    return await asyncio.gather(*downloads)


def getSASkey(csvFile: str) -> str:
    """Return the second field of the first row of the CSV file ``csvFile``.

    Only the first row is parsed; the rest of the file is left unread.
    """
    with open(csvFile, newline="") as handle:
        first_row = next(csv.reader(handle))
    return first_row[1]


In [2]:
# Category export to process. It is read twice below: once for the SAS token
# (first row, second column) and once as a table that skips that first row.
csvFile = "./data/categories_bd1f887b-4394-4f46-90c4-a599ed039746_637885127746682846.csv"
# Directory used when building the download targets and the reference CSV path.
outDir = "./data"
# Selects the UUID-based file naming branch when the output paths are built;
# False keeps the trailing path components of each URL instead.
isImagery = False


In [3]:

# The first CSV row carries the SAS token (second column); the table itself
# starts on the following row, hence skiprows=1.
SASkey = getSASkey(csvFile)
df = pd.read_csv(csvFile, skiprows=1)


In [4]:
# NOTE(review): displaying the token writes a credential into the saved
# notebook output; clear this cell's output before sharing the notebook.
SASkey

'?sv=2019-12-12&ss=b&srt=co&sp=r&se=2023-03-04T11:01:44Z&st=2021-02-03T03:01:44Z&spr=https&sig=xltwMFWKEl8h4c%2BX%2BtcoftoMQg4p67uefHopgWM1L30%3D'

In [5]:
# Preview the first rows of the category table loaded from the CSV.
df.head()

Unnamed: 0,category,lat,lng,val,url,isapproved
0,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False
1,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False
2,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False
3,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False
4,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False


In [12]:

def _outfile_for(url: str) -> PurePath:
    """Map one blob URL to the local path its download is written to."""
    if isImagery:
        # One path component: a random name that keeps the URL's extension.
        # It must be wrapped in a tuple; unpacking the bare string would turn
        # every character into its own directory level.
        tail = (str(uuid4()) + Path(url).suffix,)
    else:
        # Keep the last four components of the URL path.
        tail = Path(url).parts[-4:]
    # NOTE(review): "/root/code" is a hard-coded absolute root; confirm it
    # matches the environment this notebook runs in.
    return PurePath("/root/code", outDir, *tail)


df["outfile"] = df["url"].apply(_outfile_for)


In [13]:
# Preview again to check the newly added "outfile" column.
df.head()

Unnamed: 0,category,lat,lng,val,url,isapproved,outfile
0,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False,/root/code/data/rgb/21/536037/770357.png
1,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False,/root/code/data/rgb/21/536230/770499.png
2,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False,/root/code/data/rgb/21/536010/770410.png
3,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False,/root/code/data/rgb/21/536047/770306.png
4,Exclude,,,,https://geospatialraicstorage001.blob.core.win...,False,/root/code/data/rgb/21/536020/770554.png


In [15]:
url = df["url"].iloc[0]
outfile = df["outfile"].iloc[0]
sess = ClientSession()
await http_get(sess,url,outfile,SASkey)

sess.close()

<coroutine object ClientSession.close at 0x7fbc69b795c0>

In [None]:

sess = ClientSession()
res = http_get_parallel(sess, df, SASkey, not isImagery, outDir)
saved_csv = str(PurePath(outDir, "ref_" + Path(csvFile).stem + ".csv"))
sess.close()

try:
    pd.DataFrame(res).to_csv(saved_csv)
except:
    print("Failed to save reference CSV")
    return 1

print(f"Successfully saved csv to:\n{saved_csv}")