In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import os
import csv

import pandas as pd
import boto3
import tqdm

In [5]:
# Read cleaned_11000.csv into a DataFrame, then snapshot it in `ori`
# so the frame as loaded stays available alongside `df`.
df = pd.read_csv(filepath_or_buffer="cleaned_11000.csv")
ori = df.copy(deep=True)

In [9]:
# Keep only the rows whose malware_type is "adware", then pull their
# sha256 column out as a plain Python list.
adware_q = df[df["malware_type"].eq("adware")]
adware_list = adware_q.loc[:, "sha256"].tolist()
adware_list

['f042ff63e633b0601c73f289298b37381de793627f0c02937f1f6447aa8220a7',
 '199c1915e8cbabdee3cfc4b144a5bb8d197bbfff71a3fae2bd6174db933ff8cc',
 '6259a46c51563782bd378f8976dec01f7c21f1ed9092e02c71211ce1ad3bdf3e',
 '3034818950ea3762a5af34751ec2793e4d0afe70af069536a9d0d4f26dbe4c6b',
 'f54f2cd9a8b204857af2ae291f2bc817292db098f79141af333681db483c61dd',
 '65158c9ab753d0465cb7aa507026a72df2f6bb85d0f7a4f11c1031f87d5eac4b',
 '11b30d21ed483bd30e29267af75557f4b01625786cd940ab6f2a7c69c0a5e044',
 '79718d841372c9ac95e4bb8dd73b9fefdc2cac1dc8654fc522ca663831a0ec47',
 '07690a38b5267b6bfe423c754990717925e166e78cd5ee2617770e9a43c16ba3',
 '9ea992871f828c7f054837386068ae8927dd6cbda36f21a26669f60928055b47',
 '631983bcb3b2d09c9bf54076400fcbc651a67e921c8518596dc27470eb1ef305',
 '95dca0bc594a99f7937c240c790cb92cdbe0245a63bca67ccacfb4b9b3002018',
 'a6f590ad8a0d527bff62295e4351b7ed14701a3ee77399657344f1847f74b1a5',
 '4bb9a5845491182c2e4b0788dde9f00d73b23b4b9190bbb5e9ecfc9d2d39a059',
 '4a91fd267b5b08602961fdf813945dae

In [15]:
# Name of the S3 bucket the files are downloaded from.
AWS_BUCKET = "sorel-20m"
# Local directory that receives the downloaded files; note that the value
# is written with a trailing path separator.
OUTPUT_DIR = "C:\\Users\\swe19\\Desktop\\mal1\\adware\\"

def download_one_file(
    bucket: str,
    output: str,
    # Quoted on purpose: `boto3.client` is a factory function, not a class;
    # the objects it returns derive from botocore's BaseClient.
    client: "botocore.client.BaseClient",
    s3_file: str,
    s3_prefix: str = "09-DEC-2020/binaries/",
):
    """
    Download a single object from S3 into a local directory.

    The object key is ``s3_prefix + s3_file``. The local file is named after
    the last "/"-separated component of that key and is placed inside
    ``output``.

    Args:
        bucket (str): S3 bucket that holds the object
        output (str): Local directory to store the file in; a trailing path
            separator is accepted but no longer required
        client: S3 client exposing
            ``download_file(Bucket=..., Key=..., Filename=...)``
        s3_file (str): Object name relative to ``s3_prefix``
        s3_prefix (str): Key prefix prepended verbatim to ``s3_file``, so it
            must carry its own trailing "/"

    Returns:
        None

    Raises:
        Whatever ``client.download_file`` raises; nothing is caught here, so
        callers (e.g. a Future) see the original exception.
    """
    key = s3_prefix + s3_file
    local_name = key.rsplit("/", 1)[-1]
    # os.path.join instead of plain string concatenation: concatenation
    # glued the directory and file name together whenever `output` lacked a
    # trailing separator.
    destination = os.path.join(output, local_name)
    client.download_file(Bucket=bucket, Key=key, Filename=destination)


files_to_download = adware_list

# Create the destination up front: if it is missing, every single download
# would fail when the local file is opened.
if OUTPUT_DIR:
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# Creating only one session and one client
session = boto3.Session()
client = session.client("s3")
# The client is shared between threads
# NOTE(review): 32 workers is more than botocore's default connection pool
# (10, to my knowledge) — confirm whether the pool size should be raised.
func = partial(download_one_file, AWS_BUCKET, OUTPUT_DIR, client)

# Ids of failed downloads, in completion order, to retry later
failed_downloads = []
# id -> exception raised by its download, kept so failures can be diagnosed
# instead of being silently dropped
download_errors = {}

with tqdm.tqdm(desc="Downloading files from S3", total=len(files_to_download)) as pbar:
    with ThreadPoolExecutor(max_workers=32) as executor:
        # Map each future back to the id it is downloading so a failure can
        # be attributed to the right file
        futures = {
            executor.submit(func, file_to_download): file_to_download
            for file_to_download in files_to_download
        }
        for future in as_completed(futures):
            error = future.exception()
            if error is not None:
                failed_id = futures[future]
                failed_downloads.append(failed_id)
                download_errors[failed_id] = error
            pbar.update(1)

if failed_downloads:
    print("Some downloads have failed. Saving ids to csv")
    first_failed = failed_downloads[0]
    print(
        f"{len(failed_downloads)} of {len(files_to_download)} downloads failed; "
        f"first error ({first_failed}): {download_errors[first_failed]!r}"
    )
    # All failed ids are written as ONE quoted csv row (unchanged layout)
    with open("failed_downloads.csv", "w", newline="") as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(failed_downloads)

Downloading files from S3: 100%|███████████████████████████████████████████████████| 1000/1000 [00:41<00:00, 24.19it/s]

Some downloads have failed. Saving ids to csv





In [None]:
# Reference (multithreaded boto3 downloads): https://emasquil.github.io/posts/multithreading-boto3/