<a href="https://colab.research.google.com/github/tmabgdata/Azure_Data_Lake_Pipeline/blob/master/save_data_blob.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Create Directory and Extract Data

In [1]:
!mkdir -p data

In [5]:
!pip install requests

import urllib.request
import requests

def extract_data(url, file_name, timeout=60):
    """
    Downloads data from a given URL and saves it to a file.

    The body is streamed to disk in chunks so large files are never held in
    memory. Data is first written to ``<file_name>.part`` and only moved to
    ``file_name`` once the whole download has succeeded, so an interrupted
    transfer never leaves a truncated file at the destination.

    Args:
        url (str): The URL of the data to download.
        file_name (str): The path to the file where the data will be saved.
        timeout (float | tuple | None): Passed straight to ``requests.get``.
            Seconds to wait for the server before giving up; ``None`` waits
            indefinitely (the behaviour before this parameter existed).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (for example 403 or 404).
        requests.RequestException: On connection failures or timeouts.
        OSError: If the destination file cannot be written.
    """
    import os  # local import: this cell runs before the notebook-level `import os`

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }  # Adding a User-Agent header to mimic a web browser
    partial_path = f"{file_name}.part"

    # With stream=True the connection stays checked out until the response is
    # closed; the context manager releases it even when an exception is raised.
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # raise an exception for bad responses (like 403)
        try:
            with open(partial_path, 'wb') as f:
                # write in chunks to avoid memory issues with large files
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        except BaseException:
            # Do not leave a half-written download behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # Publish the finished file in one step.
    os.replace(partial_path, file_name)



In [6]:
# Download the CSV at this URL and store it as data/boston_data_2015.csv.
extract_data(
    url='https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/c9509ab4-6f6d-4b97-979a-0cf2a10c922b/download/tmphrybkxuh.csv',
    file_name='data/boston_data_2015.csv',
)

In [7]:
# (source URL, destination file) pairs for the remaining yearly CSV files.
remaining_downloads = [
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/b7ea6b1b-3ca4-4c5b-9713-6dc1db52379a/download/tmpzxzxeqfb.csv', 'data/boston_data_2016.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/30022137-709d-465e-baae-ca155b51927d/download/tmpzccn8u4q.csv', 'data/boston_data_2017.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2be28d90-3a90-4af1-a3f6-f28c1e25880a/download/tmp7602cia8.csv', 'data/boston_data_2018.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv', 'data/boston_data_2019.csv'),
    ('https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv', 'data/boston_data_2020.csv'),
]

# Download them one after another, in the order listed above.
for source_url, destination in remaining_downloads:
    extract_data(source_url, destination)

## Zipping Files

In [8]:
import shutil

def zip_folder(folder_path, zip_path):
    """
    Compress the contents of a folder into a ZIP archive.

    Args:
        folder_path (str): Directory to archive. Entries inside the archive
            are stored relative to this directory.
        zip_path (str): Archive path WITHOUT the ``.zip`` extension;
            ``shutil.make_archive`` appends the extension itself.

    Returns:
        str: Path of the archive that was written, as reported by
        ``shutil.make_archive``. Returning it lets callers use the real
        location instead of hard-coding it.
    """
    return shutil.make_archive(base_name=zip_path, format='zip', root_dir=folder_path)

In [9]:
# Archive the same relative `data` folder the downloads were written to, so the
# cell does not depend on the working directory being Colab's /content.
# The archive is written to boston_data.zip in the current working directory.
zip_folder('data', 'boston_data')

## Connect to the Azure Storage Account

In [10]:
!pip install azure-storage-blob azure-identity

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.24.0-py3-none-any.whl.metadata (26 kB)
Collecting azure-identity
  Downloading azure_identity-1.19.0-py3-none-any.whl.metadata (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.6/80.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core>=1.30.0 (from azure-storage-blob)
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting msal>=1.30.0 (from azure-identity)
  Downloading msal-1.31.1-py3-none-any.whl.metadata (11 kB)
Collecting msal-extensions>=1.2.0 (from azure-identity)
  Downloading msal_extensions-1.2.0-py3-none-any.whl.metadata (7.6 kB)
Collecting portalocker<3,>=1.4 (from msal-extensions>=1.2.0->azure-identity)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading azure_storage_blob-12.24.0-py3-none-any.whl (40

In [12]:
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import os

## Upload file to Azure Blob Storage

- [Get started with Azure Blob Storage and Python](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blob-python-get-started?tabs=azure-ad)

- [Upload a block blob with Python](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blob-upload-python)

In [27]:
def get_blob_service_client_account_key(account_url=None, shared_access_key=None):
    """
    Build a BlobServiceClient authenticated with a storage account key.

    Secrets are no longer embedded in the notebook. Each value is taken from
    the argument when given, otherwise from an environment variable:

    * ``AZURE_STORAGE_ACCOUNT_URL`` - account endpoint, e.g.
      ``https://<storage_account>.blob.core.windows.net`` (the account URL,
      not including the container).
    * ``AZURE_STORAGE_KEY`` - the account's shared access key.

    Args:
        account_url (str | None): Storage account URL; overrides the
            environment variable when provided.
        shared_access_key (str | None): Shared access key; overrides the
            environment variable when provided.

    Returns:
        BlobServiceClient: Client bound to the given account and key.

    Raises:
        ValueError: If either value is missing from both the arguments and
            the environment.
    """
    account_url = account_url or os.environ.get("AZURE_STORAGE_ACCOUNT_URL")
    shared_access_key = shared_access_key or os.environ.get("AZURE_STORAGE_KEY")

    # Fail early with an actionable message instead of a confusing
    # authentication error on the first request.
    missing = [
        env_name
        for env_name, value in (
            ("AZURE_STORAGE_ACCOUNT_URL", account_url),
            ("AZURE_STORAGE_KEY", shared_access_key),
        )
        if not value
    ]
    if missing:
        raise ValueError(
            "Missing Azure storage configuration: pass the value as an argument "
            "or set the environment variable(s) " + ", ".join(missing)
        )

    # Create the BlobServiceClient object
    return BlobServiceClient(account_url, credential=shared_access_key)

def upload_blob_file(blob_service_client, container_name, file_path, file_name):
    """
    Upload a local file to a blob container, replacing any existing blob.

    Args:
        blob_service_client: Client exposing ``get_container_client``
            (a ``BlobServiceClient``).
        container_name (str): Name of the target container.
        file_path (str): Path of the local file to upload.
        file_name (str): Name the blob gets inside the container.

    Returns:
        The object returned by ``ContainerClient.upload_blob`` (a client for
        the uploaded blob), so callers can inspect or reuse it.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    container_client = blob_service_client.get_container_client(container=container_name)
    # The upload must finish while the file handle is still open.
    with open(file_path, mode="rb") as data:
        # overwrite=True is passed so re-running the notebook replaces the blob.
        return container_client.upload_blob(name=file_name, data=data, overwrite=True)

# (Re-)create the client so it picks up the current account details.
blob_service_client = get_blob_service_client_account_key()

# Upload the archive from the working directory, which is where the zipping
# step writes boston_data.zip; the previous absolute /content/ path only
# matched when the notebook ran with /content as its working directory.
upload_blob_file(blob_service_client, 'zipserviceboston', 'boston_data.zip', 'boston_data.zip')