In [1]:
import os
import shutil
from pathlib import Path
import zipfile
from xml.etree import ElementTree as ET
from datetime import datetime

# Intro to unzipping
## With and without the `with` statement

In [2]:
# extract the whole archive into the temp folder;
# the with block closes the archive as soon as the block is left
with zipfile.ZipFile("to_process/Test_001_20201027.zip") as zip_ref:
    zip_ref.extractall(path="temp")

In [3]:
# using the `with` statement ensures that the file is closed after processing
zip_ref

<zipfile.ZipFile [closed]>

In [4]:
zip_ref = zipfile.ZipFile(r"to_process/Test_001_20201027.zip", 'r')
# extractall returns None, so there is nothing worth assigning to a variable
zip_ref.extractall("temp")

# without a `with` block the archive stays open after processing
zip_ref

<zipfile.ZipFile filename='to_process/Test_001_20201027.zip' mode='r'>

In [5]:
# unless you intentionally close it
zip_ref.close()
zip_ref

<zipfile.ZipFile [closed]>

In [6]:
# even though the archive is closed, you can still list the names of its members:
zip_ref.namelist()

['a.xml']

In [7]:
# archive containing a file in a folder and a second file at the top level;
# open it in a with block so the file handle is closed once the names are read
with zipfile.ZipFile(r"to_process/Folder_001_20201101.zip", 'r') as folder_archive:
    folder_archive_names = folder_archive.namelist()
folder_archive_names

['efg.txt', 'folder_A/a.xml']

# Unzip multiple archives in a folder

In [8]:
folder = "to_process"

In [9]:
# os.listdir returns a list with all files and folders contained in a folder
os.listdir(folder)

['Folder_001_20201101.zip',
 'Test_001_20201027.zip',
 'twoXMLs_001_20201029.zip']

In [10]:
# to get only the .zip files, use a list comprehension and the str.endswith method
[f for f in os.listdir(folder) if f.endswith(".zip")]

['Folder_001_20201101.zip',
 'Test_001_20201027.zip',
 'twoXMLs_001_20201029.zip']

In [11]:
# alternatively you can use os.path.splitext
[f for f in os.listdir(folder) if os.path.splitext(f)[1] == ".zip"]

['Folder_001_20201101.zip',
 'Test_001_20201027.zip',
 'twoXMLs_001_20201029.zip']

## break down the list comprehension

In [12]:
# a list comprehension is a concise way to build a list from a for loop;
# the loop below is the expanded equivalent of the comprehension used above

# start with an empty list
output = []

# iterate over the names of the files/subfolders in the folder
for f in os.listdir(folder):
    
    # keep only names that end with the string ".zip"
    if f.endswith(".zip"):
        
        # append the matching name to the output
        output.append(f)
        
output

['Folder_001_20201101.zip',
 'Test_001_20201027.zip',
 'twoXMLs_001_20201029.zip']

## Unzip all zips in the folder

In [13]:
# extract every .zip archive found in `folder` into the temp folder
for zip_file in filter(lambda name: name.endswith(".zip"), os.listdir(folder)):
    archive_path = os.path.join(folder, zip_file)
    with zipfile.ZipFile(archive_path) as zip_ref:
        zip_ref.extractall(path="temp")

You can wrap the unzipping into a function which would squeeze the final code into a one-liner

In [14]:
def unzip(folder: str, file: str, folder_to_extract: str) -> list:
    """Extract the archive ``file`` located in ``folder`` into ``folder_to_extract``.

    Returns the list of member names stored in the archive.
    """
    archive_path = os.path.join(folder, file)
    with zipfile.ZipFile(archive_path) as archive:
        members = archive.namelist()
        archive.extractall(path=folder_to_extract)
    return members

In [15]:
# applying a function to the output can be squeezed into the list comprehension
[unzip(folder, f, "temp") for f in os.listdir(folder) if f.endswith(".zip")]

[['efg.txt', 'folder_A/a.xml'], ['a.xml'], ['a.xml', 'b.xml']]

# Update unzipped XML

In [16]:
folder = "temp"
file = "a.xml"

# values to which we update
new_prefix = "updated"
new_version = 3

In [17]:
# path to the unzipped xml in the temp folder
path = os.path.join(folder, file)

# load the xml
tree = ET.parse(path)
root = tree.getroot()

# the wrapping tag <data> is the root element of our loaded xml
root

<Element 'data' at 0x0000027560749368>

In [18]:
# in the <data> find the <id>...</id> node and show its content (.text)
id = root.find("id").text
id

'xyz_001_20201029'

In [19]:
# split the id by "_" underscore
split_id = id.split("_")

# update the values
split_id[0] = new_prefix
split_id[1] = "{:03d}".format(new_version) # prefix with leading zeros up to 3 position if needed

# update the xml content
root.find("id").text = "_".join(split_id)

In [20]:
# elements obtained from root are references into the parsed tree (not copies),
# so tree.write writes the updated content back to the path
tree.write(path)

## Wrap to function

In [21]:
def update_id(id: str, new_prefix: str, new_version: int) -> str:
    """Replace the first two "_"-separated parts of ``id``.

    The first part becomes ``new_prefix`` and the second part becomes
    ``new_version`` zero-padded to three digits; any remaining parts are kept.
    """
    parts = id.split("_")
    # the version is left-padded with zeros to a width of 3, e.g. 3 -> "003"
    parts[0], parts[1] = new_prefix, format(new_version, "03d")
    return "_".join(parts)

def update_xml(path: str, new_prefix: str, new_version: int) -> None:
    """Rewrite the text of the ``<id>`` element of the XML file at ``path``.

    The file is parsed, the ``<id>`` child of the root element gets the value
    returned by ``update_id`` for its current text, and the document is written
    back to the same ``path``.
    """
    document = ET.parse(path)

    # look the <id> node up once and reuse it for both reading and writing
    id_node = document.getroot().find("id")
    id_node.text = update_id(id_node.text, new_prefix, new_version)

    # id_node belongs to the parsed document, so writing the document
    # stores the modified id
    document.write(path)

In [22]:
update_xml(path, new_prefix, new_version)

# Zip to the output folder

In [23]:
temp_folder = "temp"
output_folder = "processed"
temp_file = "a.xml"
output_file_name= "updated_003_20201026.zip"

In [24]:
processed_xml = os.path.join(temp_folder, temp_file)
output_path = os.path.join(output_folder, output_file_name)

In [25]:
# create the output folder if it doesn't exist
Path(output_folder).mkdir(parents=True, exist_ok=True)

In [26]:
# open archive for writing
with zipfile.ZipFile(output_path, 'w') as myzip:
    # write our processed xml to it, under its file name only (not the full path)
    myzip.write(processed_xml, os.path.basename(processed_xml))

In [27]:
# without an explicit arcname the xml is stored under the path as given ("temp/a.xml");
# note that mode 'w' replaces the archive written in the previous cell
with zipfile.ZipFile(output_path, 'w') as myzip:
    myzip.write("temp/a.xml")

# Delete files in the temp folder

In [28]:
# remove files and folders (including content) in the directory 
# https://stackoverflow.com/questions/185936/how-to-delete-the-contents-of-a-folder
def remove_all(folder):
    """Empty ``folder`` while keeping the folder itself.

    Files and symbolic links are removed with os.unlink; sub-folders are
    removed together with their content using shutil.rmtree. If an entry
    cannot be deleted, the reason is printed and the remaining entries are
    still processed.
    """
    for entry in os.listdir(folder):
        target = os.path.join(folder, entry)
        try:
            is_file_or_link = os.path.isfile(target) or os.path.islink(target)
            if is_file_or_link:
                os.unlink(target)
            elif os.path.isdir(target):
                shutil.rmtree(target)
        except Exception as error:
            print('Failed to delete %s. Reason: %s' % (target, error))

In [29]:
remove_all("temp")

# Handle zipped folders
So far our zip archive contained only a single file, but an archive can also contain folders.

In [30]:
# unzip an archive that contains a folder
archive = "Folder_001_20201101.zip"
unzipped = unzip("to_process", archive, temp_folder)
unzipped

['efg.txt', 'folder_A/a.xml']

In [31]:
# update the id inside every extracted xml file
for file in unzipped:
    if file.endswith(".xml"):
        path = os.path.join(temp_folder, file)
        # update_xml rewrites the file on disk; update_id would only build a
        # new id string from the path and leave the xml untouched
        update_xml(path, new_prefix, new_version)

In [32]:
# create a new id based on original archive name
new_archive_name = update_id(archive, new_prefix, new_version)
new_archive_name

'updated_003_20201101.zip'

In [33]:
# list all files including the subdirectories
# https://stackoverflow.com/questions/2909975/python-list-directory-subdirectory-and-files
for path, subdirs, files in os.walk(temp_folder):
    for name in files:
        print(os.path.join(path, name))

temp\efg.txt
temp\folder_A\a.xml


In [34]:
def zipdir(path, output_archive_path):
    """Pack every file found under ``path`` (recursively) into a new zip archive.

    Each file is added under the path produced by os.walk, so ``path`` itself
    shows up as the top-level folder inside the archive. An existing file at
    ``output_archive_path`` is overwritten (mode 'w').
    """
    with zipfile.ZipFile(output_archive_path, 'w') as archive:
        # lazily yield the path of every file below `path`
        members = (
            os.path.join(current_dir, filename)
            for current_dir, _subdirs, filenames in os.walk(path)
            for filename in filenames
        )
        for member in members:
            archive.write(member)

In [35]:
# unfortunately this stores the temp folder as the root directory inside the archive
zipdir(temp_folder, os.path.join(output_folder, new_archive_name))

In [36]:
# to zip only the content of the temp folder, use shutil.make_archive;
# it appends the ".zip" extension itself, so pass the name without it
# (os.path.splitext strips only the trailing extension, whereas
# str.replace would remove every ".zip" occurring anywhere in the name)
shutil.make_archive(os.path.join(output_folder, os.path.splitext(new_archive_name)[0]), 'zip', temp_folder)

'C:\\Users\\v.dekanovsky\\Python Queries\\Medium\\Unzip\\processed\\updated_003_20201101.zip'

# Put it all together
## Unzip, update and zip XML in a folder using python

In [37]:
source_folder = "to_process"
temp_folder = "temp"
output_folder = "processed"

# values to which we update
new_prefix = "updated"
new_version = 3

In [38]:
# create the output folder if it doesn't exist
Path(output_folder).mkdir(parents=True, exist_ok=True)

In [39]:
archives_to_process = [f for f in os.listdir(source_folder) if f.endswith(".zip")]
archives_to_process

['Folder_001_20201101.zip',
 'Test_001_20201027.zip',
 'twoXMLs_001_20201029.zip']

In [40]:
for archive in archives_to_process:
    print(archive)

    # the output archive is named after the original archive, with the new
    # prefix and version applied
    new_archive_name = update_id(archive, new_prefix, new_version)

    # start from an empty temp folder so that leftovers from earlier cells or
    # a previous run are not packed into this archive
    Path(temp_folder).mkdir(parents=True, exist_ok=True)
    remove_all(temp_folder)

    try:
        # unzip to temp folder
        extracted = unzip(source_folder, archive, temp_folder)

        # iterate over the extracted files
        for extracted_file in extracted:
            print(extracted_file)

            # only xml files carry an id to update; other files are repacked as they are
            if extracted_file.endswith(".xml"):
                update_xml(os.path.join(temp_folder, extracted_file), new_prefix, new_version)

        # zip the content of the temp folder again; make_archive appends ".zip"
        # itself, so only the trailing extension is stripped from the name
        shutil.make_archive(os.path.join(output_folder, os.path.splitext(new_archive_name)[0]), 'zip', temp_folder)
    finally:
        # remove the extracted files even if processing failed, so that they
        # are not packed into the following archives
        remove_all(temp_folder)

Folder_001_20201101.zip
efg.txt
folder_A/a.xml
Test_001_20201027.zip
a.xml
twoXMLs_001_20201029.zip
a.xml
b.xml
