# Creating the files

In [11]:
import os 
def getmhtml(d="klarna_product_page_dataset_MHTML_50k/test", nations=("GB", "US")):
    """Yield the raw page source of every sample found under a dataset directory.

    The directory is walked as ``<d>/<nation>/<site>/<number>/`` and, for each
    sample directory, ``source.mhtml`` is read; when that file does not exist
    ``source.html`` in the same directory is read instead.

    Args:
        d: Root directory of the dataset split to walk.
        nations: Names of the nation sub-directories to include. Every other
            entry directly under ``d`` is ignored.

    Yields:
        ``(sample_id, text, is_mhtml)`` tuples, where ``sample_id`` is
        ``os.path.join(site, number)``, ``text`` is the whole file content and
        ``is_mhtml`` is ``True`` for ``source.mhtml`` and ``False`` for the
        ``source.html`` fallback.

    Raises:
        FileNotFoundError: If a sample directory contains neither file.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore anything batched from it) reproducible between runs.
    for nation in sorted(os.listdir(d)):
        nation_dir = os.path.join(d, nation)
        if nation not in nations or not os.path.isdir(nation_dir):
            continue
        for site in sorted(os.listdir(nation_dir)):
            site_dir = os.path.join(nation_dir, site)
            # Skip macOS metadata and any other stray non-directory entry.
            if ".DS_Store" in site or not os.path.isdir(site_dir):
                continue
            for number in sorted(os.listdir(site_dir)):
                sample_dir = os.path.join(site_dir, number)
                if ".DS_Store" in number or not os.path.isdir(sample_dir):
                    continue
                try:
                    with open(os.path.join(sample_dir, "source.mhtml")) as f:
                        text = f.read()
                    is_mhtml = True
                except FileNotFoundError:
                    # Build the fallback path explicitly instead of using
                    # str.replace on the full path, which would also rewrite
                    # a ".mhtml" occurring in a directory name.
                    with open(os.path.join(sample_dir, "source.html")) as f:
                        text = f.read()
                    is_mhtml = False
                # Yield only after the file is closed so no handle stays open
                # while the generator is suspended.
                yield os.path.join(site, number), text, is_mhtml

In [12]:
import re 
import quopri

def extract_html_content(text):
    """Return the first ``<!DOCTYPE html> ... </html>`` span found in ``text``.

    The match is case-sensitive and non-greedy, so it ends at the first
    ``</html>`` that follows the doctype; newlines inside the span are allowed.

    Args:
        text: String to search.

    Returns:
        The matched substring (doctype and closing tag included), or ``None``
        when ``text`` contains no such span.
    """
    # DOTALL lets "." cross line breaks, since HTML documents span many lines.
    html_span = re.compile(r'<!DOCTYPE html>.*?</html>', re.DOTALL)
    found = html_span.search(text)
    return found.group(0) if found is not None else None

def decode_quote(s):
    """Decode a quoted-printable encoded payload.

    Args:
        s: Quoted-printable data, as ``bytes`` or ``str``.

    Returns:
        The decoded payload as ``bytes`` (never ``str``); callers that need
        text must decode it themselves.
    """
    if isinstance(s, str):
        # quopri only accepts str input that is pure ASCII and raises
        # ValueError otherwise. Encoding first gives the same result for ASCII
        # input and lets stray non-ASCII characters pass through as UTF-8.
        s = s.encode("utf-8")
    return quopri.decodestring(s)

def extract_html(mhtml):
    """Extract the first HTML document embedded in an MHTML archive.

    The multipart boundary is read from a ``boundary="..."`` declaration in
    the first 10000 characters, the archive is split on that boundary, and the
    first part containing ``<!DOCTYPE html>`` with an extractable
    ``<!DOCTYPE html> ... </html>`` span is returned.

    Args:
        mhtml: Full text of the MHTML file.

    Returns:
        The HTML span of the first matching part. If that part mentions
        ``quoted-printable`` the span is passed through ``decode_quote`` and
        its result is returned as-is (quoted-printable decoding yields
        ``bytes``); otherwise the span is returned as ``str``. ``None`` is
        returned when no boundary declaration or no HTML part is found.
    """
    pattern = r'boundary="([^"]+)"'
    m = re.search(pattern, mhtml[:10000])
    if m is None:
        # No quoted boundary declaration: treat as "no HTML found" instead of
        # failing with AttributeError on m.group().
        return None
    boundary = m.group(1)
    for chunk in mhtml.split(boundary):
        if "<!DOCTYPE html>" not in chunk:
            continue
        content = extract_html_content(chunk)
        if content:
            # The part headers live in the same chunk, so a plain substring
            # test is used to detect the transfer encoding.
            if "quoted-printable" in chunk:
                content = decode_quote(content)
            return content
    return None
            

In [13]:
import pickle 

# Group the extracted HTML documents into lists of BATCH items and pickle each
# list to out/webleaf<i>.pkl, numbering the files from 0.
pack = []
BATCH = 100  # number of HTML documents stored per pickle file
i = 0

# Make sure the output directory exists before the first batch is written.
os.makedirs("out", exist_ok=True)

for path, mhtml, htm in getmhtml():
    # htm is True for MHTML archives, which need the HTML part extracted;
    # plain .html sources are stored unchanged.
    html = extract_html(mhtml) if htm else mhtml
    if html:
        pack.append(html)
    if len(pack) >= BATCH:
        with open(f'out/webleaf{i}.pkl', "wb") as f:
            pickle.dump(pack, f)
        pack = []
        i += 1

# Flush the final, partially filled batch; without this the last
# (fewer than BATCH) documents would never be written to disk.
if pack:
    with open(f'out/webleaf{i}.pkl', "wb") as f:
        pickle.dump(pack, f)
    pack = []
    i += 1

# Uploading the files

In [14]:
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket and report it.

    Args:
        bucket_name: ID of the GCS bucket, e.g. "your-bucket-name".
        source_file_name: Path of the local file to upload,
            e.g. "local/path/to/file".
        destination_blob_name: ID of the GCS object to create,
            e.g. "storage-object-name".
    """
    target = storage.Client().bucket(bucket_name).blob(destination_blob_name)

    # A generation-match precondition guards against race conditions and data
    # corruption: the upload request is aborted when the object's generation
    # number does not match. 0 is the value to use for a destination object
    # that does not exist yet; for an object that already exists, its current
    # generation number would have to be supplied instead.
    upload_options = {
        "if_generation_match": 0,
        "content_type": 'application/octet-stream',
    }
    target.upload_from_filename(source_file_name, **upload_options)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Upload every "webleaf" file found in out/ to the "webleaftest" bucket under
# its own file name; each directory entry is printed, uploaded or not.
for entry in os.listdir("out"):
    is_batch_file = "webleaf" in entry
    if is_batch_file:
        upload_blob("webleaftest", f'out/{entry}', entry)
    print(entry)

.DS_Store
File out/webleaf0.pkl uploaded to webleaf0.pkl.
webleaf0.pkl
File out/webleaf1.pkl uploaded to webleaf1.pkl.
webleaf1.pkl
File out/webleaf10.pkl uploaded to webleaf10.pkl.
webleaf10.pkl
File out/webleaf11.pkl uploaded to webleaf11.pkl.
webleaf11.pkl
File out/webleaf12.pkl uploaded to webleaf12.pkl.
webleaf12.pkl
File out/webleaf13.pkl uploaded to webleaf13.pkl.
webleaf13.pkl
File out/webleaf14.pkl uploaded to webleaf14.pkl.
webleaf14.pkl
File out/webleaf15.pkl uploaded to webleaf15.pkl.
webleaf15.pkl
File out/webleaf16.pkl uploaded to webleaf16.pkl.
webleaf16.pkl
File out/webleaf17.pkl uploaded to webleaf17.pkl.
webleaf17.pkl
File out/webleaf18.pkl uploaded to webleaf18.pkl.
webleaf18.pkl
File out/webleaf19.pkl uploaded to webleaf19.pkl.
webleaf19.pkl
File out/webleaf2.pkl uploaded to webleaf2.pkl.
webleaf2.pkl
File out/webleaf20.pkl uploaded to webleaf20.pkl.
webleaf20.pkl
File out/webleaf21.pkl uploaded to webleaf21.pkl.
webleaf21.pkl
File out/webleaf22.pkl uploaded to webl