# raw_to_text monthly data

Take the most recent month's mbox file and convert it into a CSV for analysis.

In [2]:
import mailbox
import os
import sys
import gzip
import pandas as pd
import boto3
from pathlib import Path
from dotenv import load_dotenv
load_dotenv("../../.env")

sys.path.append('../..')

from src import utils

In [5]:
# Root of the local data tree; LOCAL_DATA_PATH overrides the default.
BASE_PATH = os.environ.get("LOCAL_DATA_PATH", "../../data")
# Directory that is searched for the raw mbox files.
path = Path(BASE_PATH) / 'raw' / 'fedora-devel-list'
# File names (not full paths) of every *.mbox file found in that directory.
mboxes = [mbox_file.name for mbox_file in path.glob('*.mbox')]

In [6]:
def mbox_to_text(mbox):
    """Collect the ID, date and body of every message in a mailbox.

    Args:
        mbox: An iterable of ``email.message.Message`` objects, such as a
            ``mailbox.mbox`` instance.

    Returns:
        A ``pandas.DataFrame`` with one row per message and the columns
        ``Message-ID``, ``Date`` and ``Body``. ``Body`` is a list holding
        the payload of each top-level part of the message; a message that
        is not multipart yields a one-element list with its payload.
    """
    rows = []
    for msg in mbox:
        payload = msg.get_payload()
        if msg.is_multipart():
            # A multipart payload is a list of sub-messages; keep the
            # payload of each one.
            body = [part.get_payload() for part in payload]
        else:
            # A single-part payload is the body itself, not a list of
            # parts. Iterating it would walk a string character by
            # character and fail because ``str`` has no ``get_payload``.
            body = [payload]
        rows.append((msg["Message-ID"], msg["Date"], body))

    return pd.DataFrame(rows, columns=['Message-ID', "Date", "Body"])

In [7]:
# Ensure dataset location exists
dataset_base_path = Path(f"{BASE_PATH}/interim/text")
dataset_base_path.mkdir(parents=True, exist_ok=True)


# Register all created dataset slices for later upload
new_files = []

# `mboxes` holds the mbox file names found under `path`; each one is
# converted to a CSV named `<mbox file name>.csv`.
for mbox in mboxes:
    output_path = dataset_base_path.joinpath(f'{mbox}.csv')
    monthly_mbox = mailbox.mbox(path.joinpath(mbox))
    try:
        df = mbox_to_text(monthly_mbox)
    finally:
        # Close the mailbox's underlying file even if the conversion
        # fails, so handles do not pile up across the loop.
        monthly_mbox.close()
    df.to_csv(output_path)
    new_files.append(output_path)
    print(f"{output_path} saved")

../../data/interim/text/fedora-devel-2018-3.mbox.csv saved
../../data/interim/text/fedora-devel-2018-5.mbox.csv saved
../../data/interim/text/fedora-devel-2018-6.mbox.csv saved
../../data/interim/text/fedora-devel-2018-7.mbox.csv saved
../../data/interim/text/fedora-devel-2018-8.mbox.csv saved
../../data/interim/text/fedora-devel-2018-9.mbox.csv saved
../../data/interim/text/fedora-devel-2019-7.mbox.csv saved
../../data/interim/text/fedora-devel-2019-8.mbox.csv saved
../../data/interim/text/fedora-devel-2019-9.mbox.csv saved
../../data/interim/text/fedora-devel-2019-10.mbox.csv saved
../../data/interim/text/fedora-devel-2019-11.mbox.csv saved
../../data/interim/text/fedora-devel-2019-12.mbox.csv saved
../../data/interim/text/fedora-devel-2020-1.mbox.csv saved
../../data/interim/text/fedora-devel-2020-2.mbox.csv saved
../../data/interim/text/fedora-devel-2020-3.mbox.csv saved
../../data/interim/text/fedora-devel-2020-4.mbox.csv saved
../../data/interim/text/fedora-devel-2020-5.mbox.csv saved

In [8]:
# Push to Ceph
if os.environ.get('RUN_IN_AUTOMATION'):
    # Lazily pair each saved CSV's POSIX path with the
    # `interim/text/<file name>` string handed to the uploader with it.
    upload_pairs = (
        (csv_path.as_posix(), f'interim/text/{Path(csv_path).name}')
        for csv_path in new_files
    )
    utils.upload_files(upload_pairs)