In [None]:
import tarfile
import pandas as pd
import requests
from zipfile import ZipFile
from typing import List

In [None]:
def extract_csv_files(archive_path: str, headers: List[str]) -> pd.DataFrame:
    """Read every CSV file stored in an archive into one DataFrame.

    The archive format is chosen from the file extension: ``.tar.gz``
    (gzip-compressed tarball) or ``.zip``. Directory entries inside the
    archive are skipped; every other member is parsed as CSV.

    Args:
        archive_path: Path to the ``.tar.gz`` or ``.zip`` archive.
        headers: Column names assigned to the combined frame. Its length
            must equal the number of parsed columns.

    Returns:
        The row-wise concatenation of all member files, with ``headers`` as
        column names. Each member keeps its own row index (it is not reset).

    Raises:
        ValueError: If ``archive_path`` has an unsupported extension, or if
            the archive contains no files (raised by ``pd.concat``).
    """
    if archive_path.endswith(".tar.gz"):
        with tarfile.open(archive_path, "r:gz") as tar:
            # extractfile() returns None for members that carry no data
            # (e.g. directories); feeding None to read_csv would raise.
            streams = (tar.extractfile(member) for member in tar.getmembers())
            # NOTE(review): the header row is inferred here, while the zip
            # branch reads with header=None -- confirm this difference is
            # intentional for the two raw datasets.
            frames = [pd.read_csv(stream)
                      for stream in streams if stream is not None]
    elif archive_path.endswith(".zip"):
        # Context managers make sure the archive and each member stream are
        # closed even if parsing fails.
        with ZipFile(archive_path) as zip_file:
            frames = []
            for info in zip_file.infolist():
                if info.is_dir():
                    continue  # a folder entry is an empty stream, not a CSV
                with zip_file.open(info) as stream:
                    frames.append(pd.read_csv(stream, header=None))
    else:
        raise ValueError(
            f"Unsupported archive type (expected .tar.gz or .zip): {archive_path}"
        )

    output = pd.concat(frames)
    output.columns = headers

    return output

In [None]:
# Raw traffic archives to load, given as paths relative to this notebook.
traffic_file_archives = [
    "../raw_datasets/citypulse_traffic_raw_data_aarhus_aug_sep_2014.tar.gz",
    "../raw_datasets/citypulse_traffic_raw_data_aarhus_oct_nov_2014.zip",
]

# Column names applied to the data extracted from each archive.
headers = [
    "status",
    "avg_measured_time",
    "avg_speed",
    "ext_id",
    "median_measured_time",
    "timestamp",
    "vehicle_count",
    "_id",
    "report_id",
]

In [None]:
# Read each archive with the shared column names, then stack all rows
# into a single frame.
traffic_data = pd.concat(
    [
        extract_csv_files(archive_path, headers)
        for archive_path in traffic_file_archives
    ]
)

In [None]:
# (rows, columns) of the combined traffic frame.
traffic_data.shape

In [None]:
# Preview the first rows to sanity-check the assigned column names.
traffic_data.head()

In [None]:
# Count the missing (NaN) values in each column.
traffic_data.isna().sum()

In [None]:
# Persist the combined frame as CSV. to_csv writes the row index as the first
# (unnamed) column by default.
# NOTE(review): the concat calls that build traffic_data do not pass
# ignore_index, so index labels presumably repeat across source files --
# confirm downstream readers expect that column.
traffic_data.to_csv("../data/aarhus_traffic_data_aug_nov_2014.csv")