In [20]:
import tarfile
import pandas as pd
import requests
from zipfile import ZipFile
from typing import List
import requests
from io import BytesIO

In [22]:
def extract_csv_files(file_url_path: str, headers: List[str],
                      timeout: float = 60.0) -> pd.DataFrame:
    """Download a ``.zip`` or ``.tar.gz`` archive and stack its members as CSV.

    Every regular file inside the archive is parsed with ``pd.read_csv``; the
    resulting frames are concatenated row-wise (original per-file row indices
    are kept) and the combined frame's columns are replaced by ``headers``.

    Args:
        file_url_path: URL of the archive. The suffix selects the reader:
            ``.tar.gz`` or ``.zip``.
        headers: Column labels assigned to the combined frame; the length must
            match the number of parsed columns.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        One DataFrame holding the rows of every file in the archive.

    Raises:
        ValueError: If ``file_url_path`` ends with neither ``.tar.gz`` nor
            ``.zip`` (checked before anything is downloaded). pandas also
            raises ``ValueError`` when the archive holds no files to
            concatenate or ``headers`` has the wrong length.
        requests.HTTPError: If the server answers with an error status.
    """
    is_tar_gz = file_url_path.endswith(".tar.gz")
    is_zip = file_url_path.endswith(".zip")
    if not (is_tar_gz or is_zip):
        # Previously this fell through to an UnboundLocalError on `output`.
        raise ValueError(
            f"Unsupported archive type (expected .tar.gz or .zip): {file_url_path}")

    response = requests.get(file_url_path, timeout=timeout)
    # Fail on HTTP errors here instead of trying to unpack an error page.
    response.raise_for_status()
    payload = BytesIO(response.content)

    if is_tar_gz:
        # NOTE(review): members of .tar.gz archives are read with pandas'
        # default header inference (first row becomes the header), whereas
        # .zip members are read with header=None. Kept as-is -- confirm this
        # asymmetry matches the actual file contents.
        with tarfile.open(fileobj=payload, mode='r:gz') as tar:
            # Skip directories and other non-file members: extractfile()
            # returns None for them, which read_csv cannot parse.
            frames = [pd.read_csv(tar.extractfile(member))
                      for member in tar.getmembers() if member.isfile()]
    else:
        frames = []
        with ZipFile(payload) as zip_file:
            for member in zip_file.infolist():
                if member.is_dir():
                    # Directory entries are empty and would fail to parse.
                    continue
                with zip_file.open(member) as csv_stream:
                    frames.append(pd.read_csv(csv_stream, header=None))

    output = pd.concat(frames)
    output.columns = headers

    return output

In [23]:

# Remote archives of raw Aarhus traffic data to download (one .zip, one .tar.gz).
traffic_url_archives = [
    "http://iot.ee.surrey.ac.uk:8080/datasets/traffic/traffic_oct_nov/citypulse_traffic_raw_data_aarhus_oct_nov_2014.zip",
    "http://iot.ee.surrey.ac.uk:8080/datasets/traffic/traffic_june_sep/citypulse_traffic_raw_data_aarhus_aug_sep_2014.tar.gz",
]

# Column labels applied, in order, to every frame read from the archives.
headers = [
    "status",
    "avg_measured_time",
    "avg_speed",
    "ext_id",
    "median_measured_time",
    "timestamp",
    "vehicle_count",
    "_id",
    "report_id",
]

In [24]:
# Download and parse each archive in turn, then stack the per-archive frames
# row-wise into a single DataFrame.
traffic_data = pd.concat(
    extract_csv_files(url, headers) for url in traffic_url_archives
)

In [25]:
# (rows, columns) of the combined frame.
traffic_data.shape

(11520409, 9)

In [26]:
# Preview the first rows of the combined frame to sanity-check the column labels.
traffic_data.head()

Unnamed: 0,status,avg_measured_time,avg_speed,ext_id,median_measured_time,timestamp,vehicle_count,_id,report_id
0,OK,78,47,668,78,2014-10-01T01:40:00,0,28071995,158324
1,OK,78,47,668,78,2014-10-01T01:45:00,0,28072438,158324
2,OK,76,48,668,76,2014-10-01T01:50:00,1,28072887,158324
3,OK,76,48,668,76,2014-10-01T01:55:00,1,28073334,158324
4,OK,60,61,668,60,2014-10-01T02:05:00,0,28074100,158324


In [27]:
# Missing data: number of NA values in each column of the combined frame.
traffic_data.isna().sum()

status                  0
avg_measured_time       0
avg_speed               0
ext_id                  0
median_measured_time    0
timestamp               0
vehicle_count           0
_id                     0
report_id               0
dtype: int64

In [28]:
# Write the combined frame to a CSV file; the path is relative to the
# notebook's working directory.
# NOTE(review): to_csv writes the index by default, and the frames were
# concatenated without ignore_index, so the saved index presumably repeats
# per source file -- confirm whether downstream readers need that column.
traffic_data.to_csv("../data/aarhus_traffic_data_aug_nov_2014.csv")