In [2]:
import urllib.request
import calendar
import glob
import pandas as pd
import subprocess
from dateutil import parser

In [18]:
def get_file_urls(file_prefix):
    """Build the URL and file name of every monthly report for one prefix.

    One entry is produced for each month from January 2013 through
    December 2017, in chronological order.

    Args:
        file_prefix: Leading part of the report file name. It is inserted
            into the URL verbatim; no percent-encoding is applied here.

    Returns:
        A list of ``(file_url, file_name)`` tuples.
    """
    base_url = "https://aineistot.liikennevirasto.fi/lam/reports/LAM/"
    file_names = []
    for y in range(13, 18):
        for m in range(1, 13):
            # The directory is named after the 11th of the month following
            # the reported month. December must roll over to January of the
            # next year; a plain ``m + 1`` would yield the invalid month 13.
            # NOTE(review): presumably December reports live in next
            # January's directory - confirm against the server listing.
            if m == 12:
                dir_year, dir_month = y + 1, 1
            else:
                dir_year, dir_month = y, m + 1
            dir_name = "20{:02}{:02}11/".format(dir_year, dir_month)
            # Number of days in the reported month (handles leap years).
            last_day = calendar.monthrange(2000 + y, m)[1]
            file_name = file_prefix + "_20{:02}{:02}01_20{:02}{:02}{:02}.xls".format(
                y, m, y, m, last_day
            )
            file_names.append((base_url + dir_name + file_name, file_name))
    return file_names

In [15]:
def download_xls_files(file_prefix, to_location="./data/"):
    """Download every report listed by ``get_file_urls`` for one prefix.

    Downloads are best-effort: a report that cannot be fetched is reported
    on stdout and the loop continues with the next one.

    Args:
        file_prefix: Passed unchanged to ``get_file_urls``.
        to_location: Destination prefix. The file name is appended by plain
            string concatenation, so it should end with a path separator.
    """
    # Explicit local import so ``urllib.error`` is guaranteed to be bound.
    import urllib.error

    for url, file_name in get_file_urls(file_prefix):
        try:
            urllib.request.urlretrieve(url, to_location + file_name)
        except urllib.error.URLError:
            # Covers HTTP errors (e.g. 404) and unreachable hosts.
            print(url + " not found")
        except OSError as err:
            # Local problems (e.g. missing target directory) are reported
            # as such instead of being mislabelled as "not found".
            print(url + " failed: {}".format(err))
        else:
            print(file_name + " downloaded")

In [23]:
def convert_xls_to_csv(at_location="./data2/"):
    """Convert every ``.xls`` file under a directory prefix to CSV.

    Each file is passed to the external ``ssconvert`` command; the CSV is
    written next to the source file with the extension swapped. A non-zero
    exit status is reported on stdout and the remaining files are still
    processed.

    Args:
        at_location: Directory prefix used to build the ``*.xls`` glob
            pattern; it should end with a path separator.
    """
    for xls_path in glob.glob("{}*.xls".format(at_location)):
        # The glob guarantees a ".xls" suffix, so dropping three characters
        # leaves the trailing dot in place for the new extension.
        csv_path = xls_path[:-3] + "csv"
        exit_code = subprocess.call(["ssconvert", xls_path, csv_path])
        if exit_code != 0:
            print("ssconvert failed for {} (exit code {})".format(xls_path, exit_code))

In [33]:
# Column names applied to every loaded CSV: five identifying fields followed
# by one column per hour, 'hour_1' through 'hour_24'.
columns = (
    ['location_id', 'location_name', 'date', 'direction', 'vehicle_type']
    + ['hour_{}'.format(hour) for hour in range(1, 25)]
)

# Directories scanned for CSV files by read_csv_files_to_dataframe.
data_dirs = [
    './data1/',
    './data2/',
    './data3/',
]

def read_csv_files_to_dataframe():
    """Load every CSV found in ``data_dirs`` into a single DataFrame.

    Each file's header is replaced with the names in ``columns`` before the
    frames are stacked row-wise. Every file keeps its own row index, so
    index values repeat across files in the result.

    Returns:
        The concatenated DataFrame, or an empty DataFrame with ``columns``
        as its columns when no CSV file is found.

    Raises:
        ValueError: Raised by pandas when a file's column count does not
            match ``len(columns)``.
    """
    frames = []
    for data_dir in data_dirs:
        # Sorted so the row order does not depend on directory listing order.
        for csv_path in sorted(glob.glob("{}*.csv".format(data_dir))):
            frame = pd.read_csv(csv_path)
            frame.columns = columns
            frames.append(frame)
    # Concatenate once, after all directories have been scanned;
    # pd.concat raises ValueError on an empty list, so guard that case.
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)

In [34]:
def export_dataframe_to_csv(df):
    """Write ``df`` to ``raw_dataset.csv`` in the working directory.

    The row index is not written to the file.

    Args:
        df: The DataFrame to export.
    """
    output_path = "raw_dataset.csv"
    df.to_csv(output_path, index=False)

In [24]:
# File-name prefixes passed to download_xls_files (kept for reference;
# non-ASCII letters are percent-encoded).
#file_prefixes = ["168_kt50_Askisto", "110_M%c3%84NTS%c3%84L%c3%84", "1403_KEMIJ%c3%84RVI"]
# Directory prefix whose .xls files are converted by the call below.
download_location = "./data3/"
# Download step, currently disabled; only the conversion runs in this cell.
#download_xls_files(file_prefix=file_prefixes[2], to_location=download_location)

# Example of a complete report URL:
#https://aineistot.liikennevirasto.fi/lam/reports/LAM/20141211/110_M%c3%84NTS%c3%84L%c3%84_20141101_20141130.xls
convert_xls_to_csv(at_location=download_location)

In [35]:
# Load all CSV files into one frame and replace missing values with 0.
data = read_csv_files_to_dataframe().fillna(0)

In [37]:
# Write the combined frame to raw_dataset.csv.
export_dataframe_to_csv(data)

In [36]:
# Print the column dtypes and non-null counts of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111151 entries, 0 to 404
Data columns (total 29 columns):
location_id      111151 non-null int64
location_name    111151 non-null object
date             111151 non-null object
direction        111151 non-null int64
vehicle_type     111151 non-null object
hour_1           111151 non-null float64
hour_2           111151 non-null float64
hour_3           111151 non-null float64
hour_4           111151 non-null float64
hour_5           111151 non-null float64
hour_6           111151 non-null float64
hour_7           111151 non-null float64
hour_8           111151 non-null float64
hour_9           111151 non-null float64
hour_10          111151 non-null float64
hour_11          111151 non-null float64
hour_12          111151 non-null float64
hour_13          111151 non-null float64
hour_14          111151 non-null float64
hour_15          111151 non-null float64
hour_16          111151 non-null float64
hour_17          111151 non-null float

In [29]:
# Display 20 randomly sampled rows; no random_state is passed, so the
# selection differs from run to run.
data.sample(20)

Unnamed: 0,Mittauspiste,Sijainti,Päivä,Suuntakoodi,Ajoneuvoluokka,KLO_00-01,KLO_01-02,KLO_02-03,KLO_03-04,KLO_04-05,...,KLO_14-15,KLO_15-16,KLO_16-17,KLO_17-18,KLO_18-19,KLO_19-20,KLO_20-21,KLO_21-22,KLO_22-23,KLO_23-00
266,168,Askisto,2017/05/20,1,11 HA-PA,171.0,127.0,91.0,107.0,130.0,...,1333.0,1218.0,1137.0,1016.0,975.0,810.0,523.0,430.0,370.0,277.0
104,1403,KEMIJÄRVI,2013/12/09,1,16 HA + PK,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
338,168,Askisto,2011/05/25,1,13 Linja-autot,0.0,1.0,0.0,0.0,2.0,...,7.0,4.0,7.0,12.0,4.0,3.0,3.0,3.0,3.0,0.0
44,110,MÄNTSÄLÄ,2017/06/04,1,13 Linja-autot,1.0,0.0,1.0,0.0,0.0,...,5.0,6.0,3.0,13.0,5.0,5.0,4.0,4.0,1.0,1.0
229,1403,KEMIJÄRVI,2013/01/18,2,13 Linja-autot,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0
170,168,Askisto,2013/09/13,1,13 Linja-autot,0.0,0.0,1.0,0.0,1.0,...,6.0,8.0,6.0,8.0,4.0,2.0,1.0,1.0,4.0,3.0
59,110,MÄNTSÄLÄ,2015/08/05,1,14 KAPP,4.0,4.0,1.0,1.0,1.0,...,30.0,22.0,13.0,30.0,14.0,13.0,9.0,11.0,8.0,7.0
76,1403,KEMIJÄRVI,2013/02/06,2,14 KAPP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
46,168,Askisto,2016/05/04,1,15 KATP,9.0,11.0,16.0,16.0,7.0,...,66.0,59.0,44.0,48.0,41.0,20.0,16.0,26.0,15.0,11.0
27,168,Askisto,2013/12/02,2,17 HA + AV,0.0,0.0,0.0,0.0,0.0,...,5.0,9.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0
