In [5]:
import urllib.request
import calendar
import glob
import pandas as pd
import subprocess
from dateutil import parser

In [43]:
def get_file_urls():
    base_url = "https://aineistot.liikennevirasto.fi/lam/reports/LAM/"
    file_names = []
    for y in range(18, 19):
        for m in range(1, 13):
            dir_name = "20{:02}{:02}11/".format(y, m+1)
            first_date = parser.parse("20{:02}-{:02}-01".format(y, m))
            d = calendar.monthrange(first_date.year, first_date.month)[1]
            file_name = "168_kt50_Askisto_20{:02}{:02}01_20{:02}{:02}{:02}.xls".format(y, m, y, m, d)
            file_url = base_url + dir_name + file_name
            file_names.append((file_url, file_name))
    return file_names

In [9]:
def download_xls_files():
    for url, file_name in get_file_urls():
        try:
            urllib.request.urlretrieve(url, "./data/" + file_name)
            print(file_name + " downloaded")
        except:
            print(url + " not found")
            pass

In [18]:
def convert_xls_to_csv():
    for f in (glob.glob("./data/*.xls")):
        subprocess.call(["ssconvert", f, f[:-3] + "csv"])

In [27]:
columns = [
    'Mittauspiste', 
    'Sijainti', 
    'Päivä', 
    'Suuntakoodi', 
    'Ajoneuvoluokka',  
    'KLO_00-01', 
    'KLO_01-02',
    'KLO_02-03', 
    'KLO_03-04', 
    'KLO_04-05', 
    'KLO_05-06', 
    'KLO_06-07',
    'KLO_07-08', 
    'KLO_08-09', 
    'KLO_09-10', 
    'KLO_10-11', 
    'KLO_11-12',
    'KLO_12-13', 
    'KLO_13-14', 
    'KLO_14-15', 
    'KLO_15-16', 
    'KLO_16-17',
    'KLO_17-18', 
    'KLO_18-19', 
    'KLO_19-20', 
    'KLO_20-21', 
    'KLO_21-22',
    'KLO_22-23', 
    'KLO_23-00',
]

def read_csv_files_to_dataframe():
    dfs = []
    for f in (glob.glob("./data/*.csv")):
        df = pd.read_csv(f)
        df.columns = columns 
        dfs.append(df)
    df = pd.concat(dfs)
    return df

In [37]:
def export_dataframe_to_csv(df):
    df.to_csv("cleaned_dataset.csv")

In [28]:
data = read_csv_files_to_dataframe()

In [38]:
export_dataframe_to_csv(data)

In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40879 entries, 0 to 419
Data columns (total 29 columns):
Mittauspiste      40879 non-null int64
Sijainti          40879 non-null object
Päivä             40879 non-null object
Suuntakoodi       40879 non-null int64
Ajoneuvoluokka    40879 non-null object
KLO_00-01         32345 non-null float64
KLO_01-02         30672 non-null float64
KLO_02-03         28173 non-null float64
KLO_03-04         29243 non-null float64
KLO_04-05         30277 non-null float64
KLO_05-06         34568 non-null float64
KLO_06-07         37941 non-null float64
KLO_07-08         38806 non-null float64
KLO_08-09         39521 non-null float64
KLO_09-10         39998 non-null float64
KLO_10-11         40115 non-null float64
KLO_11-12         40167 non-null float64
KLO_12-13         40246 non-null float64
KLO_13-14         40255 non-null float64
KLO_14-15         40310 non-null float64
KLO_15-16         40256 non-null float64
KLO_16-17         40164 non-null float6

In [36]:
data.sample(10)

Unnamed: 0,Mittauspiste,Sijainti,Päivä,Suuntakoodi,Ajoneuvoluokka,KLO_00-01,KLO_01-02,KLO_02-03,KLO_03-04,KLO_04-05,...,KLO_14-15,KLO_15-16,KLO_16-17,KLO_17-18,KLO_18-19,KLO_19-20,KLO_20-21,KLO_21-22,KLO_22-23,KLO_23-00
76,168,kt50_Askisto,2018/05/06,1,17 HA + AV,,,,,,...,5.0,6.0,13.0,4.0,7.0,4.0,2.0,3.0,1.0,
286,168,Askisto,2012/09/21,1,17 HA + AV,,,1.0,,,...,13.0,10.0,8.0,4.0,2.0,3.0,4.0,4.0,2.0,
217,168,Askisto,2015/06/16,2,11 HA-PA,146.0,99.0,47.0,36.0,63.0,...,1515.0,2518.0,2999.0,2011.0,1368.0,986.0,934.0,583.0,393.0,328.0
61,168,Askisto,2010/02/05,1,16 HA + PK,,,,,,...,18.0,18.0,15.0,11.0,10.0,3.0,3.0,2.0,2.0,1.0
340,168,Askisto,2013/05/25,1,15 KATP,3.0,5.0,1.0,3.0,4.0,...,4.0,7.0,4.0,6.0,4.0,3.0,8.0,3.0,4.0,1.0
355,168,Askisto,2014/02/26,1,16 HA + PK,,,1.0,1.0,,...,19.0,19.0,18.0,20.0,17.0,2.0,3.0,5.0,1.0,2.0
134,168,Askisto,2016/01/10,2,12 KAIP,2.0,5.0,1.0,1.0,,...,6.0,9.0,7.0,6.0,12.0,10.0,4.0,2.0,10.0,6.0
323,168,Askisto,2012/01/24,1,12 KAIP,6.0,6.0,8.0,10.0,17.0,...,131.0,130.0,92.0,55.0,42.0,22.0,22.0,19.0,12.0,9.0
175,168,Askisto,2017/03/13,2,11 HA-PA,141.0,88.0,17.0,30.0,76.0,...,1463.0,2452.0,2790.0,1852.0,1180.0,848.0,673.0,453.0,290.0,210.0
175,168,Askisto,2011/06/13,2,11 HA-PA,191.0,94.0,53.0,49.0,72.0,...,1391.0,2096.0,2699.0,1962.0,1367.0,1057.0,760.0,510.0,312.0,220.0
