# NEAK loader notebook application, v2

In [1]:
import pyodbc
import pandas as pd
import numpy as np
import os
import sqlite3
from datetime import datetime, timezone

In [2]:
def extractPeriod(filename):
    """Return all numeric characters of *filename*, joined in their original order.

    For a file named after its period this is the ``YYYYMM`` string. Any other
    digit that appears in the name ends up in the result as well.
    """
    return "".join(ch for ch in filename if ch.isnumeric())

In [3]:
def openListOfFiles():
    """Return the raw content of ``files_list.csv`` as a single string.

    The file holds the names of the already processed files, each followed
    by ``;`` (the format appended by ``writeFilesList``).

    Returns:
        The whole file content, or an empty string when the file does not
        exist yet, i.e. nothing has been processed so far.
    """
    try:
        # The context manager closes the handle even if read() fails.
        with open("files_list.csv", "r") as fileListText:
            fileList = fileListText.read()
    except FileNotFoundError:
        # First run: the list is only created by the first append.
        print("\tNo list of processed files found, starting with an empty one")
        return ""
    print("\tThe list of processed files is loaded")
    return fileList

In [4]:
def writeFilesList(nowProcessed):
    """Append the names of the files processed in this run to ``files_list.csv``.

    Args:
        nowProcessed: iterable of file name strings. Each name is written
            followed by ``;``; nothing is written for an empty iterable.
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open("files_list.csv", 'a') as fileListText:
        fileListText.write("".join(step + ";" for step in nowProcessed))
    print("\tThe nowProcessed filename list writed to file.")

In [5]:
def dirReading(fileList):
    """Return the ``.mdb`` files of the current directory that are not processed yet.

    Args:
        fileList: the already processed file names, either as one
            ``;``-separated string (the format returned by
            ``openListOfFiles``) or as any iterable of names.

    Returns:
        List of file names (without path) that end in ``.mdb`` and are not
        listed in *fileList*, in directory-scan order.
    """
    if isinstance(fileList, str):
        # Split into whole names: a plain substring test on the raw string
        # would treat "1.mdb" as processed as soon as "201.mdb" is listed.
        # Line breaks are accepted as separators too.
        names = fileList.replace("\n", ";").split(";")
    else:
        names = fileList
    processed = {name.strip() for name in names if name.strip()}

    # scandir holds an OS directory handle; the with-block releases it.
    with os.scandir('.') as entries:
        toDoList = [
            entry.name
            for entry in entries
            if entry.is_file()
            and entry.name.endswith(".mdb")
            and entry.name not in processed
        ]
    print("\tDirectory reading ready", toDoList)
    return toDoList

In [6]:
def mdbReader(path, filename, q):
    """Read one MS Access file into a DataFrame.

    Args:
        path: directory that contains the file.
        filename: name of the Access file; its numeric characters are taken
            as the period (see ``extractPeriod``).
        q: base SQL statement. For periods before 200801 the suffix
            ``_<period>`` is appended to it; otherwise it is used as is.

    Returns:
        The DataFrame produced by ``pd.read_sql`` for the resulting query.

    Raises:
        ValueError: if *filename* contains no numeric character, because the
            period cannot be converted to an integer.
    """
    # Windows-style separator, matching the Access ODBC driver used below.
    file = path+"\\"+filename
    period = extractPeriod(filename)
    if int(period) < 200801:
        query = q + "_" + period
    else:
        query = q
    print(period, "query:", query)
    conn = pyodbc.connect(
        r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+file)
    try:
        dfr = pd.read_sql(query, conn)
    finally:
        # Release the ODBC connection even when the query fails.
        conn.close()
    print("\tMS-Access file reading OK")
    return dfr

In [7]:
def mdbMetaData(filename, startTime):
    """Collect the journal fields that describe *filename*.

    Args:
        filename: file to inspect with ``os.stat``; its numeric characters
            give the period (see ``extractPeriod``).
        startTime: value stored unchanged as the first field.

    Returns:
        A list ``[startTime, filename, year, period, size, created]`` where
        *year* is the first four characters of the period, *size* is
        ``st_size`` and *created* is ``st_ctime`` rendered in UTC as
        ``"%Y/%m/%d, %H:%M:%S"``.
    """
    fileStat = os.stat(filename)
    periodText = extractPeriod(filename)
    ctimeUtc = datetime.fromtimestamp(fileStat.st_ctime, tz=timezone.utc)
    return [
        startTime,
        filename,
        periodText[:4],
        periodText,
        fileStat.st_size,
        ctimeUtc.strftime("%Y/%m/%d, %H:%M:%S"),
    ]

In [8]:
# Source: https://www.ksh.hu
# Methodology: KSH methodological guide (PDF)
# ("For the sake of presentability, some charts use the abbreviations
# below instead of the county names: ...")

# Maps a county identifier to its two-letter abbreviation. Keys are either
# numeric codes or county names, so both forms can go through one lookup.
megyedict = {
    # numeric codes (20 and 21 both map to "BP")
    1: "BA",
    2: "BÁ",
    3: "BÉ",
    4: "BO",
    5: "CS",
    6: "FE",
    7: "GY",
    8: "HA",
    9: "HE",
    10: "KO",
    11: "NÓ",
    12: "PE",
    13: "SO",
    14: "SZ",
    15: "JÁ",
    16: "TO",
    17: "VA",
    18: "VE",
    19: "ZA",
    20: "BP",
    21: "BP",
    # county names
    "Bács-Kiskun": "BÁ",
    "Baranya": "BA",
    "Békés": "BÉ",
    "Borsod-Abaúj-Zemplén": "BO",
    "Budapest": "BP",
    "Csongrád-Csanád": "CS",
    "Csongrád": "CS",
    "Fejér": "FE",
    # presumably a truncated spelling that occurs in the source data - TODO confirm
    "Fejé": "FE",
    "Győr-Moson-Sopron": "GY",
    "Hajdú-Bihar": "HA",
    "Heves": "HE",
    "Jász-Nagykun-Szolnok": "JÁ",
    "Komárom-Esztergom": "KO",
    "Nógrád": "NÓ",
    "Pest": "PE",
    "Somogy": "SO",
    "Szabolcs-Szatmár-Bereg": "SZ",
    "Tolna": "TO",
    "Vas": "VA",
    "Veszprém": "VE",
    "Zala": "ZA",
}

In [9]:
def mdbDecorator(mdb, step):
    """Rename and pad the columns of *mdb* according to the period of *step*.

    The period is the integer formed by the numeric characters of *step*
    (see ``extractPeriod``). Four source layouts are handled:

    * before 200801: 13 columns are renamed and the columns absent from the
      source (brand, brandid, hatoanyag, venytipus, dotforg) are inserted
      as NaN;
    * 200801: as above, but the county arrives as ``Megyekód`` in second
      position and is moved into a ``megye`` column;
    * 200802 up to 200812: a NaN ``venytipus`` column is inserted at
      position 10 and all column names are lower-cased;
    * 200901 and later: the 18 columns are only renamed.

    The frame is modified in place and also returned.
    """
    period = int(extractPeriod(step))

    if period < 200801:
        mdb.columns = ['idoszak', 'TTT', 'nev', 'kisznev', 'tk', 'atc', 'jogcim', 'megye', 'doboz', 'tbtam', 'fogyar', 'terdij', 'kvater']
        # Positions refer to the frame as it grows, so the order matters.
        for position, name in ((1, 'brand'), (2, 'brandid'), (8, 'hatoanyag'),
                               (9, 'venytipus'), (13, 'dotforg')):
            mdb.insert(position, name, np.nan)
        return mdb

    if period < 200802:
        mdb.columns = ['idoszak', 'Megyekód', 'TTT', 'nev', 'kisznev', 'tk', 'atc', 'jogcim', 'doboz', 'tbtam', 'fogyar', 'terdij', 'kvater']
        for position, name in ((1, 'brand'), (2, 'brandid'), (9, 'hatoanyag'),
                               (10, 'venytipus'), (12, 'megye')):
            mdb.insert(position, name, np.nan)
        # Move the county code into its final column, then drop the original.
        mdb['megye'] = mdb['Megyekód']
        del mdb['Megyekód']
        # Inserted only now: position 13 is counted after the removal above.
        mdb.insert(13, 'dotforg', np.nan)
        return mdb

    if period < 200901:
        mdb.insert(10, 'venytipus', np.nan)
        mdb.columns = mdb.columns.str.lower()
        return mdb

    mdb.columns = ['idoszak', 'brand', 'brandid', 'TTT', 'nev', 'kisznev', 'tk', 'atc', 'hatoanyag','venytipus', 'jogcim', 'megye', 'doboz','dotforg', 'tbtam', 'fogyar', 'terdij', 'kvater']
    return mdb

In [10]:
def priceClassifier(fogyar, doboz):
    """Return the price-class label for the unit price ``|fogyar / doboz|``.

    Args:
        fogyar: total price of the row.
        doboz: number of boxes; when it is 0 the unit price is taken as 0.

    Returns:
        The label of the first class whose upper bound is greater than the
        unit price; ``"L:500 ezer Ft felett"`` when no bound is.
    """
    # (exclusive upper bound, label), in ascending order.
    # Class H was missing, so prices from 40 000 up to 90 000 used to be
    # labelled "I:90-150 ezer Ft".
    priceClasses = (
        (1000, "A:1000 Ft"),
        (2000, "B:1-2 ezer Ft"),
        (4000, "C:2-4 ezer Ft"),
        (6000, "D:4-6 ezer Ft"),
        (10000, "E:6-10 ezer Ft"),
        (20000, "F:10-20 ezer Ft"),
        (40000, "G:20-40 ezer Ft"),
        (90000, "H:40-90 ezer Ft"),
        (150000, "I:90-150 ezer Ft"),
        (250000, "J:150-250 ezer Ft"),
        (500000, "K:250-500 ezer Ft"),
    )

    if doboz == 0:
        price = 0
    else:
        price = abs(fogyar/doboz)

    for upperBound, label in priceClasses:
        if price < upperBound:
            return label
    # NOTE: a NaN price compares False against every bound and lands here too.
    return "L:500 ezer Ft felett"

In [11]:
def copaymentClassifier(terdij, doboz):
    """Return the co-payment class label for ``|terdij / doboz|``.

    Args:
        terdij: total co-payment of the row.
        doboz: number of boxes; when it is 0 the unit co-payment is taken as 0.

    Returns:
        The label of the first class whose upper bound is greater than the
        unit co-payment; ``"11:1000000 Ft felett"`` when no bound is.
    """
    # (exclusive upper bound, label), in ascending order.
    copaymentClasses = (
        (500, "1:500 Ft alatt"),
        (1000, "2:500-1000 Ft"),
        (2000, "3:1000-2000 Ft"),
        (3000, "4:2000-3000 Ft"),
        (5000, "5:3000-5000 Ft"),
        (10000, "6:5000-10000 Ft"),
        (25000, "7:10000-25000 Ft"),
        (75000, "8:25000-75000 Ft"),
        (150000, "9:75000-150000 Ft"),
        (1000000, "10:150000 - 1000000 Ft"),
    )

    unitCopayment = 0 if doboz == 0 else abs(terdij/doboz)

    for upperBound, label in copaymentClasses:
        if unitCopayment < upperBound:
            return label
    return "11:1000000 Ft felett"

In [12]:
# declarations:

# Base SQL statement handed to mdbReader, which appends "_<period>" to it
# for files whose period is before 200801.
query = "SELECT * FROM Megyei_forgalom"
# Folder of the source Access files; passed to mdbReader as its path.
directory = r"C:\neak_source"
# Both lists are re-initialised at the start of the main loader cell.
journal = []
nowProcessed = []

In [None]:
# main loader process
#
# For every .mdb file that is not in the processed list: read it, normalise
# its columns, map the county to its abbreviation, add the price and
# co-payment classes, append the rows to the yearly SQLite table
# ("whole<year>") and collect one journal row about the file.
#
# NOTE(review): dirReading and mdbMetaData work on the current directory,
# while mdbReader reads from `directory` - presumably the notebook is run
# from that folder; confirm.

journal = []
nowProcessed = []
alreadyProcessed = openListOfFiles()
toDoList = dirReading(alreadyProcessed)
startTime = datetime.now().strftime("%y-%m-%d_%H-%M-%S")
journalColumns = ['startTime', 'filename', 'year', 'period', 'size', 'created', 'sourceRecordNumber', 'dfSumBoxMill', 'dfSumPriceBill' ]

db = sqlite3.connect('neak2.sqlite3')
try:
    for step in toDoList:
        mdb = mdbReader(directory, step, query)
        entry = mdbMetaData(step, startTime)
        entry.append(len(mdb))
        mdb = mdbDecorator(mdb, step)
        mdb['megye'] = mdb['megye'].map(megyedict)
        # Totals are journalled in millions of boxes / billions of price units.
        entry.append(mdb['doboz'].sum()/1000000)
        entry.append(mdb['fogyar'].sum()/1000000000)
        mdb['priceClass'] = mdb.apply(lambda row: priceClassifier(row['fogyar'], row['doboz']), axis=1)
        mdb['copaymClass'] = mdb.apply(lambda row: copaymentClassifier(row['terdij'], row['doboz']), axis=1)
        year = extractPeriod(step)[:4]
        mdb.to_sql(("whole"+year), db, if_exists = 'append')
        # Only reached once the rows are stored, so a file that failed
        # half-way is retried on the next run.
        nowProcessed.append(step)
        journal.append(entry)
finally:
    db.close()
    # Record what was loaded even if a later file failed: the rows are
    # appended to the tables, so re-processing a file would duplicate them.
    writeFilesList(nowProcessed)
    dfprocessed = pd.DataFrame(journal, columns=journalColumns)
    if nowProcessed:
        # The log is named after the last processed file. With nothing
        # processed there is no such file, so no log is written.
        period = extractPeriod(nowProcessed[-1])
        STR = 'already_processed_log_'+period+'.csv'
        dfprocessed.to_csv(STR, sep=";", encoding='utf-8', index=False)
print("ready")