In [96]:
import pandas as pd;
# PATHS
#agriculture (3)
agri1 = '../agriculture/data/final_data_bundesland_agriculture_animalcount.csv'
agri2 = '../agriculture/data/final_data_landkreis_agriculture_animalcount.csv'

#buildings (2)
bui1 = '../buildings/data/final_data_landkreis_newbuildings_renewables.csv'
bui2 = '../buildings/data/final_data_bundesland_heating.csv'

#mobility (3)
mobi1 = '../mobility/data/final_data_landkreis_mobility_cartype.csv'
mobi2 = '../mobility/data/final_data_bundesland_mobility_mid.csv'

#waste (3)

#energy (3)
en1 = '../energy/data/final_data_bundesland_energy_primaryconsumption.csv'
en2 = '../energy/data/final_data_landkreis_energy_industryenergy.csv'

#combine all paths
paths = [mobi1,mobi2,agri2,agri1,bui1,bui2,en1,en2]
paths

['../mobility/data/final_data_landkreis_mobility_cartype.csv',
 '../mobility/data/final_data_bundesland_mobility_mid.csv',
 '../agriculture/data/final_data_landkreis_agriculture_animalcount.csv',
 '../agriculture/data/final_data_bundesland_agriculture_animalcount.csv',
 '../buildings/data/final_data_landkreis_newbuildings_renewables.csv',
 '../buildings/data/final_data_bundesland_heating.csv',
 '../energy/data/final_data_bundesland_energy_primaryconsumption.csv',
 '../energy/data/final_data_landkreis_energy_industryenergy.csv']

In [104]:
import json
import numpy as np
import os

#decision to split by lk, not by section, to have smaller slices of data 
#max slice of 5 section / 5 landkreise, min slice of 5 sections / 1 landkreis 
#not max slice of 5 sections / 400 landkreise, min slice of 1 section / 400 landkreise
            
#1001
    #ags
    #bundesland
    #score
    #En
        #score
        #primary_energy 
            #regional
            #data
            #benchmark
            #unit
        #industry
            #regional
            #data
            #benchmark
            #unit

#1. Create pipeline to fetch snippets of data for each location and indicator
#2. TODO: Define a hierarchy for indicators
#3. Create dynamic labels to differentiate datasets
#4. Check if regional data are available for each landkreis to store it as a parameter

sections={"mobility":"Mo","agriculture":"La","energy":"En","waste":"Ab","buildings":"Ge"}

def getDictionaryWithPrefix(df):
    #set index
    df.set_index('AGS',inplace=True)

    #fill nan with jsonable format
    df = df.fillna(np.nan).replace([np.nan], [None])
    
    #drop name column
    df.drop(columns=["Name"],axis=1,inplace=True)

    #get dynamic labels to differentiate datasets
    #get prefix
    columns = df.columns.tolist()
    prefix = os.path.commonprefix(columns)
    prefix

    #TODO: create hiearchy from prefixes
    section_eng = prefix.split('_')[0]
    section = sections[section_eng]
    print(section)

    #remove prefix from column names
    df.columns = df.columns.str.removeprefix(prefix)
    
    #remove section from prefix
    prefix = prefix.removeprefix(section_eng)

    #create dict from dataframe
    data_dict = df.T.to_dict('dict')
    
    return section,prefix,data_dict 


def generate_json(paths):

    landkreisData = {}

    indicators = []
    dicts = []
    indicator_sections = []
    
    #get current ags list from population data from regionalstatisik
    population = pd.read_csv("../general_data/population.csv",delimiter=";",encoding="latin1")
    #population = pd.read_csv("../mobility/data/car_total.csv",delimiter=";",encoding="latin1")
    population.rename({"1_Auspraegung_Code":"AGS"},axis=1,inplace=True)
    population["AGS"].replace({"DG":0},inplace=True)
    population["AGS"] = pd.to_numeric(population["AGS"])
    population["Zeit"] = population["Zeit"].str.replace(".","/",regex=True)
    population['Zeit'] =  pd.to_datetime(population['Zeit'],dayfirst=True)
    population['Jahr'] = population['Zeit'].dt.year


    ags = population.AGS.unique()

    #rows with no population data
    changedAGS = population[(population.BEVSTD__Bevoelkerungsstand__Anzahl == "-") & (population.Jahr != 2021)].AGS.unique()

    #changedAGS = [5334]
    population["BEVSTD__Bevoelkerungsstand__Anzahl"] = population["BEVSTD__Bevoelkerungsstand__Anzahl"].replace("-",np.NaN,regex=True)

    #pipeline to fetch snippets of data for each location and indicator
    for path in paths:
        #read dataset
        df = pd.read_csv(path)
        
        #check if ags is in list of current ags    
        df = df[~df['AGS'].isin(currentAGS)]
        
        #append footnotes for lk if only new data exists

        section,prefix,data_dict = getDictionaryWithPrefix(df)
        indicators.append(prefix)
        indicator_sections.append(section)
        dicts.append(data_dict)

    print("indicator: ",indicators)


    for ort in ags:

        ort = int(ort)
        footnote = ""
        
        if(ort in changedAGS):
            s = population[population.AGS == ort]
            #get first datapoint for each
            first = s.BEVSTD__Bevoelkerungsstand__Anzahl.first_valid_index()
            footnote = "Diesen Landkreis gibt es erst seit " + str(population.iloc[first].Jahr)
        
        if(ort < 17):
            bundesland = ort
        elif ort > 1000:
            bundesland = int(str(ort)[:-3])  #store bundesland from ags
        else:
            print("not included: ",ort)
            pass

        #create top level for each location
        landkreisData[str(ort)] = {
            'ags':ort,
            'bundesland':bundesland,
            'score':2,
            'footnote':footnote,
            'Mo':{},
            'La':{},
            'Ge':{},
            'En':{},
            'Ab':{},
            }

        data = {}
        
        #iterate over indicators
        for i,ind in enumerate(indicators):
            ind_section = indicator_sections[i]
            #print(ind,ind_section)

            if(ort in dicts[i]):
                
                local_data = [{"key":index ,"year":item[0][-4:],"column":item[0][:-5],"value":item[1]} for index,item in enumerate(dicts[i][ort].items())]
        
                #create one entry per indicator
                #Check if regional data are available for each landkreis to store it as a parameter
                data[ind] = {
                    'benchmark':200,
                    'regional': True if ort > 17 else False,
                    'unit': 'cars / 1000 residents',
                    'data':local_data
                }
            
            else:
                #TODO: add bundesland level to all indicators, so this is only false for landkreise
                data[ind] = {
                    'regional':False,
                }

            #add data entry to location level
            #landkreisData[str(ort)][ind] = "added data"
            landkreisData[str(ort)][ind_section][ind] = data[ind]
        

    with open('data-inprogress.json', 'w', encoding='utf-8') as f:
        json.dump(landkreisData, f, ensure_ascii=False, indent=4)


generate_json(paths)

Mo
Mo
La
La
Ge
Ge
En
En
indicator:  ['_cartype_density_', '_mid_', '_animalcount_Anzahl_', '_animalcount_bydensity_', '_newbuilding_energy_', '_heating_20', '_primaryconsumption_', '_industry_consumption_']


In [41]:
paths = [mobi2]
for path in paths:
    #read dataset
    df = pd.read_csv(path)

    #check if ags is in list of current ags
    check = df[~df['AGS'].isin(currentAGS)]

#TODO: agri1 & bui1 & en2 have regierunsgbezirke included
    #is brunaschweig included??

check

Unnamed: 0,AGS,Name,mobility_mid_percentage_wegkm_Fahrrad_2002,mobility_mid_percentage_wegkm_Fahrrad_2008,mobility_mid_percentage_wegkm_Fahrrad_2017,mobility_mid_percentage_wegkm_Fahrrad_und_ÖV_2002,mobility_mid_percentage_wegkm_Fahrrad_und_ÖV_2008,mobility_mid_percentage_wegkm_Fahrrad_und_ÖV_2017,mobility_mid_percentage_wegkm_MIV_(Fahrer)_und_ÖV_2002,mobility_mid_percentage_wegkm_MIV_(Fahrer)_und_ÖV_2008,...,mobility_mid_wegkm_nur_MIV_(Mitfahrer)_2017,mobility_mid_wegkm_nur_ÖV_2002,mobility_mid_wegkm_nur_ÖV_2008,mobility_mid_wegkm_nur_ÖV_2017,mobility_mid_wegkm_zu_Fuß_2002,mobility_mid_wegkm_zu_Fuß_2008,mobility_mid_wegkm_zu_Fuß_2017,mobility_mid_sum_2002,mobility_mid_sum_2008,mobility_mid_sum_2017
