In [1]:
import pandas as pd;
# PATHS
# Changed structure for more legibility of sources
#agriculture (3)
agri1 = '../finaldata/la_final_data_bundesland_agriculture_animalcount.csv'
agri2 = '../finaldata/la_final_data_landkreis_agriculture_animalcount.csv'

#buildings (2)
#bui1 = '../finaldata/ge_final_data_aggregated_landkreis_newbuildings_renewables.csv'
bui2 = '../finaldata/ge_final_data_bundesland_heating.csv'
bui3 = '../finaldata/ge_final_data_dynamic_aggregated_landkreis_newbuildings_renewables.csv'

#mobility (3)
# mobi1 = '../finaldata/mo_final_data_landkreis_mobility_cartype.csv'
mobi2 = '../finaldata/mo_final_data_bundesland_mobility_mid.csv'
mobi3 = '../finaldata/mo_final_data_landkreis_mobility_fueltype.csv'

#waste (3)
was1 = '../finaldata/ab_final_data_landkreis_waste_organicwaste.csv'

#energy (3)
en1 = '../finaldata/en_final_data_bundesland_energy_primaryconsumption.csv'
en2 = '../finaldata/en_final_data_landkreis_energy_industryenergy.csv'

#combine all paths
paths = [mobi2,mobi3,agri2,agri1,bui3,bui2,en1,en2,was1]
paths

['../finaldata/mo_final_data_bundesland_mobility_mid.csv',
 '../finaldata/mo_final_data_landkreis_mobility_fueltype.csv',
 '../finaldata/la_final_data_landkreis_agriculture_animalcount.csv',
 '../finaldata/la_final_data_bundesland_agriculture_animalcount.csv',
 '../finaldata/ge_final_data_dynamic_aggregated_landkreis_newbuildings_renewables.csv',
 '../finaldata/ge_final_data_bundesland_heating.csv',
 '../finaldata/en_final_data_bundesland_energy_primaryconsumption.csv',
 '../finaldata/en_final_data_landkreis_energy_industryenergy.csv',
 '../finaldata/ab_final_data_landkreis_waste_organicwaste.csv']

In [2]:
import json
import numpy as np
import os

#decision to split by lk, not by section, to have smaller slices of data 
#max slice of 5 section / 5 landkreise, min slice of 5 sections / 1 landkreis 
#not max slice of 5 sections / 400 landkreise, min slice of 1 section / 400 landkreise
            
#1001
    #ags
    #bundesland
    #score
    #En
        #score
        #primary_energy 
            #regional
            #data
            #benchmark
            #unit
        #industry
            #regional
            #data
            #benchmark
            #unit

#1. Create pipeline to fetch snippets of data for each location and indicator
#2. TODO: Define a hierarchy for indicators
#3. Create dynamic labels to differentiate datasets
#4. Check if regional data are available for each landkreis to store it as a parameter

sections={"mobility":"Mo","agriculture":"La","energy":"En","waste":"Ab","buildings":"Ge"}

def getDictionaryWithPrefix(df):
    #set index, condition to catch error in case AGS column is not defined
    if 'AGS' in df.columns:
        df.set_index('AGS',inplace=True)
    
    #fill nan with jsonable format
    df = df.fillna(np.nan).replace([np.nan], [None])
    
    #drop name column
    df.drop(columns=["Name"],axis=1,inplace=True)
    
    #get dynamic labels to differentiate datasets
    #get prefix
    columns = df.columns.tolist()
    #remove column Unit to avoid data mash-up (dirtydirty)
    if "Unit" in columns:
        columns.remove("Unit")
    prefix = os.path.commonprefix(columns)  
    print(prefix)
    section_eng = prefix.split('_')[0]
    section = sections[section_eng]

    #remove prefix from column names
    df.columns = df.columns.str.removeprefix(prefix)
    
    #remove section from prefix
    prefix = prefix.removeprefix(section_eng)

    #create dict from dataframe
    data_dict = df.T.to_dict('dict')
    
    return section,prefix,data_dict 


def generate_json(paths,last_year):

    landkreisData = {}

    indicators = []
    dicts = []
    indicator_sections = []
    
    #get current ags list from population data from regionalstatisik
    population = pd.read_csv("../general_data/population.csv",delimiter=";",encoding="latin1")
    population.rename({"1_Auspraegung_Code":"AGS"},axis=1,inplace=True)
    population["AGS"].replace({"DG":0},inplace=True)
    population["AGS"] = pd.to_numeric(population["AGS"])
    population["Zeit"] = population["Zeit"].str.replace(".","/",regex=True)
    population['Zeit'] =  pd.to_datetime(population['Zeit'],dayfirst=True)
    population['Jahr'] = population['Zeit'].dt.year

    #rows with no population data in some year before is changed ags
    changedAGS = population[(population.BEVSTD__Bevoelkerungsstand__Anzahl == "-") & (population.Jahr != last_year)].AGS.unique()
    currentAGS = population.AGS.unique() #only current AGS are counted in this dataset

    population["BEVSTD__Bevoelkerungsstand__Anzahl"] = population["BEVSTD__Bevoelkerungsstand__Anzahl"].replace("-",np.NaN,regex=True)
    units = []
    #pipeline to fetch snippets of data for each location and indicator
    for path in paths:
        #read dataset
        print(path)
        df = pd.read_csv(path)
        #check if ags is in list of current ags    
        #df = df[~df['AGS'].isin(currentAGS)]
        #append footnotes for lk if only new data exists
        
        #Check if unit is defined in the dataset 
        if "Unit" in df.columns.tolist():
            # If yes appends itself to array
            unitValue = df["Unit"][1]
            units.append(unitValue)
            df.drop('Unit', axis=1, inplace=True)
        else:
            # If no appends dummy to array
            units.append("not defined at source")

        # if "energy_industry_consumption_has_regional_data" in df.columns.tolist():
        #     regional_data = df["energy_industry_consumption_has_regional_data"]
        #     print(regional_data)
        #     df.drop("energy_industry_consumption_has_regional_data", axis=1, inplace=True)
        
        section,prefix,data_dict = getDictionaryWithPrefix(df)
        indicators.append(prefix)
        indicator_sections.append(section)
        dicts.append(data_dict)

    print("indicator: ",indicators)

    for ort in currentAGS:
        ort = int(ort)
        footnote = ""
        
        #handle new landkreise
        if(ort in changedAGS):
            s = population[population.AGS == ort]
            #get first datapoint for each
            first = s.BEVSTD__Bevoelkerungsstand__Anzahl.first_valid_index()
            footnote = "Diesen Landkreis gibt es erst seit " + str(population.iloc[first].Jahr)
        
        if(ort == 15003):
            print(footnote)
        
        #store bundesland of each landkreis
        if(ort < 17):
            bundesland = ort
        elif ort > 1000:
            bundesland = int(str(ort)[:-3])  #store bundesland from ags
        else:
            print("not included: ",ort)
            pass

        #create top level data for each location
        landkreisData[str(ort)] = {
            'ags':ort,
            'bundesland':bundesland,
            'score':2,
            'footnote':footnote,
            'Mo':{},
            'La':{},
            'Ge':{},
            'En':{},
            'Ab':{},
            }

        data = {}
        
        #iterate over indicators
        for i,ind in enumerate(indicators):
            ind_section = indicator_sections[i]
            if(ort in dicts[i]):
                local_data = [{"key":index ,"year":item[0][-4:],"column":item[0][:-5],"value":item[1]} for index,item in enumerate(dicts[i][ort].items())]
                #print(local_data)
                #create one entry per indicator
                #Check if regional data are available for each landkreis to store it as a parameter
                #Gets unit of measure from array of units based on i
                isRegional = True if ort > 17 else False

                #if industry data: use "has regional data column"
                if (ind == "_industry_consumption_"):
                    isRegional = dicts[i][ort]["has_regional_data"] if ort > 17 else False
                    #remove last element from local data (so "has regional data")
                    local_data = local_data[:-1]

                if (ind == "_organicwaste_kg_per_person_"):
                    isRegional = dicts[i][ort]["has_regional_data"] if ort > 17 else False
                    #remove last element from local data (so "has regional data")
                    local_data = local_data[:-1]

                data[ind] = {
                    'benchmark':200,
                    'regional': isRegional,
                    'unit': units[i],
                    'data':local_data
                }
            
            else:
                data[ind] = {
                    'regional':False,
                }

            #add data entry to location level
            #landkreisData[str(ort)][ind] = "added data"
            landkreisData[str(ort)][ind_section][ind] = data[ind]
        
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(landkreisData, f, ensure_ascii=False, indent=4)


generate_json(paths,2021)

../finaldata/mo_final_data_bundesland_mobility_mid.csv
mobility_mid_
../finaldata/mo_final_data_landkreis_mobility_fueltype.csv
mobility_fueltype_cardensity_
../finaldata/la_final_data_landkreis_agriculture_animalcount.csv
agriculture_animalcount_Anzahl_
../finaldata/la_final_data_bundesland_agriculture_animalcount.csv
agriculture_animalcount_bydensity_
../finaldata/ge_final_data_dynamic_aggregated_landkreis_newbuildings_renewables.csv
buildings_newbuilding_energy_
../finaldata/ge_final_data_bundesland_heating.csv
buildings_heating_20
../finaldata/en_final_data_bundesland_energy_primaryconsumption.csv
energy_primaryconsumption_
../finaldata/en_final_data_landkreis_energy_industryenergy.csv
energy_industry_consumption_
../finaldata/ab_final_data_landkreis_waste_organicwaste.csv
waste_organicwaste_kg_per_person_
indicator:  ['_mid_', '_fueltype_cardensity_', '_animalcount_Anzahl_', '_animalcount_bydensity_', '_newbuilding_energy_', '_heating_20', '_primaryconsumption_', '_industry_consum

In [23]:
paths = [mobi2]
for path in paths:
    #read dataset
    df = pd.read_csv(path)

    #check if ags is in list of current ags
    check = df[~df['AGS'].isin(currentAGS)]

#TODO: agri1 & bui1 & en2 have regierunsgbezirke included
    #is brunaschweig included??

check

NameError: name 'currentAGS' is not defined