In [7]:
### Download industry energy consumption data for Landkreise
### from regionalstatistik.de and compile into csv with Regions as rows
### and years as columns.

In [8]:
###https://www.regionalstatistik.de/genesis/online?operation=table&code=43531-01-02-4-B&levelindex=0&levelid=1658896529187#astructure
###download data for all regions and all years as "flat"

In [9]:
#import necessary libraries

import pandas as pd
import numpy as np

In [10]:
#helper function pivot original table
def createWideFormat(df,column, values):
    #pivot table to wide format data
    orte = df["AGS"].unique()

    pivot_total = pd.DataFrame()

    for ort in orte:
            snippet = df[df["AGS"] == ort]

            temp = pd.pivot_table(snippet, index='Jahr',columns=column, values=values)
            temp = temp.reset_index(level=0)

            temp["ort_ags"] = ort
            temp["ort_name"]= str(snippet["1_Auspraegung_Label"].iloc[0]).lstrip()

            pivot_total = pd.concat([pivot_total, temp])
            
    return pivot_total

In [11]:
#load data from csv
ie_df = pd.read_csv("data/industry.csv",delimiter=";",encoding="latin1")

ie_df

Unnamed: 0,Statistik_Code,Statistik_Label,Zeit_Code,Zeit_Label,Zeit,1_Merkmal_Code,1_Merkmal_Label,1_Auspraegung_Code,1_Auspraegung_Label,2_Merkmal_Code,2_Merkmal_Label,2_Auspraegung_Code,2_Auspraegung_Label,VBR001__Energieverbrauch_(einschl._nichtenergetischem_V.)__Tsd._MJ
0,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW0,insgesamt,3747109390
1,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW1,Kohle,583625959
2,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW2,Heizöl,61319731
3,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW3,Erdgas,1168708514
4,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW4,Erneuerbare Energien,153606273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77467,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW3,Erdgas,240813
77468,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW4,Erneuerbare Energien,-
77469,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW5,Strom,430856
77470,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW6,Wärme,.


In [129]:
#rename AGS and year columns for merging with population
ie_df.rename({"1_Auspraegung_Code":"AGS","2_Auspraegung_Label":"Energietyp","Zeit":"Jahr","VBR001__Energieverbrauch_(einschl._nichtenergetischem_V.)__Tsd._MJ":"Energieverbrauch"},axis=1,inplace=True)

#set empty cells to nan and convert to numeric
ie_df["Energieverbrauch"] = ie_df["Energieverbrauch"].replace({'-':0,".":np.NaN})
ie_df["Energieverbrauch"] = pd.to_numeric(ie_df["Energieverbrauch"])

#create wide format data
pivot = createWideFormat(ie_df,"Energietyp","Energieverbrauch")

#calculate proportion of renewables
pivot["Anteil_Erneuerbar"] = pivot["Erneuerbare Energien"] / pivot["insgesamt"] * 100

#calculate secret part of data
pivot["Geheim"] = pivot['insgesamt']-pivot[['Erdgas', 'Erneuerbare Energien', 'Heizöl', 'Kohle', 'Sonstige Energieträger','Strom','Wärme']].sum(axis=1)
pivot["Geheim"] = pivot["Geheim"].apply(lambda x: x if x > 3 else 0)
pivot["Anteil_Geheim"] = pivot["Geheim"] / pivot["insgesamt"] * 100
pivot.loc[pivot['insgesamt']== 0, 'Anteil_Geheim'] = 100
pivot["Anteil_Geheim"]

#correct AGS for Deutschland
pivot['ort_ags']=pivot['ort_ags'].replace('DG',0)

pivot['ort_ags']=pd.to_numeric(pivot['ort_ags'])

pivot

Energietyp,Jahr,Erdgas,Erneuerbare Energien,Heizöl,Kohle,Sonstige Energieträger,Strom,Wärme,insgesamt,ort_ags,ort_name,Anteil_Erneuerbar,Geheim,Anteil_Geheim
0,2003,1.013851e+09,59425545.0,285107207.0,660462603.0,418927983.0,854459979.0,112208059.0,3.404443e+09,0,Deutschland,1.745529,0.0,0.000000
1,2004,1.035524e+09,84931606.0,269248048.0,838349544.0,443621664.0,878001673.0,110378173.0,3.660054e+09,0,Deutschland,2.320501,0.0,0.000000
2,2005,1.042020e+09,101389083.0,332050567.0,715638646.0,400588448.0,891448655.0,118128163.0,3.601264e+09,0,Deutschland,2.815375,0.0,0.000000
3,2006,1.038921e+09,101034069.0,315453665.0,749290655.0,602259640.0,893774767.0,153365492.0,3.854099e+09,0,Deutschland,2.621470,0.0,0.000000
4,2007,1.067395e+09,140056029.0,279773855.0,832459933.0,827447483.0,919395218.0,156783228.0,4.223310e+09,0,Deutschland,3.316262,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,2016,3.669870e+05,,43101.0,,4026.0,664679.0,11674.0,1.102775e+06,16077,"Altenburger Land, Kreis",,12308.0,1.116093
14,2017,3.626500e+05,,41589.0,,4059.0,684864.0,13311.0,1.120508e+06,16077,"Altenburger Land, Kreis",,14035.0,1.252557
15,2018,3.470580e+05,0.0,40064.0,,,661013.0,12766.0,1.079857e+06,16077,"Altenburger Land, Kreis",0.000000,18956.0,1.755418
16,2019,3.655850e+05,0.0,52241.0,,,622395.0,27840.0,1.085825e+06,16077,"Altenburger Land, Kreis",0.000000,17764.0,1.635991


In [13]:
#save test dataframe to file
#pivot.to_csv("data/test_data_landkreis_energy_industryenergy.csv")

Energietyp,Jahr,Erdgas,Erneuerbare Energien,Heizöl,Kohle,Sonstige Energieträger,Strom,Wärme,insgesamt,ort_ags,ort_name,Anteil_Erneuerbar,Geheim,Anteil_Geheim
0,2003,1.013851e+09,59425545.0,285107207.0,660462603.0,418927983.0,854459979.0,112208059.0,3.404443e+09,0,Deutschland,1.745529,0.0,0.000000
1,2004,1.035524e+09,84931606.0,269248048.0,838349544.0,443621664.0,878001673.0,110378173.0,3.660054e+09,0,Deutschland,2.320501,0.0,0.000000
2,2005,1.042020e+09,101389083.0,332050567.0,715638646.0,400588448.0,891448655.0,118128163.0,3.601264e+09,0,Deutschland,2.815375,0.0,0.000000
3,2006,1.038921e+09,101034069.0,315453665.0,749290655.0,602259640.0,893774767.0,153365492.0,3.854099e+09,0,Deutschland,2.621470,0.0,0.000000
4,2007,1.067395e+09,140056029.0,279773855.0,832459933.0,827447483.0,919395218.0,156783228.0,4.223310e+09,0,Deutschland,3.316262,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,2016,3.669870e+05,,43101.0,,4026.0,664679.0,11674.0,1.102775e+06,16077,"Altenburger Land, Kreis",,12308.0,1.116093
14,2017,3.626500e+05,,41589.0,,4059.0,684864.0,13311.0,1.120508e+06,16077,"Altenburger Land, Kreis",,14035.0,1.252557
15,2018,3.470580e+05,0.0,40064.0,,,661013.0,12766.0,1.079857e+06,16077,"Altenburger Land, Kreis",0.000000,18956.0,1.755418
16,2019,3.655850e+05,0.0,52241.0,,,622395.0,27840.0,1.085825e+06,16077,"Altenburger Land, Kreis",0.000000,17764.0,1.635991


In [131]:
#filter secret landkreise

#get landkreise with 0 in total energy in more than 5 years since 2010
recents = pivot[pivot['Jahr'] >= 2010]
recents["has_data"] = recents["insgesamt"] > 0

groups = recents.groupby(['ort_ags']).sum()
without_data = groups[groups.has_data <= 5].index

#use bundesland data for these cases
pivot['bundeslanddata_used'] = False 
pivot.loc[pivot['ort_ags'].isin(without_data), 'bundeslanddata_used'] = True

energycolumns = ['Erdgas', 'Erneuerbare Energien', 'Heizöl', 'Kohle',
       'Sonstige Energieträger', 'Strom', 'Wärme', 'insgesamt', 'Anteil_Erneuerbar']

for ort in without_data:
    if int(ort) < 1000:
        bundesland = int(str(int(ort))[2:])
    else:
        bundesland = int(str(int(ort))[-6:-3])
    pivot.loc[pivot.ort_ags == ort,energycolumns] = pivot.loc[pivot.ort_ags == bundesland,energycolumns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recents["has_data"] = recents["insgesamt"] > 0


In [134]:
#change year column to string to facilitate renaming columns after second pivot
pivot['Jahr'] = pivot['Jahr'].astype(str)

#pivot data to include years and energy types as columns
piv_df = pivot.pivot(index=["ort_ags","ort_name"],columns="Jahr",values=['Erdgas', 'Erneuerbare Energien', 'Heizöl', 'Kohle',
       'Sonstige Energieträger', 'Strom', 'Wärme','Geheim', 'insgesamt', 'Anteil_Erneuerbar',"Anteil_Geheim","bundeslanddata_used"])

#collapse levels of column names and join energy type to year
piv_df.columns = piv_df.columns.map('_'.join)

#prepend all columns with indicator name
piv_df=piv_df.add_prefix('energy_industry_consumption_')

#reset index to recreate AGS and Name columns
piv_df.reset_index(inplace=True)

#rename regional id and name and strip whitespace from Name column
piv_df.rename(columns={'ort_name':'Name','ort_ags':'AGS'},inplace=True)
piv_df['Name']=piv_df['Name'].str.strip()
piv_df['AGS']=pd.to_numeric(piv_df['AGS'])

#set AGS as index
piv_df.set_index('AGS',inplace=True)
piv_df.sort_index(inplace=True)

##tsd mjoule to tjoule (1000*10^-6=0.001)

piv_df[piv_df.columns.difference(["Name","Unit","Anteil Erneuerbar"])] = piv_df[piv_df.columns.difference(["Name","Unit","Anteil Erneuerbar"])].applymap(lambda x: x*0.001)

#unit of measure
piv_df["Unit"] = "TJoule"

piv_df

  piv_df["Unit"] = "TJoule"


Unnamed: 0_level_0,Name,energy_industry_consumption_Erdgas_2003,energy_industry_consumption_Erdgas_2004,energy_industry_consumption_Erdgas_2005,energy_industry_consumption_Erdgas_2006,energy_industry_consumption_Erdgas_2007,energy_industry_consumption_Erdgas_2008,energy_industry_consumption_Erdgas_2009,energy_industry_consumption_Erdgas_2010,energy_industry_consumption_Erdgas_2011,...,energy_industry_consumption_bundeslanddata_used_2012,energy_industry_consumption_bundeslanddata_used_2013,energy_industry_consumption_bundeslanddata_used_2014,energy_industry_consumption_bundeslanddata_used_2015,energy_industry_consumption_bundeslanddata_used_2016,energy_industry_consumption_bundeslanddata_used_2017,energy_industry_consumption_bundeslanddata_used_2018,energy_industry_consumption_bundeslanddata_used_2019,energy_industry_consumption_bundeslanddata_used_2020,Unit
AGS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Deutschland,1013851.318,1035523.684,1042020.106,1038921.147,1067394.684,1063796.264,961175.019,1083952.519,1075942.674,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TJoule
1,Schleswig-Holstein,16719.489,15952.023,16495.189,16249.404,17288.187,17596.175,17799.405,21590.156,23296.010,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TJoule
2,Hamburg,20832.851,22699.813,19694.204,22059.252,21767.977,21451.180,18353.140,21258.040,19089.141,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TJoule
3,Niedersachsen,138240.466,140664.906,149429.766,139171.323,137855.601,138242.116,127744.815,138032.562,137875.917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TJoule
4,Bremen,13797.809,13427.685,11900.574,13503.478,13513.058,13553.719,11043.752,12617.533,12022.076,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TJoule
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11008008,Berlin-Neukölln,77861.672,80629.839,83515.275,83285.920,86120.305,87298.024,77873.772,82159.648,79813.967,...,0.0,0.0,0.0,0.0,0.0,,,,,TJoule
11009009,Berlin-Treptow-Köpenick,114188.673,124835.990,123680.865,124605.899,125987.870,121952.135,113348.258,137071.491,140412.247,...,0.0,0.0,0.0,0.0,0.0,,,,,TJoule
11010010,Berlin-Marzahn-Hellersdorf,8941.282,10370.692,10676.497,11133.437,10594.507,11901.759,10657.172,12071.922,11261.072,...,0.0,0.0,0.0,0.0,0.0,,,,,TJoule
11011011,Berlin-Lichtenberg,3881.270,3679.908,3388.890,4662.937,5997.800,5741.925,4403.581,4782.369,4485.206,...,0.0,0.0,0.0,0.0,0.0,,,,,TJoule


In [135]:
#save dataframe to file
piv_df.to_csv("data/final_data_landkreis_energy_industryenergy.csv")

In [17]:
## DATASNIPPET FOR TESTING

barnim = pivot[pivot.ort_ags == "12060"]
barnim.drop(["ort_ags","ort_name","insgesamt"],axis= 1,inplace=True)

melt = pd.melt(barnim,id_vars=["Jahr"])
melt = melt.fillna(0)
melt['value']=pd.to_numeric(melt['value'])
melt = melt[melt["Energietyp"] != "Anteil_Erneuerbar"]

renewables = ["Wärme","Strom","Erneuerbare Energien"]
melt["Energiekategorie"] = np.where(melt.Energietyp.isin(renewables), 'renewable', 'fossil')
melt.to_csv("data/snippet/energy_industry_barnim.csv")
melt

barnim.to_csv("data/snippet/energy_industry_barnim_wide.csv")
melt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barnim.drop(["ort_ags","ort_name","insgesamt"],axis= 1,inplace=True)


Unnamed: 0,Jahr,Energietyp,value,Energiekategorie
0,2003,Erdgas,426279.0,fossil
1,2004,Erdgas,439554.0,fossil
2,2005,Erdgas,402347.0,fossil
3,2006,Erdgas,378409.0,fossil
4,2007,Erdgas,392469.0,fossil
...,...,...,...,...
157,2016,Geheim,15867.0,fossil
158,2017,Geheim,26720.0,fossil
159,2018,Geheim,204932.0,fossil
160,2019,Geheim,17923.0,fossil
