In [1]:
### Download industry energy consumption data for Landkreise
### from regionalstatistik.de and compile into csv with Regions as rows
### and years as columns.

In [2]:
###https://www.regionalstatistik.de/genesis/online?operation=table&code=43531-01-02-4-B&levelindex=0&levelid=1658896529187#astructure
###download data for all regions and all years as "flat"

In [3]:
#import necessary libraries

import pandas as pd
import numpy as np

In [4]:
#helper function pivot original table
def createWideFormat(df,column, values):
    #pivot table to wide format data
    orte = df["AGS"].unique()

    pivot_total = pd.DataFrame()

    for ort in orte:
            snippet = df[df["AGS"] == ort]

            temp = pd.pivot_table(snippet, index='Jahr',columns=column, values=values)
            temp = temp.reset_index(level=0)

            temp["ort_ags"] = ort
            temp["ort_name"]= str(snippet["1_Auspraegung_Label"].iloc[0]).lstrip()

            pivot_total = pd.concat([pivot_total, temp])
            
    return pivot_total

In [5]:
#load data from csv
ie_df = pd.read_csv("data/industry.csv",delimiter=";",encoding="latin1")

ie_df

Unnamed: 0,Statistik_Code,Statistik_Label,Zeit_Code,Zeit_Label,Zeit,1_Merkmal_Code,1_Merkmal_Label,1_Auspraegung_Code,1_Auspraegung_Label,2_Merkmal_Code,2_Merkmal_Label,2_Auspraegung_Code,2_Auspraegung_Label,VBR001__Energieverbrauch_(einschl._nichtenergetischem_V.)__Tsd._MJ
0,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW0,insgesamt,3747109390
1,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW1,Kohle,583625959
2,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW2,Heizöl,61319731
3,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW3,Erdgas,1168708514
4,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2020,KREISE,Kreise und kreisfreie Städte,DG,Deutschland,ENRNW1,Energieträger,ENRGTRNW4,Erneuerbare Energien,153606273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77467,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW3,Erdgas,240813
77468,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW4,Erneuerbare Energien,-
77469,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW5,Strom,430856
77470,43531,Jahreserhebung ü. die Energieverwendung der Be...,JAHR,Jahr,2003,KREISE,Kreise und kreisfreie Städte,16077,"Altenburger Land, Kreis",ENRNW1,Energieträger,ENRGTRNW6,Wärme,.


In [6]:
#rename AGS and year columns for merging with population
ie_df.rename({"1_Auspraegung_Code":"AGS","2_Auspraegung_Label":"Energietyp","Zeit":"Jahr","VBR001__Energieverbrauch_(einschl._nichtenergetischem_V.)__Tsd._MJ":"Energieverbrauch"},axis=1,inplace=True)

#set empty cells to nan and convert to numeric
ie_df["Energieverbrauch"] = ie_df["Energieverbrauch"].replace({'-':0,".":np.NaN})
ie_df["Energieverbrauch"] = pd.to_numeric(ie_df["Energieverbrauch"])

#create wide format data
pivot = createWideFormat(ie_df,"Energietyp","Energieverbrauch")

#calculate proportion of renewables
pivot["Anteil_Erneuerbar"] = pivot["Erneuerbare Energien"] / pivot["insgesamt"] * 100

#calculate secret part of data
pivot["Geheim"] = pivot['insgesamt']-pivot[['Erdgas', 'Erneuerbare Energien', 'Heizöl', 'Kohle', 'Sonstige Energieträger','Strom','Wärme']].sum(axis=1)
pivot["Geheim"] = pivot["Geheim"].apply(lambda x: x if x > 3 else 0)
pivot["Anteil_Geheim"] = pivot["Geheim"] / pivot["insgesamt"] * 100
pivot.loc[pivot['insgesamt']== 0, 'Anteil_Geheim'] = 100
pivot.loc[pivot['insgesamt'].isnull(), 'Anteil_Geheim'] = 100
pivot["Anteil_Geheim"]

#correct AGS for Deutschland
pivot['ort_ags']=pivot['ort_ags'].replace('DG',0)

pivot['ort_ags']=pd.to_numeric(pivot['ort_ags'])

pivot[pivot.ort_ags == 3103]

Energietyp,Jahr,Erdgas,Erneuerbare Energien,Heizöl,Kohle,Sonstige Energieträger,Strom,Wärme,insgesamt,ort_ags,ort_name,Anteil_Erneuerbar,Geheim,Anteil_Geheim
0,2003,,0.0,,0.0,,,23518.0,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
1,2004,,0.0,33081.0,0.0,,,,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
2,2005,,0.0,13857.0,0.0,,,436693.0,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
3,2006,,0.0,33558.0,,,,434583.0,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
4,2007,,0.0,,,,,,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
5,2008,,0.0,,,,,,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
6,2009,,0.0,16917.0,,,,433217.0,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
7,2010,,0.0,,,,,33980.0,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
8,2011,,0.0,,,,,30752.0,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0
9,2012,,0.0,13880.0,,,,,,3103,"Wolfsburg, kreisfreie Stadt",,0.0,100.0


In [7]:
#save test dataframe to file
#pivot.to_csv("data/test_data_landkreis_energy_industryenergy.csv")

In [8]:
#filter secret landkreise

#get landkreise with 0 in total energy in more than 6 years since 2010
recents = pivot[pivot['Jahr'] >= 2010]
recents["has_data"] = recents["insgesamt"] > 0

groups = recents.groupby(['ort_ags']).sum()
without_data = groups[groups.has_data <= 4].index

without_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recents["has_data"] = recents["insgesamt"] > 0
  groups = recents.groupby(['ort_ags']).sum()


Int64Index([     141,      142,      143,      151,      152,      153,
                1061,     3102,     3103,     3157,
            ...
            11003003, 11004004, 11005005, 11006006, 11007007, 11008008,
            11009009, 11010010, 11011011, 11012012],
           dtype='int64', name='ort_ags', length=125)

In [9]:
#change year column to string to facilitate renaming columns after second pivot
pivot['Jahr'] = pivot['Jahr'].astype(str)

#pivot data to include years and energy types as columns
piv_df = pivot.pivot(index=["ort_ags","ort_name"],columns="Jahr",values=['Erdgas', 'Erneuerbare Energien', 'Heizöl', 'Kohle',
       'Sonstige Energieträger', 'Strom', 'Wärme','Geheim', 'insgesamt', 'Anteil_Erneuerbar',"Anteil_Geheim"])

#collapse levels of column names and join energy type to year
piv_df.columns = piv_df.columns.map('_'.join)

#prepend all columns with indicator name
piv_df=piv_df.add_prefix('energy_industry_consumption_')

#reset index to recreate AGS and Name columns
piv_df.reset_index(inplace=True)

#rename regional id and name and strip whitespace from Name column
piv_df.rename(columns={'ort_name':'Name','ort_ags':'AGS'},inplace=True)
piv_df['Name']=piv_df['Name'].str.strip()
piv_df['AGS']=pd.to_numeric(piv_df['AGS'])

#store if AGS has only secret data
piv_df['energy_industry_consumption_has_regional_data'] = True 
piv_df.loc[piv_df['AGS'].isin(without_data), 'energy_industry_consumption_has_regional_data'] = False

#set AGS as index
piv_df.set_index('AGS',inplace=True)
piv_df.sort_index(inplace=True)

##tsd mjoule to tjoule (1000*10^-6=0.001)
cols = [col for col in piv_df.columns if not "Anteil" in col]
mask = piv_df[cols].columns.difference(["Name","Unit",'energy_industry_consumption_has_regional_data'])
#piv_df[mask] = piv_df[mask].applymap(lambda x: x*0.001)

#unit of measure
piv_df["Unit"] = "GJoule"

piv_df

Unnamed: 0_level_0,Name,energy_industry_consumption_Erdgas_2003,energy_industry_consumption_Erdgas_2004,energy_industry_consumption_Erdgas_2005,energy_industry_consumption_Erdgas_2006,energy_industry_consumption_Erdgas_2007,energy_industry_consumption_Erdgas_2008,energy_industry_consumption_Erdgas_2009,energy_industry_consumption_Erdgas_2010,energy_industry_consumption_Erdgas_2011,...,energy_industry_consumption_Anteil_Geheim_2013,energy_industry_consumption_Anteil_Geheim_2014,energy_industry_consumption_Anteil_Geheim_2015,energy_industry_consumption_Anteil_Geheim_2016,energy_industry_consumption_Anteil_Geheim_2017,energy_industry_consumption_Anteil_Geheim_2018,energy_industry_consumption_Anteil_Geheim_2019,energy_industry_consumption_Anteil_Geheim_2020,energy_industry_consumption_has_regional_data,Unit
AGS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Deutschland,1.013851e+09,1.035524e+09,1.042020e+09,1.038921e+09,1.067395e+09,1.063796e+09,961175019.0,1.083953e+09,1.075943e+09,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True,GJoule
1,Schleswig-Holstein,1.671949e+07,1.595202e+07,1.649519e+07,1.624940e+07,1.728819e+07,1.759618e+07,17799405.0,2.159016e+07,2.329601e+07,...,18.313509,8.378238,10.422590,6.825056,0.000000,18.834137,18.773342,22.721961,True,GJoule
2,Hamburg,2.083285e+07,2.269981e+07,1.969420e+07,2.205925e+07,2.176798e+07,2.145118e+07,18353140.0,2.125804e+07,1.908914e+07,...,1.117061,7.196770,7.154508,6.728502,32.699227,2.399027,32.885934,32.488468,True,GJoule
3,Niedersachsen,1.382405e+08,1.406649e+08,1.494298e+08,1.391713e+08,1.378556e+08,1.382421e+08,127744815.0,1.380326e+08,1.378759e+08,...,31.274519,33.558165,31.015897,35.914885,34.860399,37.948640,37.972722,36.575043,True,GJoule
4,Bremen,1.379781e+07,1.342768e+07,1.190057e+07,1.350348e+07,1.351306e+07,1.355372e+07,11043752.0,1.261753e+07,1.202208e+07,...,0.000000,74.365769,76.768027,76.962388,73.927717,76.230458,75.283260,75.190047,True,GJoule
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11008008,Berlin-Neukölln,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,...,100.000000,100.000000,100.000000,100.000000,,,,,False,GJoule
11009009,Berlin-Treptow-Köpenick,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,...,100.000000,100.000000,100.000000,100.000000,,,,,False,GJoule
11010010,Berlin-Marzahn-Hellersdorf,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,...,100.000000,100.000000,100.000000,100.000000,,,,,False,GJoule
11011011,Berlin-Lichtenberg,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,...,100.000000,100.000000,100.000000,100.000000,,,,,False,GJoule


In [13]:
#save dataframe to file
piv_df.to_csv("data/final_data_landkreis_energy_industryenergy.csv")

In [12]:
## DATASNIPPET FOR TESTING

barnim = pivot[pivot.ort_ags == 10045]
barnim.drop(["ort_ags","ort_name","insgesamt"],axis= 1,inplace=True)

melt = pd.melt(barnim,id_vars=["Jahr"])
melt = melt.fillna(0)
melt['value']=pd.to_numeric(melt['value'])
melt = melt[melt["Energietyp"] != "Anteil_Erneuerbar"]

renewables = ["Wärme","Strom","Erneuerbare Energien"]
melt["Energiekategorie"] = np.where(melt.Energietyp.isin(renewables), 'renewable', 'fossil')
melt.to_csv("data/snippet/energy_industry_barnim.csv")
melt

barnim.to_csv("data/snippet/energy_industry_barnim_wide.csv")
melt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barnim.drop(["ort_ags","ort_name","insgesamt"],axis= 1,inplace=True)


Unnamed: 0,Jahr,Energietyp,value,Energiekategorie
0,2003,Erdgas,2.104987e+06,fossil
1,2004,Erdgas,2.125962e+06,fossil
2,2005,Erdgas,2.205134e+06,fossil
3,2006,Erdgas,2.260986e+06,fossil
4,2007,Erdgas,2.217532e+06,fossil
...,...,...,...,...
115,2010,Anteil_Geheim,6.419762e+00,fossil
116,2011,Anteil_Geheim,7.445819e+00,fossil
117,2018,Anteil_Geheim,6.593957e+00,fossil
118,2019,Anteil_Geheim,0.000000e+00,fossil
