In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import zipfile

In [None]:
# Read the building table straight out of the zip archive.
# Both the archive and the member stream are opened in context managers so
# their file handles are released as soon as the CSV has been parsed
# (the original left the ZipFile open for the lifetime of the kernel).
with zipfile.ZipFile("/data/buildings.zip") as zf:
    with zf.open("Geb100m.csv") as csv_member:
        buildings = pd.read_csv(csv_member, delimiter=",", encoding="cp1252")

In [None]:
# Show the freshly loaded frame (notebook rich display) as a quick sanity check.
buildings

In [None]:
# Load the "Translations" sheet of the census format workbook; later cells
# read its "Original" and "Translated" columns.
translations = pd.read_excel(
    io="Data_Format_Census.xlsx",
    sheet_name="Translations",
)

In [None]:
# Show the translation table (notebook rich display) to check what was read.
translations

In [None]:
# Transform the table into a lookup dictionary: original label -> translation.
# If an original label occurs more than once, the last row wins.
translation_dict = dict(zip(translations["Original"], translations["Translated"]))

In [None]:
# Reshape the long table to one row per "Gitter_ID_100m" value. The columns
# become a two-level index (Merkmal, Auspraegung_Text) holding the "Anzahl"
# values; combinations absent from the input end up as NaN.
pivot_df = buildings.pivot(
    values="Anzahl",
    columns=["Merkmal", "Auspraegung_Text"],
    index="Gitter_ID_100m",
)

Instead of replacing NaN with 0, exclude these cells.

In [None]:
# Rename the column labels using translation_dict; labels without an entry in
# the dictionary are left unchanged.
pivot_df = pivot_df.rename(translation_dict, axis="columns")

In [None]:
# Show the pivoted frame with its renamed columns (notebook rich display).
pivot_df

In [None]:
# List the distinct labels of the first (outer) column level.
top_level_labels = pivot_df.columns.get_level_values(0)
print(top_level_labels.unique())

In [None]:
# Grand total over all rows of the ("TOTAL", "Total") column; used as the
# denominator for the overall percentages printed further down.
total = pivot_df[("TOTAL", "Total")].sum()

In [None]:
# Inspect the TOTAL and HEAT_TYPE column groups side by side.
pivot_df[["TOTAL", "HEAT_TYPE"]]

In [None]:
# For every feature group add a "<FEATURE>_DC" column: the row-wise sum of all
# category counts of that feature divided by the row's ("TOTAL", "Total")
# value. (NOTE(review): "_DC" presumably stands for data coverage - confirm.)
# NaN counts are skipped by ``sum``, so a row with no counts for a feature
# gets a ratio of 0 rather than NaN.
# The list order is the order in which the new columns are appended.
coverage_features = [
    "HEAT_TYPE",
    "APARTMENT_NO",
    "OWNERSHIP_TYPE2",
    "BUILDING_TYPE1",
    "BUILDING_TYPE2",
    "BUILDING_SIZE",
    "BUILDING_YEAR",
]

# Select the denominator once instead of re-slicing it for every feature.
total_per_cell = pivot_df["TOTAL"]["Total"]

for feature in coverage_features:
    pivot_df[f"{feature}_DC"] = pivot_df[feature].sum(axis=1) / total_per_cell

In [None]:
# Inspect the HEAT_TYPE counts next to TOTAL and the derived HEAT_TYPE_DC ratio.
pivot_df[["TOTAL", "HEAT_TYPE", "HEAT_TYPE_DC"]]

In [None]:
def _grand_total(feature):
    """Return the sum of every count stored under ``feature`` in ``pivot_df``.

    Counts are first summed per category column (NaN skipped) and the
    per-category sums are then added up.
    """
    per_category = pivot_df[feature].sum()
    return per_category.sum()


heat = _grand_total("HEAT_TYPE")
apt = _grand_total("APARTMENT_NO")
owner = _grand_total("OWNERSHIP_TYPE2")
build1 = _grand_total("BUILDING_TYPE1")
build2 = _grand_total("BUILDING_TYPE2")
build_size = _grand_total("BUILDING_SIZE")
build_year = _grand_total("BUILDING_YEAR")

In [None]:
# Print each feature's grand total as a percentage of the overall total.
overall_sums = (
    ("heat", heat),
    ("apt", apt),
    ("ownership", owner),
    ("build1", build1),
    ("build2", build2),
    ("build_size", build_size),
    ("build_yr", build_year),
)
for label, amount in overall_sums:
    print(f"{label}:", amount / total * 100)

In [None]:
def _percent_histogram(values):
    """Return ``(percent, edges)`` for a histogram of ``values``.

    Uses ``np.histogram`` with its default binning. ``percent[i]`` is the
    share (0-100) of the finite observations that fall into bin ``i``;
    ``edges`` are the bin edges (one more entry than ``percent``).

    Non-finite entries (NaN / +-inf, e.g. from dividing by a missing or zero
    total) are dropped first, because ``np.histogram`` raises ``ValueError``
    when it cannot auto-detect a finite bin range. If nothing finite is left,
    all percentages are 0 instead of the 0/0 the plain formula would give.
    """
    finite = np.asarray(values, dtype=float).ravel()
    finite = finite[np.isfinite(finite)]
    counts, edges = np.histogram(finite)
    n_obs = counts.sum()
    if n_obs == 0:
        return np.zeros(len(counts)), edges
    return counts / n_obs * 100, edges


# (ratio column, x-axis label) for each panel, left to right.
panels = [
    ("HEAT_TYPE_DC", "Heating Type"),
    ("APARTMENT_NO_DC", "Number of Apartments"),
    ("OWNERSHIP_TYPE2_DC", "Ownership"),
    ("BUILDING_TYPE1_DC", "Building Type 1"),
    ("BUILDING_TYPE2_DC", "Building Type 2"),
    ("BUILDING_SIZE_DC", "Building Size"),
    ("BUILDING_YEAR_DC", "Building Year"),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))
for ax, (column, xlabel) in zip(axes, panels):
    percent, edges = _percent_histogram(pivot_df[column])
    # Draw the pre-binned histogram: one sample per bin (its left edge),
    # weighted by that bin's percentage.
    ax.hist(edges[:-1], edges, weights=percent)
    ax.set_xlabel(xlabel)
    # Same 0-100 scale on every panel so the panels are directly comparable.
    ax.set_ylim(0, 100)

# Only the left-most panel carries the y-axis label.
axes[0].set_ylabel("Percentage of total")

fig.tight_layout()
plt.show()

In [None]:
# Write the pivoted table, including the added *_DC columns, to a
# comma-separated file in cp1252.
# NOTE(review): the file name is spelled "Buidlings"; kept unchanged because
# other code may already read the file under this name - confirm before renaming.
pivot_df.to_csv(
    path_or_buf="Buidlings100m_pivot.csv",
    sep=",",
    encoding="cp1252",
)