###### Import packages and load the data tables (CSV and pickle)

In [None]:
import pandas as pd
import altair as alt
from altair import *
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from vega_datasets import data

sns.set_style("white")
%matplotlib inline

In [None]:
# Load the three source tables from the working directory.
# NOTE: unpickling can execute arbitrary code, so only read pickle files
# that were produced by a trusted process.
df = pd.read_csv("expedia_data_clean.csv", index_col=0)  # first CSV column becomes the index
df_air = pd.read_pickle("2017_airbnb_api_data_clean.pkl")
df_bp = pd.read_pickle("Booking_price_comparison.pkl")

In [None]:
# Transformation of df: replace the value "Hessenwinkel" with "Treptow-Köpenick".
# The match is on whole cell values and applies to every column of df.
df = df.replace(to_replace="Hessenwinkel", value="Treptow-Köpenick")

# Give both hotel tables short column labels (assigned by position, so the
# number of labels must equal the number of columns in each table).
df_bp.columns = ["name", "price", "street", "area", "zip"]
df.columns = ["name", "area", "price"]

df.head(n=2)

In [None]:
# Preview the first two rows of df_bp after the column rename.
df_bp.head(n=2)

In [None]:
# Preview the first two rows of df_air to compare its columns with the hotel tables.
df_air.iloc[:2]

In [None]:
# The per-area comparison does not use the address details, so remove
# "street" and "zip"; "name", "price" and "area" remain.
df_bp = df_bp.drop(columns=["street", "zip"])

##### Merge all the tables 

In [None]:
# Stack the two hotel tables (df first, then df_bp) into one frame with a fresh
# 0..n-1 index and alphabetically sorted columns.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with the same ignore_index/sort arguments produces the same result.
merged = pd.concat([df, df_bp], ignore_index=True, sort=True)

In [None]:
# Tag every row of the combined hotel table with the constant source label "hotel".
merged = merged.assign(source="hotel")
merged.head(n=2)

In [None]:
# Show the first two rows of the combined hotel table.
merged.iloc[:2]

In [None]:
# Stack the hotel rows and the df_air rows into one frame with a fresh 0..n-1
# index and alphabetically sorted columns.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with the same ignore_index/sort arguments produces the same result.
# NOTE(review): df_air presumably carries its own "source" value already,
# since none is assigned here — confirm.
merged_all = pd.concat([merged, df_air], ignore_index=True, sort=True)

In [None]:
# Preview the combined table. A bare `merged_all` renders the whole frame,
# so only the first rows are shown here.
merged_all.head()

##### Transformations

In [None]:
# List the distinct area labels present in the combined table.
merged_all["area"].unique()

In [None]:
# get rid of the € signs in price
#merged_data_full.price.replace(regex=True,inplace=True,to_replace=r'\D',value=r'')

In [None]:
# Count how often each listing name occurs; any count above 1 is a repeated
# name (24 duplicates were noted when this was written).
merged_all.loc[:, "name"].value_counts()

In [None]:
# Drop duplicate listings: keep the first row for every name and remove the
# later rows that repeat it.
# Assigning `merged_all["name"].drop_duplicates()` back to the column does not
# remove anything: index alignment only turns the repeated names into NaN, and
# those rows still count in every later groupby.
# Rows whose name is missing are kept, because a missing name is not evidence
# of a duplicate.
# NOTE(review): this assumes a name identifies one listing — confirm this also
# holds for the Airbnb rows.
is_repeat = merged_all["name"].duplicated(keep="first") & merged_all["name"].notna()
merged_all = merged_all.loc[~is_repeat].reset_index(drop=True)

In [None]:
# Re-check the name frequencies after the duplicate handling (missing names are not counted).
merged_all["name"].value_counts(dropna=True)

##### Inspect Data Types

In [None]:
# Show the dtype of every column in the combined table.
merged_all.dtypes

In [None]:
# Distinct area labels before the whitespace and spelling clean-up below.
merged_all["area"].unique()

In [None]:
# Strip leading whitespace from every area label.
merged_all["area"] = merged_all["area"].str.lstrip()

In [None]:
# Normalise the area labels with a single lookup table:
#   * labels that do not name an area ("Berlin", "80-82") become "unknown";
#   * alternative spellings are folded into one canonical label per area.
# Only whole cell values are matched, so the "Berlin" entry does not touch
# "10787 Berlin". No key is also a value, so one pass gives the same result
# as applying the replacements one after another.
area_fixes = {
    "Berlin": "unknown",
    "80-82": "unknown",
    "10787 Berlin": "Tempelhof-Schöneberg",
    "Charlottenburg-Wilm.": "Charlottenburg-Wilmersdorf",
    "Marzahn - Hellersdorf": "Marzahn-Hellersdorf",
    "Steglitz - Zehlendorf": "Steglitz-Zehlendorf",
    "Tempelhof - Schöneberg": "Tempelhof-Schöneberg",
    "Treptow - Köpenick": "Treptow-Köpenick",
    "Prenzlauer Berg": "Pankow",
}
merged_all["area"] = merged_all["area"].replace(area_fixes)

In [None]:
# Distinct area labels after the clean-up, to verify the replacements.
merged_all["area"].unique()

In [None]:
# Store price as a floating-point number (raises if a value cannot be converted).
merged_all = merged_all.astype({"price": "float"})

In [None]:
# Preview the cleaned table. A bare `merged_all` renders the whole frame,
# so only the first rows are shown here.
merged_all.head()

##### Descriptive Statistics

In [None]:
# Price comparison between hotels and Airbnb: median price per area, one bar
# per source.
# "price" is selected explicitly. A bare groupby(...).median() aggregates every
# column and raises a TypeError on non-numeric columns in pandas >= 2.0 (older
# versions silently dropped them).
median_price = merged_all.groupby(["area", "source"])["price"].median().unstack()

fig, ax = plt.subplots(figsize=(10, 6))
price_chart = median_price.plot(kind="bar", ax=ax)
ax.legend()
ax.set_xlabel("Area", size=15)
ax.set_ylabel(" Median Price in €", size=15)
ax.set_title("Hotel and AirBnB Prices/Night per Area in Berlin", size=20)
plt.xticks(rotation=50);  # trailing ';' hides the tick-object repr

In [None]:
# Same medians as a stacked bar chart, which makes the price difference
# between the sources easy to see.
# "price" is selected explicitly. A bare groupby(...).median() aggregates every
# column and raises a TypeError on non-numeric columns in pandas >= 2.0 (older
# versions silently dropped them).
median_price = merged_all.groupby(["area", "source"])["price"].median().unstack()

fig, ax = plt.subplots(figsize=(10, 6))
median_price.plot.bar(stacked=True, ax=ax)
ax.legend()
ax.set_xlabel("Area", size=15)
ax.set_ylabel("Median Price in €", size=15)
ax.set_title("Hotel and AirBnB Prices/Night per Area in Berlin", size=20)
plt.xticks(rotation=50);  # trailing ';' hides the tick-object repr

In [None]:
# Density comparison between hotels and Airbnb: number of listings per area,
# one bar per source.
# groupby(...).size() does not depend on row order and already returns sorted
# keys, so no sort_values/reset_index is needed beforehand. fill_value=0 makes
# an area/source pair with no listings an explicit zero count instead of NaN.
listing_counts = merged_all.groupby(["area", "source"]).size().unstack(fill_value=0)

# NOTE(review): figsize (10, 16) is much taller than the (10, 6) used by the
# price charts — confirm this is intentional.
fig, ax = plt.subplots(figsize=(10, 16))
density_chart = listing_counts.plot(kind="bar", ax=ax)
ax.legend()
ax.set_xlabel("Area", size=15)
ax.set_ylabel("Number of Listings", size=15)
ax.set_title("Hotel and AirBnB Listings per Area in Berlin", size=20)
plt.xticks(rotation=50);  # trailing ';' hides the tick-object repr

In [None]:
# Histogram of the per-area listing counts, one series per source.
# It is of no use to us here and is kept only for reference.
counts_by_area = (
    merged_all
    .sort_values(["area", "source"])
    .reset_index(drop=True)
    .groupby(["area", "source"])
    .size()
    .unstack()
)
counts_by_area.plot.hist(alpha=0.5);

In [None]:
#merged_data_full.boxplot(column="price")

#we have outliers - might consider dropping the one that's 600+ per night

In [None]:
#merged_all

In [None]:
# number of hotels and air bnb's per area
#n = merged_all.groupby(["area","source"]).size()
#n

In [None]:
#n.agg("n.size()").reset_index().plot(kind="bar");

In [None]:
#chart = Chart(merged_all).mark_bar().encode(column=Column("source"), x=X("area"), y=Y("name"), color=Color("source", scale=Scale(range=['#EA98D2', '#659CCA']))).configure_facet_cell(strokeWidth=0.0,)

#chart.display()

In [None]:
#merged_data_full['area'].value_counts().plot.bar()

#merged_data_full[['area','source']].plot(kind='bar')

#most hotels are in Mitte & Charlottenburg 

In [None]:
#hotel_li=[merged_data_full[merged_data_full["source"] == "hotel"].count()]
#airbnb_li=[merged_data_full[merged_data_full["source"] == "airbnb"].count()]
#index_area = [merged_data_full["area"]]
#df_area = pd.DataFrame({'hotel': hotel_li, 'airbnb': airbnb_li}, index=index_area)
#ax = merged_data_full.plot.bar(rot=0)

In [None]:
#[merged_data_full[merged_data_full["source"] == "hotel"].groupby(["area"]).count()]

In [None]:
#hotel_li = (merged_data_full[["area","source"]]
#          .groupby(["area", "source"])
#          .agg({"source": ["count"]}))

#hotel_li

In [None]:
#hotel_li = []
#i = 0
#for merged_data_full['source'].iloc[i] in merged_data_full['source'] == "hotel":
#    (merged_data_full[["area","source"]]
#          .groupby(["area", "source"])
 #         .agg({("source"): ["count"]}))
  #  hotel_li.append(merged_data_full['source'].iloc[i])
   # i +=1
#
#hotel_li

In [None]:
#[merged_data_full[merged_data_full["source"] == "hotel"].groupby("area").count()]

##### Create aggregate Dataset for plotting 

In [None]:
#df_agg = (merged_data_full[["area", "price", "source"]]
 #         .groupby(["area", "source"])
  #        .agg({"price": ["median", "count"]}))
#
#df_agg = (df_agg
#          .reset_index())
#df_agg.head()

##### Flatten the MultiIndex column labels

In [None]:
#df_agg.columns

In [None]:
#list(df_agg.columns.to_flat_index())

In [None]:
#[first+second for first, second in df_agg.columns]

In [None]:
#df_agg.columns = [first+second for first, second in df_agg.columns.to_flat_index()]

In [None]:
#df_agg = df_agg.rename(columns={"pricemedian": "median_price",
#                                "pricecount": "counts"})

In [None]:
#df_agg.dtypes

In [None]:
#(df_agg
# .groupby("area", "source")
# .agg({"median_price": "sum"})).plot(kind="bar", color="red");