In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import json
import datetime

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [2]:
#export the booking.csv file to pandas df
df = pd.read_csv("BookingcomHotelsinBerlin.csv", low_memory=False)

In [3]:
#check the df
df.head(5)

Unnamed: 0,Field1,Field2,Field3
0,HotelCity Hotel am Gendarmenmarkt,"Leipziger Str. 65, Mitte, 10117 Berlin, Germany",€ 107
1,HotelARCO Hotel,"Geisbergstrasse 30, Tempelhof-Schöneberg, 1077...",
2,HotelSchulz Hotel Berlin Wall at the East Side...,"Stralauer Platz 36, Friedrichshain-Kreuzberg, ...",
3,HotelGuesthouse21,"Martin-Luther-Strasse 21, Tempelhof-Schöneberg...",
4,Hotelmk hotel berlin,"Osloer Straße 100, Mitte, 13359 Berlin, Germany",€ 91


In [4]:
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
Field1,359,358,HotelGold Hotel,2
Field2,359,358,"Weserstr. 24, Friedrichshain-Kreuzberg, 10247 ...",2
Field3,287,104,€ 99,10


In [5]:
#check the types
df.dtypes

Field1    object
Field2    object
Field3    object
dtype: object

In [6]:
#make a copy to edit
df_c = df.copy()

In [7]:
#check the columns in order to drop the duplicates
df_c.columns

Index(['Field1', 'Field2', 'Field3'], dtype='object')

In [8]:
# renaming the columns
df_c.columns = ["Hotel", "Address", "Price"]
df_c.head(5)

Unnamed: 0,Hotel,Address,Price
0,HotelCity Hotel am Gendarmenmarkt,"Leipziger Str. 65, Mitte, 10117 Berlin, Germany",€ 107
1,HotelARCO Hotel,"Geisbergstrasse 30, Tempelhof-Schöneberg, 1077...",
2,HotelSchulz Hotel Berlin Wall at the East Side...,"Stralauer Platz 36, Friedrichshain-Kreuzberg, ...",
3,HotelGuesthouse21,"Martin-Luther-Strasse 21, Tempelhof-Schöneberg...",
4,Hotelmk hotel berlin,"Osloer Straße 100, Mitte, 13359 Berlin, Germany",€ 91


In [9]:
# splitting thr address field into useful values for us
df_ap=pd.DataFrame(df_c["Address"].str.split(",", n=2, expand = True))
df_ap.dtypes

0    object
1    object
2    object
dtype: object

In [10]:
df_ap.reset_index(drop = True)

Unnamed: 0,0,1,2
0,Leipziger Str. 65,Mitte,"10117 Berlin, Germany"
1,Geisbergstrasse 30,Tempelhof-Schöneberg,"10777 Berlin, Germany"
2,Stralauer Platz 36,Friedrichshain-Kreuzberg,"10243 Berlin, Germany"
3,Martin-Luther-Strasse 21,Tempelhof-Schöneberg,"10777 Berlin, Germany"
4,Osloer Straße 100,Mitte,"13359 Berlin, Germany"
5,Neue Grünstraße 28,Mitte,"10179 Berlin, Germany"
6,Prager Str. 12,Charlottenburg-Wilmersdorf,"10779 Berlin, Germany"
7,Warschauer Str. 39-40,Friedrichshain-Kreuzberg,"10243 Berlin, Germany"
8,Albrechtstr. 8,Mitte,"10117 Berlin, Germany"
9,Stresemannstr. 47,Friedrichshain-Kreuzberg,"10963 Berlin, Germany"


In [11]:
df_c = df_c.join(df_ap, lsuffix="_caller", rsuffix="_other")
df_c.head(5)

Unnamed: 0,Hotel,Address,Price,0,1,2
0,HotelCity Hotel am Gendarmenmarkt,"Leipziger Str. 65, Mitte, 10117 Berlin, Germany",€ 107,Leipziger Str. 65,Mitte,"10117 Berlin, Germany"
1,HotelARCO Hotel,"Geisbergstrasse 30, Tempelhof-Schöneberg, 1077...",,Geisbergstrasse 30,Tempelhof-Schöneberg,"10777 Berlin, Germany"
2,HotelSchulz Hotel Berlin Wall at the East Side...,"Stralauer Platz 36, Friedrichshain-Kreuzberg, ...",,Stralauer Platz 36,Friedrichshain-Kreuzberg,"10243 Berlin, Germany"
3,HotelGuesthouse21,"Martin-Luther-Strasse 21, Tempelhof-Schöneberg...",,Martin-Luther-Strasse 21,Tempelhof-Schöneberg,"10777 Berlin, Germany"
4,Hotelmk hotel berlin,"Osloer Straße 100, Mitte, 13359 Berlin, Germany",€ 91,Osloer Straße 100,Mitte,"13359 Berlin, Germany"


In [12]:
df_c = df_c.drop(columns = "Address")

In [13]:
# renaming the columns
df_c.columns = ["Hotel", "Price €", "Street", "Area", "Zip"]
df_c.reset_index(drop = True)
df_c.head(5)

Unnamed: 0,Hotel,Price €,Street,Area,Zip
0,HotelCity Hotel am Gendarmenmarkt,€ 107,Leipziger Str. 65,Mitte,"10117 Berlin, Germany"
1,HotelARCO Hotel,,Geisbergstrasse 30,Tempelhof-Schöneberg,"10777 Berlin, Germany"
2,HotelSchulz Hotel Berlin Wall at the East Side...,,Stralauer Platz 36,Friedrichshain-Kreuzberg,"10243 Berlin, Germany"
3,HotelGuesthouse21,,Martin-Luther-Strasse 21,Tempelhof-Schöneberg,"10777 Berlin, Germany"
4,Hotelmk hotel berlin,€ 91,Osloer Straße 100,Mitte,"13359 Berlin, Germany"


In [14]:
df_c2 = pd.DataFrame(df_c["Hotel"].str.split("Hotel", n=1, expand = True))

In [15]:
df_c2.head(5)

Unnamed: 0,0,1
0,,City Hotel am Gendarmenmarkt
1,,ARCO Hotel
2,,Schulz Hotel Berlin Wall at the East Side Gallery
3,,Guesthouse21
4,,mk hotel berlin


In [16]:
df_c = df_c2.join(df_c, lsuffix="_caller", rsuffix="_other")

In [17]:
df_c.head(5)

Unnamed: 0,0,1,Hotel,Price €,Street,Area,Zip
0,,City Hotel am Gendarmenmarkt,HotelCity Hotel am Gendarmenmarkt,€ 107,Leipziger Str. 65,Mitte,"10117 Berlin, Germany"
1,,ARCO Hotel,HotelARCO Hotel,,Geisbergstrasse 30,Tempelhof-Schöneberg,"10777 Berlin, Germany"
2,,Schulz Hotel Berlin Wall at the East Side Gallery,HotelSchulz Hotel Berlin Wall at the East Side...,,Stralauer Platz 36,Friedrichshain-Kreuzberg,"10243 Berlin, Germany"
3,,Guesthouse21,HotelGuesthouse21,,Martin-Luther-Strasse 21,Tempelhof-Schöneberg,"10777 Berlin, Germany"
4,,mk hotel berlin,Hotelmk hotel berlin,€ 91,Osloer Straße 100,Mitte,"13359 Berlin, Germany"


In [18]:
df_c.dtypes

0          object
1          object
Hotel      object
Price €    object
Street     object
Area       object
Zip        object
dtype: object

In [19]:
df_c.columns = ["temp1", "Hotel", "temp2", "Price €", "Street", "Area", "Zip"]
df_c = df_c.drop(columns = ["temp1", "temp2"])

In [20]:
df_c = df_c.dropna()
df_c.reset_index(drop = True)
df_c.head(5)

Unnamed: 0,Hotel,Price €,Street,Area,Zip
0,City Hotel am Gendarmenmarkt,€ 107,Leipziger Str. 65,Mitte,"10117 Berlin, Germany"
4,mk hotel berlin,€ 91,Osloer Straße 100,Mitte,"13359 Berlin, Germany"
5,Best Western Hotel am Spittelmarkt,€ 79,Neue Grünstraße 28,Mitte,"10179 Berlin, Germany"
6,Hyperion Hotel Berlin,€ 131,Prager Str. 12,Charlottenburg-Wilmersdorf,"10779 Berlin, Germany"
7,Michelberger Hotel,€ 85,Warschauer Str. 39-40,Friedrichshain-Kreuzberg,"10243 Berlin, Germany"


In [21]:
df_c.to_csv("Booking_area_comparison.csv")
df_c.to_pickle("Booking_area_comparison.pkl")

In [22]:
df_c1 = df_c["Price €"].dropna(how='all')

In [23]:
df_c1.replace(regex=True,inplace=True,to_replace=r'\D',value=r'')

In [24]:
df_c1.astype(int)

0      107
4       91
5       79
6      131
7       85
9      100
11      59
12     106
13      87
14      62
16     110
19     178
20      65
21     169
22      68
23      95
25      99
26      69
27      89
28      99
29      85
30      99
31      80
33     106
34     110
35      93
36      70
37      73
38     102
40      75
41      98
43      90
44      65
45     201
47      69
48     109
49     124
51     243
53     115
54     131
55     142
56      99
57      89
58      93
59      89
61     148
63     120
64      98
65     128
68      99
69      90
72     217
73     163
75      63
76      90
77     185
78     108
79     148
80     119
82      96
83     137
84     113
85      74
86     143
87      70
89     123
90     155
91     108
92      78
93     170
94      78
96      67
97     113
98     156
99      95
100     95
101    224
104     80
105     85
106     88
107    174
109    228
110    326
111     87
112     72
113     79
114    109
116    127
117     67
118    194
119    139

In [25]:
df_c["Price €"] = df_c1

In [26]:
df_c.reset_index(drop = True)

Unnamed: 0,Hotel,Price €,Street,Area,Zip
0,City Hotel am Gendarmenmarkt,107,Leipziger Str. 65,Mitte,"10117 Berlin, Germany"
1,mk hotel berlin,91,Osloer Straße 100,Mitte,"13359 Berlin, Germany"
2,Best Western Hotel am Spittelmarkt,79,Neue Grünstraße 28,Mitte,"10179 Berlin, Germany"
3,Hyperion Hotel Berlin,131,Prager Str. 12,Charlottenburg-Wilmersdorf,"10779 Berlin, Germany"
4,Michelberger Hotel,85,Warschauer Str. 39-40,Friedrichshain-Kreuzberg,"10243 Berlin, Germany"
5,NH Berlin Potsdamer Platz,100,Stresemannstr. 47,Friedrichshain-Kreuzberg,"10963 Berlin, Germany"
6,EnergieHotel Berlin City West,59,Wielandstrasse 7/8,Charlottenburg-Wilmersdorf,"10625 Berlin, Germany"
7,Novotel Suites Berlin City Potsdamer Platz,106,Anhalter Str. 2,Friedrichshain-Kreuzberg,"10963 Berlin, Germany"
8,ibis budget Berlin Kurfürstendamm,87,Bayreuther Strasse 41,Tempelhof-Schöneberg,"10787 Berlin, Germany"
9,ibis Hotel Berlin Airport Tegel,62,Alt-Reinickendorf 4-5,Reinickendorf,"13407 Berlin, Germany"


In [27]:
df_c.to_csv("Booking_price_comparison.csv")
df_c.to_pickle("Booking_price_comparison.pkl")