In [1]:
import pandas as pd
import zipfile


zf = zipfile.ZipFile('./berlin-airbnb-data.zip')

file_names = zf.namelist()


# calendar_summary = pd.read_csv(zf.open(file_names[0]))

# listings = pd.read_csv(zf.open(file_names[1]))

listings_summary = pd.read_csv(zf.open(file_names[2]))

# neighbourhoods = pd.read_csv(zf.open(file_names[3]))

# reviews = pd.read_csv(zf.open(file_names[4]))

# reviews_summary = pd.read_csv(zf.open(file_names[5]))

In [2]:
columns_to_keep = ['id', 'space', 'description', 'neighbourhood_group_cleansed', 
                   'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',  
                   'bedrooms', 'bed_type', 'amenities', 'price', 'cleaning_fee', 
                   'security_deposit', 'extra_people', 'guests_included', 'minimum_nights',  
                   'instant_bookable', 'is_business_travel_ready', 'cancellation_policy']

df = listings_summary[columns_to_keep].set_index('id')
df.shape

(22552, 21)

In [3]:
df['size'] = df['description'].str.extract('(\d{2,3}\s?[smSM])', expand=True)
df['size'] = df['size'].str.replace("\D", "")

# change datatype of size into float
df['size'] = df['size'].astype(float)

In [4]:
df.cleaning_fee.fillna('$0.00', inplace=True)
df.security_deposit.fillna('$0.00', inplace=True)

df.price = df.price.str.replace('$', '').str.replace(',', '').astype(float)
df.cleaning_fee = df.cleaning_fee.str.replace('$', '').str.replace(',', '').astype(float)
df.security_deposit = df.security_deposit.str.replace('$', '').str.replace(',', '').astype(float)
df.extra_people = df.extra_people.str.replace('$', '').str.replace(',', '').astype(float)

df.drop(df[ (df.price > 400) | (df.price == 0) ].index, axis=0, inplace=True)

In [5]:
from geopy.distance import great_circle

def distance_to_mid(lat, lon):
    berlin_centre = (52.5027778, 13.404166666666667)
    accommodation = (lat, lon)
    return great_circle(berlin_centre, accommodation).km

df['distance'] = df.apply(lambda x: distance_to_mid(x.latitude, x.longitude), axis=1)

In [6]:
df.dropna(subset=['bathrooms', 'bedrooms'], inplace=True)

In [7]:
sub_df = df[['accommodates', 'bathrooms', 'bedrooms',  'price', 'cleaning_fee', 
                 'security_deposit', 'extra_people', 'guests_included', 'distance', 'size']]

train_data = sub_df[sub_df['size'].notnull()]
test_data  = sub_df[sub_df['size'].isnull()]

# define X
X_train = train_data.drop('size', axis=1)
X_test  = test_data.drop('size', axis=1)

# define y
y_train = train_data['size']

# import Linear Regression
from sklearn.linear_model import LinearRegression

# instantiate
linreg = LinearRegression()

# fit model to training data
linreg.fit(X_train, y_train)

y_test = linreg.predict(X_test)

In [8]:
y_test = pd.DataFrame(y_test)
y_test.columns = ['size']

# make the index of X_test to an own dataframe
prelim_index = pd.DataFrame(X_test.index)
prelim_index.columns = ['prelim']

# ... and concat this dataframe with y_test
y_test = pd.concat([y_test, prelim_index], axis=1)
y_test.set_index(['prelim'], inplace=True)

new_test_data = pd.concat([X_test, y_test], axis=1)

sub_df_new = pd.concat([new_test_data, train_data], axis=0)

df.drop(['accommodates', 'bathrooms', 'bedrooms', 'price', 'cleaning_fee', 
             'security_deposit', 'extra_people', 'guests_included', 'distance', 'size'], 
            axis=1, inplace=True)

df = pd.concat([sub_df_new, df], axis=1)

In [9]:
df.drop(df[ (df['size'] == 0.) | (df['size'] > 300.) ].index, axis=0, inplace=True)

In [10]:
df.drop(columns=['space', 'description', 'is_business_travel_ready'], inplace = True)

In [11]:
df.isnull().sum()

accommodates                    0
bathrooms                       0
bedrooms                        0
price                           0
cleaning_fee                    0
security_deposit                0
extra_people                    0
guests_included                 0
distance                        0
size                            0
neighbourhood_group_cleansed    0
latitude                        0
longitude                       0
property_type                   0
room_type                       0
bed_type                        0
amenities                       0
minimum_nights                  0
instant_bookable                0
cancellation_policy             0
dtype: int64

In [12]:
df.shape

(21958, 20)

In [13]:
test_lemma = pd.read_csv('./lemma.csv')

In [14]:
df = df.merge(test_lemma.set_index('id'), how = 'left', left_index=True, right_index=True)

In [15]:
df.shape

(21958, 21)

In [16]:
df_test = df.drop(['latitude', 'longitude', 'neighbourhood_group_cleansed', 'property_type', 'amenities', 'cleaning_fee', 'security_deposit', 'security_deposit',
                   'extra_people', 'distance'], axis=1).copy()

df_test.dropna(subset=['bag_of_words'], inplace=True)

for col in ['room_type', 'bed_type', 'instant_bookable', 
             'cancellation_policy']:
    df_test[col] = df_test[col].astype('category')

df_test.shape

(21945, 12)

In [17]:
df_test = df_test.drop(["guests_included"], axis=1)

In [18]:
target = df_test[["price"]]

# define our features 
features = df_test.drop(["price"], axis=1)

In [19]:
number_columns = features.select_dtypes(include=['number']).columns.tolist()
number_columns

['accommodates', 'bathrooms', 'bedrooms', 'size', 'minimum_nights']

In [20]:
features

Unnamed: 0,accommodates,bathrooms,bedrooms,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,bag_of_words
2015,3,1.0,1.0,75.000000,Entire home/apt,Real Bed,4,f,strict_14_with_grace_period,great location 30 75 sq meter wood floor high ...
2695,2,1.0,1.0,25.000000,Private room,Real Bed,2,f,flexible,summertime spending weekend little house garde...
3176,4,1.0,1.0,68.000000,Entire home/apt,Real Bed,62,t,strict_14_with_grace_period,beautiful floor apartment situate kollwitzplat...
3309,2,1.0,1.0,26.000000,Private room,Pull-out Sofa,5,f,strict_14_with_grace_period,prefer short notice booking request 1 2 week a...
7071,2,1.0,1.0,20.000000,Private room,Real Bed,2,f,moderate,cozy large room beautiful district prenzlauer ...
...,...,...,...,...,...,...,...,...,...,...
29856708,2,1.0,1.0,52.545744,Entire home/apt,Real Bed,2,f,flexible,charming apartment right center close main tou...
29857108,6,1.0,1.0,52.685189,Shared room,Real Bed,1,t,flexible,wohnung liegt unmittelbarer nähe viktoria luis...
29864272,2,1.0,0.0,60.482581,Entire home/apt,Real Bed,3,f,flexible,prenzlauer berg creative district plenty bar r...
29866805,2,1.0,1.0,65.178081,Private room,Real Bed,1,f,flexible,double room private shower wc design artist ro...


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

from sklearn.model_selection import train_test_split
# import metrics
from sklearn.metrics import mean_squared_error, r2_score

# split our data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)


vect = CountVectorizer(analyzer='word', max_df=.99, min_df=.2)
enc = OrdinalEncoder()
sc = StandardScaler()

booster = xgb.XGBRegressor(colsample_bytree= 0.6, gamma= 0.0, learning_rate= 0.1, max_depth= 7, n_estimators= 200, tree_method = 'gpu_exact')

# ct = ColumnTransformer(
#         [("norm1", Normalizer(norm='l1'), [0, 1]),
#         ("norm2", Normalizer(norm='l1'), slice(2, 4))])
text_transformer = Pipeline([('vect', vect)])

preprocessor = ColumnTransformer(
        transformers=[('text', vect, 'bag_of_words'),
                      ('category', enc, ['room_type', 'bed_type', 'instant_bookable', 'cancellation_policy']),
                      ('numbers', sc, number_columns)],
        remainder='passthrough')

# encoding_pipe = Pipeline([('enc', enc), ('sc', sc)])

# test_transform = preprocessor.fit_transform
# test_transform.inverse

pipe = Pipeline([('preprocessor', preprocessor), ('booster', booster)])
# grid_search = GridSearchCV(pipe, params, cv=5, n_jobs=4, verbose=5)
# grid_search.fit(data.data, data.target)

pipe.fit(features, target)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('text',
                                                  CountVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.int64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=0.99,
                     

In [22]:
y_pred = pipe.predict(X_test)

In [23]:
error = mean_squared_error(y_test, y_pred)
error

280.880448332746

In [24]:
score = r2_score(y_test, y_pred)
score

0.8287356198361193

In [25]:
pipe.predict(X_test.iloc[0:1])

array([134.35118], dtype=float32)

In [26]:
X_test.iloc[0:1]

Unnamed: 0,accommodates,bathrooms,bedrooms,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,bag_of_words
7863925,5,1.5,4.0,92.633579,Entire home/apt,Real Bed,1,f,flexible,rent friend beautiful modern cosy loft situate...


In [27]:
pipe.predict(X_test.iloc[0:1])

array([134.35118], dtype=float32)

In [28]:
X_test.iloc[0]

accommodates                                                           5
bathrooms                                                            1.5
bedrooms                                                               4
size                                                             92.6336
room_type                                                Entire home/apt
bed_type                                                        Real Bed
minimum_nights                                                         1
instant_bookable                                                       f
cancellation_policy                                             flexible
bag_of_words           rent friend beautiful modern cosy loft situate...
Name: 7863925, dtype: object

In [29]:
import numpy as np
X_test.iloc[0:4]

Unnamed: 0,accommodates,bathrooms,bedrooms,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,bag_of_words
7863925,5,1.5,4.0,92.633579,Entire home/apt,Real Bed,1,f,flexible,rent friend beautiful modern cosy loft situate...
23338593,2,1.0,1.0,10.0,Entire home/apt,Real Bed,3,f,flexible,nice apartment locate heart close 2 subway sta...
24667228,8,1.0,3.0,15.0,Entire home/apt,Real Bed,63,t,strict_14_with_grace_period,modern spacious flat new berliner comfy fully ...
4047579,2,1.0,0.0,57.701617,Entire home/apt,Real Bed,14,f,strict_14_with_grace_period,modern uncluttered studio situate good area ce...


In [30]:
y_test.iloc[0:4]

Unnamed: 0,price
7863925,150.0
23338593,90.0
24667228,66.0
4047579,74.0


In [31]:
# df_test.select_dtypes(include=['number'])

In [32]:
# from sklearn.feature_extraction.text import CountVectorizer

# test_lemma.dropna(subset=['bag_of_words'], inplace=True)

# vectorizer = CountVectorizer(analyzer='word', max_df=.99, min_df=.10)
# X = vectorizer.fit_transform(test_lemma.bag_of_words)

In [33]:
#print(vectorizer.get_feature_names())

In [34]:
# Get feature importances

# rf = pipe.named_steps['booster']
# importances = pd.Series(rf.feature_importances_, X_train.columns)

# # Plot feature importances
# %matplotlib inline
# import matplotlib.pyplot as plt

# n = 20
# plt.figure(figsize=(10,n/2))
# plt.title(f'Top {n} features')
# importances.sort_values()[-n:].plot.barh(color='grey');

In [35]:
# pipe.named_steps['preprocessor']

In [36]:
import joblib
#save model
joblib.dump(pipe, 'test2_regression.pkl') 

#load saved model
pipe = joblib.load('./test2_regression.pkl')




In [37]:
pipe.predict(X_test.iloc[0:1])

array([134.35118], dtype=float32)

In [38]:
# pd.DataFrame(X_test.iloc[0]).todense()

In [39]:
df.iloc[0:1]

Unnamed: 0,accommodates,bathrooms,bedrooms,price,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,...,latitude,longitude,property_type,room_type,bed_type,amenities,minimum_nights,instant_bookable,cancellation_policy,bag_of_words
2015,3,1.0,1.0,60.0,30.0,200.0,28.0,1,3.533182,75.0,...,52.534537,13.402557,Guesthouse,Entire home/apt,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",4,f,strict_14_with_grace_period,great location 30 75 sq meter wood floor high ...


In [40]:
test_df = features.iloc[0:2].copy()

In [41]:
test_df

Unnamed: 0,accommodates,bathrooms,bedrooms,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,bag_of_words
2015,3,1.0,1.0,75.0,Entire home/apt,Real Bed,4,f,strict_14_with_grace_period,great location 30 75 sq meter wood floor high ...
2695,2,1.0,1.0,25.0,Private room,Real Bed,2,f,flexible,summertime spending weekend little house garde...


In [42]:
test_df.to_csv('testdf.csv', index=True)


In [43]:
df.room_type.value_counts()

Private room       11286
Entire home/apt    10382
Shared room          290
Name: room_type, dtype: int64

In [44]:
df.bed_type.value_counts()

Real Bed         21192
Pull-out Sofa      442
Futon              233
Couch               68
Airbed              23
Name: bed_type, dtype: int64

In [45]:
df.cancellation_policy.value_counts()

flexible                       8886
moderate                       6929
strict_14_with_grace_period    6085
super_strict_30                  48
super_strict_60                  10
Name: cancellation_policy, dtype: int64

In [46]:
features.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,size,room_type,bed_type,minimum_nights,instant_bookable,cancellation_policy,bag_of_words
2015,3,1.0,1.0,75.0,Entire home/apt,Real Bed,4,f,strict_14_with_grace_period,great location 30 75 sq meter wood floor high ...
2695,2,1.0,1.0,25.0,Private room,Real Bed,2,f,flexible,summertime spending weekend little house garde...
3176,4,1.0,1.0,68.0,Entire home/apt,Real Bed,62,t,strict_14_with_grace_period,beautiful floor apartment situate kollwitzplat...
3309,2,1.0,1.0,26.0,Private room,Pull-out Sofa,5,f,strict_14_with_grace_period,prefer short notice booking request 1 2 week a...
7071,2,1.0,1.0,20.0,Private room,Real Bed,2,f,moderate,cozy large room beautiful district prenzlauer ...


In [47]:
features.shape

(21945, 10)

In [48]:
features.columns

Index(['accommodates', 'bathrooms', 'bedrooms', 'size', 'room_type',
       'bed_type', 'minimum_nights', 'instant_bookable', 'cancellation_policy',
       'bag_of_words'],
      dtype='object')

In [53]:
features['size'].describe()

count    21945.000000
mean        52.563153
std         38.580429
min          1.000000
25%         25.000000
50%         49.354306
75%         64.420869
max        300.000000
Name: size, dtype: float64