In [1]:
import pandas as pd
import zipfile


zf = zipfile.ZipFile('./berlin-airbnb-data.zip')

file_names = zf.namelist()


# calendar_summary = pd.read_csv(zf.open(file_names[0]))

# listings = pd.read_csv(zf.open(file_names[1]))

listings_summary = pd.read_csv(zf.open(file_names[2]))

# neighbourhoods = pd.read_csv(zf.open(file_names[3]))

# reviews = pd.read_csv(zf.open(file_names[4]))

# reviews_summary = pd.read_csv(zf.open(file_names[5]))

In [2]:
columns_to_keep = ['id', 'space', 'description', 'host_has_profile_pic', 'neighbourhood_group_cleansed', 
                   'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',  
                   'bedrooms', 'bed_type', 'amenities', 'price', 'cleaning_fee', 
                   'security_deposit', 'extra_people', 'guests_included', 'minimum_nights',  
                   'instant_bookable', 'is_business_travel_ready', 'cancellation_policy']

df = listings_summary[columns_to_keep].set_index('id')
df.shape

(22552, 22)

In [3]:
df['size'] = df['description'].str.extract('(\d{2,3}\s?[smSM])', expand=True)
df['size'] = df['size'].str.replace("\D", "")

# change datatype of size into float
df['size'] = df['size'].astype(float)

In [4]:
df.cleaning_fee.fillna('$0.00', inplace=True)
df.security_deposit.fillna('$0.00', inplace=True)
df.host_has_profile_pic.fillna('f', inplace=True)

df.price = df.price.str.replace('$', '').str.replace(',', '').astype(float)
df.cleaning_fee = df.cleaning_fee.str.replace('$', '').str.replace(',', '').astype(float)
df.security_deposit = df.security_deposit.str.replace('$', '').str.replace(',', '').astype(float)
df.extra_people = df.extra_people.str.replace('$', '').str.replace(',', '').astype(float)

df.drop(df[ (df.price > 400) | (df.price == 0) ].index, axis=0, inplace=True)

In [5]:
from geopy.distance import great_circle

def distance_to_mid(lat, lon):
    berlin_centre = (52.5027778, 13.404166666666667)
    accommodation = (lat, lon)
    return great_circle(berlin_centre, accommodation).km

df['distance'] = df.apply(lambda x: distance_to_mid(x.latitude, x.longitude), axis=1)

In [6]:
df.dropna(subset=['bathrooms', 'bedrooms'], inplace=True)

In [7]:
sub_df = df[['accommodates', 'bathrooms', 'bedrooms',  'price', 'cleaning_fee', 
                 'security_deposit', 'extra_people', 'guests_included', 'distance', 'size']]

train_data = sub_df[sub_df['size'].notnull()]
test_data  = sub_df[sub_df['size'].isnull()]

# define X
X_train = train_data.drop('size', axis=1)
X_test  = test_data.drop('size', axis=1)

# define y
y_train = train_data['size']

# import Linear Regression
from sklearn.linear_model import LinearRegression

# instantiate
linreg = LinearRegression()

# fit model to training data
linreg.fit(X_train, y_train)

y_test = linreg.predict(X_test)

In [8]:
y_test = pd.DataFrame(y_test)
y_test.columns = ['size']

# make the index of X_test to an own dataframe
prelim_index = pd.DataFrame(X_test.index)
prelim_index.columns = ['prelim']

# ... and concat this dataframe with y_test
y_test = pd.concat([y_test, prelim_index], axis=1)
y_test.set_index(['prelim'], inplace=True)

new_test_data = pd.concat([X_test, y_test], axis=1)

sub_df_new = pd.concat([new_test_data, train_data], axis=0)

df.drop(['accommodates', 'bathrooms', 'bedrooms', 'price', 'cleaning_fee', 
             'security_deposit', 'extra_people', 'guests_included', 'distance', 'size'], 
            axis=1, inplace=True)

df = pd.concat([sub_df_new, df], axis=1)

In [9]:
df.drop(df[ (df['size'] == 0.) | (df['size'] > 300.) ].index, axis=0, inplace=True)

In [10]:
df.drop(columns=['space', 'description'], inplace = True)

In [11]:
df.isnull().sum()

accommodates                    0
bathrooms                       0
bedrooms                        0
price                           0
cleaning_fee                    0
security_deposit                0
extra_people                    0
guests_included                 0
distance                        0
size                            0
host_has_profile_pic            0
neighbourhood_group_cleansed    0
latitude                        0
longitude                       0
property_type                   0
room_type                       0
bed_type                        0
amenities                       0
minimum_nights                  0
instant_bookable                0
is_business_travel_ready        0
cancellation_policy             0
dtype: int64

In [12]:
df.shape

(21958, 22)

In [13]:
test_lemma = pd.read_csv('./lemma.csv')

In [14]:
df = df.merge(test_lemma.set_index('id'), how = 'left', left_index=True, right_index=True)

In [15]:
df.shape

(21958, 23)

In [16]:
df_test = df.drop(['latitude', 'longitude', 'neighbourhood_group_cleansed', 'property_type', 'amenities'], axis=1).copy()

df_test.dropna(subset=['bag_of_words'], inplace=True)

for col in ['host_has_profile_pic', 'room_type', 'bed_type', 'instant_bookable', 
            'is_business_travel_ready', 'cancellation_policy']:
    df_test[col] = df_test[col].astype('category')

df_test.shape

(21945, 18)

In [17]:
target = df_test[["price"]]

# define our features 
features = df_test.drop(["price"], axis=1)

In [18]:
number_columns = features.select_dtypes(include=['number']).columns.tolist()
number_columns

['accommodates',
 'bathrooms',
 'bedrooms',
 'cleaning_fee',
 'security_deposit',
 'extra_people',
 'guests_included',
 'distance',
 'size',
 'minimum_nights']

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

from sklearn.model_selection import train_test_split
# import metrics
from sklearn.metrics import mean_squared_error, r2_score

# split our data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)


vect = CountVectorizer(analyzer='word', max_df=.99, min_df=.2)
enc = OrdinalEncoder()
sc = StandardScaler()

booster = xgb.XGBRegressor(colsample_bytree= 0.6, gamma= 0.0, learning_rate= 0.1, max_depth= 7, n_estimators= 200, tree_method = 'gpu_hist')

# ct = ColumnTransformer(
#         [("norm1", Normalizer(norm='l1'), [0, 1]),
#         ("norm2", Normalizer(norm='l1'), slice(2, 4))])
text_transformer = Pipeline([('vect', vect)])

preprocessor = ColumnTransformer(
        transformers=[('text', vect, 'bag_of_words'),
                      ('category', enc, ['host_has_profile_pic', 'room_type', 'bed_type', 'instant_bookable', 'is_business_travel_ready', 'cancellation_policy']),
                      ('numbers', sc, number_columns)],
        remainder='passthrough')

# encoding_pipe = Pipeline([('enc', enc), ('sc', sc)])

# test_transform = preprocessor.fit_transform
# test_transform.inverse

pipe = Pipeline([('preprocessor', preprocessor), ('booster', booster)])
# grid_search = GridSearchCV(pipe, params, cv=5, n_jobs=4, verbose=5)
# grid_search.fit(data.data, data.target)

pipe.fit(features, target)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('text',
                                                  CountVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.int64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=0.99,
                     

In [20]:
y_pred = pipe.predict(X_test)

In [21]:
error = mean_squared_error(y_test, y_pred)
error

178.89659453037274

In [22]:
score = r2_score(y_test, y_pred)
score

0.897643394894573

In [23]:
pipe.predict(X_test.iloc[0:1])

array([32.7385  , 51.0669  , 39.38682 , 43.434025], dtype=float32)

In [47]:
X_test.iloc[0:1]

Unnamed: 0,accommodates,bathrooms,bedrooms,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,host_has_profile_pic,room_type,bed_type,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy,bag_of_words
22305234,2,1.0,1.0,8.0,0.0,25.0,1,7.109022,10.0,t,Private room,Real Bed,2,t,f,moderate,euch erwartet kleines gemütlich eingerichtetes...


In [42]:
pipe.predict(X_test.iloc[0:1])

array([32.7385], dtype=float32)

In [36]:
X_test.iloc[0]

accommodates                                                                2
bathrooms                                                                   1
bedrooms                                                                    1
cleaning_fee                                                                8
security_deposit                                                            0
extra_people                                                               25
guests_included                                                             1
distance                                                              7.10902
size                                                                       10
host_has_profile_pic                                                        t
room_type                                                        Private room
bed_type                                                             Real Bed
minimum_nights                                                  

In [35]:
pipe.named_steps['booster'].get_booster().feature_names

['f0',
 'f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f20',
 'f21',
 'f22',
 'f23',
 'f24',
 'f25',
 'f26',
 'f27',
 'f28',
 'f29',
 'f30',
 'f31',
 'f32',
 'f33',
 'f34']

In [24]:
import numpy as np
X_test.iloc[0:4]

Unnamed: 0,accommodates,bathrooms,bedrooms,cleaning_fee,security_deposit,extra_people,guests_included,distance,size,host_has_profile_pic,room_type,bed_type,minimum_nights,instant_bookable,is_business_travel_ready,cancellation_policy,bag_of_words
22305234,2,1.0,1.0,8.0,0.0,25.0,1,7.109022,10.0,t,Private room,Real Bed,2,t,f,moderate,euch erwartet kleines gemütlich eingerichtetes...
11006340,2,1.0,1.0,0.0,1000.0,0.0,1,4.638263,51.645274,t,Entire home/apt,Real Bed,7,f,f,strict_14_with_grace_period,away travel like sublet apartment 2 1/2 month ...
20781469,2,1.0,1.0,10.0,0.0,5.0,1,6.311901,40.0,t,Entire home/apt,Real Bed,2,f,f,moderate,bedroom 40m2 apartment west 10min foot subway ...
1335109,1,1.0,1.0,75.0,250.0,0.0,1,4.642475,52.372091,t,Entire home/apt,Pull-out Sofa,6,f,f,moderate,hey guy actress tour lot offer private flat ye...


In [25]:
y_test.iloc[0:4]

Unnamed: 0,price
22305234,24.0
11006340,48.0
20781469,30.0
1335109,32.0


In [26]:
# df_test.select_dtypes(include=['number'])

In [27]:
# from sklearn.feature_extraction.text import CountVectorizer

# test_lemma.dropna(subset=['bag_of_words'], inplace=True)

# vectorizer = CountVectorizer(analyzer='word', max_df=.99, min_df=.10)
# X = vectorizer.fit_transform(test_lemma.bag_of_words)

In [28]:
# print(vectorizer.get_feature_names())

In [29]:
# Get feature importances

rf = pipe.named_steps['booster']
importances = pd.Series(rf.feature_importances_, X_train.columns)

# Plot feature importances
%matplotlib inline
import matplotlib.pyplot as plt

n = 20
plt.figure(figsize=(10,n/2))
plt.title(f'Top {n} features')
importances.sort_values()[-n:].plot.barh(color='grey');

ValueError: Length of passed values is 35, index implies 17

In [None]:
pipe.named_steps['preprocessor']

In [48]:
import joblib
#save model
joblib.dump(pipe, 'test2_regression.pkl') 

#load saved model
pipe = joblib.load('./test2_regression.pkl')




In [49]:
pipe.predict(X_test.iloc[0:1])

array([32.7385], dtype=float32)

In [52]:
pd.DataFrame(X_test.iloc[0]).todense()

AttributeError: 'DataFrame' object has no attribute 'todense'