In [2]:
import pandas as pd
import numpy as np
import copy

# Import data

In [3]:
data = pd.read_csv('../../data/train_airbnb_berlin.csv', delimiter = ',')

# Data analysis 

In [4]:
data.head()

'''
'Listing ID' : useless, take out of data
'Listing Name' : useless, take out of data
'Host ID' : transform feature to number of listings
'Host Name' : useless, take out of data (except if we can separate by gender --> new feature gender of host)
'Host Since' : transform date into a continuous number
'Host Response Time': build 4 discrete values + unknown for NaNs
'Host Response Rate': transform percentage to number + treat NaNs
'Is Superhost' : transform to binary 0, 1
'neighbourhood' : tranform classes to numbers (not sure, think about it)
'Neighbourhood Group' : transform classes to numbers, but is this feature usefull ??
'City' : useless, take out of data
'Postal Code' : leave as is
'Country' : useless, take out of data
'Country Code' : useless, take out of data
'Latitude' : keep as is
'Longitude' : keep as is + we could think of a feature combining Latitude and Longitude
'Is Exact Location' : transform to binary
'Property Type' : transform classes to numbers
'Room Type' : transform to 3 discrete variables
'Accomodates' : leave as is
'Bathrooms' : leave as is
'Bedrooms' : leave as is
'Beds' : leave as is
'Square Feet' : take out of data because 98% of NaNs
'Guests Included' : leave as is
'Min Nights' : leave as is
'Reviews' : leave as is (number of reviews)
'First Review' : transform date into a continuous number
'Last Review' : transform date into a continuous number
'Overall Rating' : leave as is
'Accuracy Rating' : leave as is
'Cleanliness Rating' : leave as is
'Checkin Rating' : leave as is
'Communication Rating' : leave as is
'Location Rating' : leave as is
'Value Rating' : leave as is
'Instant Bookable' : transform to binary
'Business Travel Ready' : useless all false
'Price' : leave as is
'''

"\n'Listing ID' : useless, take out of data\n'Listing Name' : useless, take out of data\n'Host ID' : transform feature to number of listings\n'Host Name' : useless, take out of data (except if we can separate by gender --> new feature gender of host)\n'Host Since' : transform date into a continuous number\n'Host Response Time': build 4 discrete values + unknown for NaNs\n'Host Response Rate': transform percentage to number + treat NaNs\n'Is Superhost' : transform to binary 0, 1\n'neighbourhood' : tranform classes to numbers (not sure, think about it)\n'Neighbourhood Group' : transform classes to numbers, but is this feature usefull ??\n'City' : useless, take out of data\n'Postal Code' : leave as is\n'Country' : useless, take out of data\n'Country Code' : useless, take out of data\n'Latitude' : keep as is\n'Longitude' : keep as is + we could think of a feature combining Latitude and Longitude\n'Is Exact Location' : transform to binary\n'Property Type' : transform classes to numbers\n'Ro

In [5]:
data.shape

(15692, 39)

In [6]:
data.isnull().sum()/data.shape[0]

Listing ID               0.000064
Listing Name             0.003441
Host ID                  0.000000
Host Name                0.001402
Host Since               0.001338
Host Response Time       0.450867
Host Response Rate       0.450867
Is Superhost             0.001466
neighbourhood            0.000000
Neighborhood Group       0.000000
City                     0.000064
Postal Code              0.014593
Country Code             0.000000
Country                  0.000000
Latitude                 0.000000
Longitude                0.000000
Is Exact Location        0.000000
Property Type            0.000000
Room Type                0.000000
Accomodates              0.000000
Bathrooms                0.000892
Bedrooms                 0.000319
Beds                     0.000510
Square Feet              0.980691
Guests Included          0.000000
Min Nights               0.000000
Reviews                  0.000000
First Review             0.172381
Last Review              0.172445
Overall Rating

In [7]:
data['Postal Code']

0        10437.0
1        13187.0
2        10439.0
3          10245
4        10405.0
          ...   
15687    10961.0
15688    12055.0
15689    13351.0
15690    12159.0
15691      10315
Name: Postal Code, Length: 15692, dtype: object

In [14]:
data['Postal Code'].unique()

array(['10437.0', '13187.0', '10439.0', '10245', '10405.0', '*', '10997',
       '10961.0', '10553.0', '14197.0', '12047.0', '12435.0', '10781.0',
       '13407', nan, '13086.0', '12055.0', '10245.0', '12161.0',
       '10249.0', '10555.0', '13353.0', '10999.0', '12053.0', '10969.0',
       '10629.0', '10709.0', '13357', '10967.0', '12167.0', '13347.0',
       '10407', '12049', '12437.0', '10247', '10249', '10407.0',
       '12049.0', '13055.0', '10317.0', '10178.0', '10787.0', '12055',
       '10997.0', '10965', '12059.0', '10829.0', '12157.0', '13359.0',
       '10115.0', '10119.0', '13357.0', '10409.0', '10179.0', '12051',
       '10365', '12159.0', '10711', '10405', '10435.0', '10963.0',
       '12043', '12045.0', '10551', '10961', '10247.0', '10823.0',
       '10627.0', '10369.0', '12043.0', '10585.0', '10967', '10439',
       '10999', '10587.0', '12203.0', '10437', '12099.0', '10965.0',
       '10243.0', '10783.0', '12623.0', '10589.0', '12045', '10777',
       '10777.0', '10969'

# Data preprocessing

In [59]:
del data['Square Feet']
del data['Listing ID']
del data['Listing Name']
del data['Host Name']
del data['City']
del data['Country']
del data['First Review']
del data['Last Review']
del data['Business Travel Ready']

In [65]:
data[data=='*'] = np.nan

### data_1 : just eliminate all instances with NaNs

In [60]:
# Naively drop all instances with NaNs
data_1 = data[~data.isnull().any(axis=1)]
data_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7524 entries, 5 to 15690
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Host ID               7524 non-null   float64
 1   Host Since            7524 non-null   object 
 2   Host Response Time    7524 non-null   object 
 3   Host Response Rate    7524 non-null   object 
 4   Is Superhost          7524 non-null   object 
 5   neighbourhood         7524 non-null   object 
 6   Neighborhood Group    7524 non-null   object 
 7   Postal Code           7524 non-null   object 
 8   Country Code          7524 non-null   object 
 9   Latitude              7524 non-null   float64
 10  Longitude             7524 non-null   float64
 11  Is Exact Location     7524 non-null   object 
 12  Property Type         7524 non-null   object 
 13  Room Type             7524 non-null   object 
 14  Accomodates           7524 non-null   object 
 15  Bathrooms           

### data_2 : treat Host Response Time and Host Response Rate then data_1

In [61]:
data_2 = copy.copy(data)

data_2['Host Response Time'][data_2['Host Response Time'].isnull()] = 'unknown'
data_2['Host Response Rate'][data_2['Host Response Rate'].isnull()] = 'unknown'

data_2 = data_2[~data_2.isnull().any(axis=1)]

data_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12498 entries, 0 to 15690
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Host ID               12498 non-null  float64
 1   Host Since            12498 non-null  object 
 2   Host Response Time    12498 non-null  object 
 3   Host Response Rate    12498 non-null  object 
 4   Is Superhost          12498 non-null  object 
 5   neighbourhood         12498 non-null  object 
 6   Neighborhood Group    12498 non-null  object 
 7   Postal Code           12498 non-null  object 
 8   Country Code          12498 non-null  object 
 9   Latitude              12498 non-null  float64
 10  Longitude             12498 non-null  float64
 11  Is Exact Location     12498 non-null  object 
 12  Property Type         12498 non-null  object 
 13  Room Type             12498 non-null  object 
 14  Accomodates           12498 non-null  object 
 15  Bathrooms          

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2['Host Response Time'][data_2['Host Response Time'].isnull()] = 'unknown'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2['Host Response Rate'][data_2['Host Response Rate'].isnull()] = 'unknown'


### data_3 : data_2 + set ratings to the mean when not given then data_1

In [62]:
data_3 = copy.copy(data)

data_3['Host Response Time'][data_3['Host Response Time'].isnull()] = 'unknown'
data_3['Host Response Rate'][data_3['Host Response Rate'].isnull()] = 'unknown'

data_3['Overall Rating'][data_3['Overall Rating'].isnull()] = np.nanmean(data_3['Overall Rating'])
data_3['Accuracy Rating'][data_3['Accuracy Rating'].isnull()] = np.nanmean(data_3['Accuracy Rating'])
data_3['Cleanliness Rating'][data_3['Cleanliness Rating'].isnull()] = np.nanmean(data_3['Cleanliness Rating'])
data_3['Checkin Rating'][data_3['Checkin Rating'].isnull()] = np.nanmean(data_3['Checkin Rating'])
data_3['Communication Rating'][data_3['Communication Rating'].isnull()] = np.nanmean(data_3['Communication Rating'])
data_3['Location Rating'][data_3['Location Rating'].isnull()] = np.nanmean(data_3['Location Rating'])
data_3['Value Rating'][data_3['Value Rating'].isnull()] = np.nanmean(data_3['Value Rating'])

data_3 = data_3[~data_3.isnull().any(axis=1)]

data_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15405 entries, 0 to 15691
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Host ID               15405 non-null  float64
 1   Host Since            15405 non-null  object 
 2   Host Response Time    15405 non-null  object 
 3   Host Response Rate    15405 non-null  object 
 4   Is Superhost          15405 non-null  object 
 5   neighbourhood         15405 non-null  object 
 6   Neighborhood Group    15405 non-null  object 
 7   Postal Code           15405 non-null  object 
 8   Country Code          15405 non-null  object 
 9   Latitude              15405 non-null  float64
 10  Longitude             15405 non-null  float64
 11  Is Exact Location     15405 non-null  object 
 12  Property Type         15405 non-null  object 
 13  Room Type             15405 non-null  object 
 14  Accomodates           15405 non-null  object 
 15  Bathrooms          

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_3['Host Response Time'][data_3['Host Response Time'].isnull()] = 'unknown'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_3['Host Response Rate'][data_3['Host Response Rate'].isnull()] = 'unknown'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_3['Overall Rating'][data_3['Overall Rating'].isnull()] = np.nanmean(data_3['Overall Rating'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pand