## Import Libraries

In [None]:
import re
import numpy as np
import pandas as pd

##  Reviews Dataset

In [None]:
# Read the raw reviews export (UTF-8) and preview its first rows
reviews_csv_path = './reviews.csv'
reviews = pd.read_csv(reviews_csv_path, encoding='utf-8')
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r<br/>Nous avons ...
1,2595,19176,2009-12-05,53267,Cate,Great experience.
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...
3,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."
4,2595,46312,2010-05-25,117113,Alicia,We had a wonderful stay at Jennifer's charming...


In [None]:
# Summarise dtypes and non-null counts to spot columns with missing values
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908803 entries, 0 to 908802
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     908803 non-null  int64 
 1   id             908803 non-null  int64 
 2   date           908803 non-null  object
 3   reviewer_id    908803 non-null  int64 
 4   reviewer_name  908797 non-null  object
 5   comments       908638 non-null  object
dtypes: int64(3), object(3)
memory usage: 41.6+ MB


### Drop Columns

In [None]:
# Only the listing key and the review text are kept; all other columns are dropped
review_columns_to_keep = ['listing_id', 'comments']
reviews = reviews[review_columns_to_keep]

### Remove Empty Reviews

In [None]:
def count_null_df(df):
    """Return the number of missing values per column, omitting complete columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Null counts indexed by column name, restricted to the columns that
        contain at least one missing value.
    """
    nulls_per_column = df.isnull().sum()
    has_nulls = nulls_per_column.ne(0)
    return nulls_per_column.loc[has_nulls]

In [None]:
count_null_df(reviews)
# Only `comments` has missing values (165 rows); those rows are removed in the next step

comments    165
dtype: int64

In [None]:
# Discard every review whose comment text is missing
reviews = reviews.dropna(subset=['comments'])

### Remove HTML Tags 

In [None]:
def remove_tags(df, col):
    """Strip HTML tags and non-breaking spaces from a text column.

    Every ``<...>`` tag and every non-breaking space (U+00A0) found in
    ``df[col]`` is replaced by a single regular space. Missing values are
    left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col : hashable
        Label of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col`` cleaned; ``df`` itself is not modified.
    """
    cleaned = (
        df[col]
        .str.replace(r'<[^<>]*>', ' ', regex=True)
        # Literal match: regex=False is spelled out because the default value
        # of ``regex`` differs between pandas versions.
        .str.replace('\xa0', ' ', regex=False)
    )
    # Write into a copy so the caller's frame is never mutated. Assigning into
    # ``df`` directly raises SettingWithCopyWarning when ``df`` is itself a
    # slice of another frame.
    result = df.copy()
    result[col] = cleaned
    return result

In [None]:
# Strip HTML markup (e.g. <br/>) and non-breaking spaces from the review text
reviews = remove_tags(reviews, 'comments')

In [None]:
# Cleaning completed: write the cleaned reviews to disk without the row index
reviews.to_csv('cleaned_reviews.csv', index=False)

## Listings Dataset

In [None]:
# Read the raw listings export (UTF-8) and preview its first rows
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a pandas warning (possibly a mixed-dtype DtypeWarning) - confirm
# on re-run and consider passing explicit dtypes.
listings_csv_path = 'listings.csv'
listings = pd.read_csv(listings_csv_path, encoding='utf-8')
listings.head()

  listings = pd.read_csv('listings.csv', encoding='utf-8')


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2595,https://www.airbnb.com/rooms/2595,20220106025017,2022-01-06,Skylit Midtown Castle,"Beautiful, spacious skylit studio in the heart...",Centrally located in the heart of Manhattan ju...,https://a0.muscache.com/pictures/f0813a11-40b2...,2845,https://www.airbnb.com/users/show/2845,...,4.79,4.86,4.41,,f,3,3,0,0,0.33
1,3831,https://www.airbnb.com/rooms/3831,20220106025017,2022-01-06,"Whole flr w/private bdrm, bath & kitchen(pls r...","Enjoy 500 s.f. top floor in 1899 brownstone, w...",Just the right mix of urban center and local n...,https://a0.muscache.com/pictures/e49999c2-9fd5...,4869,https://www.airbnb.com/users/show/4869,...,4.8,4.71,4.64,,f,1,1,0,0,4.62
2,5121,https://www.airbnb.com/rooms/5121,20220106025017,2022-01-06,BlissArtsSpace!,<b>The space</b><br />HELLO EVERYONE AND THANK...,,https://a0.muscache.com/pictures/2090980c-b68e...,7356,https://www.airbnb.com/users/show/7356,...,4.91,4.47,4.52,,f,2,0,2,0,0.33
3,5136,https://www.airbnb.com/rooms/5136,20220106025017,2022-01-06,"Spacious Brooklyn Duplex, Patio + Garden",We welcome you to stay in our lovely 2 br dupl...,,https://a0.muscache.com/pictures/miso/Hosting-...,7378,https://www.airbnb.com/users/show/7378,...,5.0,4.5,5.0,,f,1,1,0,0,0.02
4,5178,https://www.airbnb.com/rooms/5178,20220106025017,2022-01-07,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,"Theater district, many restaurants around here.",https://a0.muscache.com/pictures/12065/f070997...,8967,https://www.airbnb.com/users/show/8967,...,4.42,4.87,4.36,,f,1,0,1,0,3.33


In [None]:
# Summarise dtypes and non-null counts for every listings column
listings.info()
# The listings dataset has 74 columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38185 entries, 0 to 38184
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            38185 non-null  int64  
 1   listing_url                                   38185 non-null  object 
 2   scrape_id                                     38185 non-null  int64  
 3   last_scraped                                  38185 non-null  object 
 4   name                                          38171 non-null  object 
 5   description                                   37077 non-null  object 
 6   neighborhood_overview                         22560 non-null  object 
 7   picture_url                                   38185 non-null  object 
 8   host_id                                       38185 non-null  int64  
 9   host_url                                      38185 non-null 

### Drop Columns

In [None]:
# Features considered duplicated or redundant for the analysis.
# NOTE: despite its name, this list holds column labels, not rows; the name is
# kept unchanged in case other cells refer to it.
remove_listing_rows = [
    'scrape_id',
    'last_scraped',
    'picture_url',
    'host_id',
    'host_url',
    'host_name',
    'host_location',
    'host_about',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'host_total_listings_count',
    'host_verifications',
    'bathrooms',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'calendar_updated',
    'calendar_last_scraped',
    'first_review',
    'last_review',
    'license',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'reviews_per_month',
    'listing_url',
]

# Remove all of them in a single pass
listings = listings.drop(columns=remove_listing_rows)

### Inspect Null Values

In [None]:
# Show the number of missing values for each column that has any
count_null_df(listings)

name                              14
description                     1108
neighborhood_overview          15625
host_since                        47
host_response_time             14290
host_response_rate             14290
host_acceptance_rate           13310
host_is_superhost                 47
host_listings_count               47
host_has_profile_pic              47
host_identity_verified            47
neighbourhood                  15624
bathrooms_text                   103
bedrooms                        3889
beds                            1287
review_scores_rating            9176
review_scores_accuracy          9759
review_scores_cleanliness       9748
review_scores_checkin           9765
review_scores_communication     9755
review_scores_location          9768
review_scores_value             9769
dtype: int64

In [None]:
def drop_variables(df, col):
    """Return ``df`` without the rows whose value in ``col`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    col : hashable
        Label of the column that must be non-null for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the original row labels are preserved.
    """
    # With a single-column subset, how='all' drops exactly the rows where
    # that one column is null.
    return df.dropna(subset=[col], how='all')

#### Hosts

Listings with a null 'host_since' also appear to have null values for other important host features, such as 'host_acceptance_rate', 'host_listings_count' and 'host_response_rate'. Hence, we remove the rows where 'host_since' is null.

In [None]:
# Inspect 'host_since' with null values
check_nulls_hosts = listings[listings['host_since'].isna()]
check_nulls_hosts.head()

Unnamed: 0,id,name,description,neighborhood_overview,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count
2395,2835711,Sunny West Village Apartment on MacDougal Street,NOW WITH HIGH-SPEED WI-FI<br /><br />This ador...,West Village!,,,,,,,...,0,4.46,4.63,3.75,4.85,4.92,4.92,4.52,f,1
3377,4446862,Charming Room in Prospect Heights!,Quiet room in three bedroom apt in Prospect He...,This charming Brooklyn neighborhood offers man...,,,,,,,...,0,,,,,,,,f,1
3666,4763327,"Luxurious, best location, spa inc'l","Brand new luxurious boutique building, swimmin...","Quoting from Time Out Magazine: ""Williamsburg ...",,,,,,,...,0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,f,1
4040,5319785,Charming 1-BR in Gramercy,"Sunny, spacious, 1-bedroom apartment in great ...",,,,,,,,...,0,4.83,4.91,4.74,4.78,4.83,4.96,4.87,f,1
4600,6360224,"Sunny, Private room in Bushwick","Spacious room, lots of light. New, clean build...",,,,,,,,...,0,0.0,,,,,,,f,1


In [None]:
# Remove the listings whose 'host_since' is missing
listings = drop_variables(listings, 'host_since')

#### Beds & Bedrooms

Based on the Airbnb listing form, the minimum number of beds and bedrooms is 1. Hence, listings with a null value in either field are removed.

In [None]:
# Remove listings that are missing either room count, one column at a time
for room_col in ('bedrooms', 'beds'):
    listings = drop_variables(listings, room_col)

#### Review Scores

Listings with null values for review scores suggest that no reviews had been made as of the scrape date. Hence, we convert all of these null values to 0. Note that a score of 0 therefore means "no reviews yet", not a poor rating.

In [None]:
# Review-score columns in which a missing value is recorded as 0
review_scores_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

# Fill all of the score columns in one vectorised step
listings[review_scores_columns] = listings[review_scores_columns].fillna(0)

#### Name

Listings without a name appear to be low-quality records, as they are also missing much of their other data. Hence, we remove them.

In [None]:
# Inspect 'name' with null values
check_nulls_name = listings[listings['name'].isna()]
check_nulls_name.head()

Unnamed: 0,id,name,description,neighborhood_overview,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count
3214,4209595,,,,2014-08-29,,,,f,1.0,...,0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,f,1
3315,4370230,,Beautiful 1BR apartment in the heart of Nolita...,,2014-10-18,,,,f,1.0,...,0,5.0,5.0,4.8,5.0,4.8,5.0,4.8,f,1
3482,4581788,,,,2014-09-21,,,,f,1.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,f,1
3658,4756856,,"Sunny bedroom in Castlebraid, Bushwick! I shar...",,2012-02-29,,,,f,1.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,f,1
3678,4774658,,Room available (sublet) Jan.<br />Price very n...,,2014-12-08,,,,f,1.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,f,1


In [None]:
# Remove the listings whose 'name' is missing
listings = drop_variables(listings, 'name')

### Convert Data Types

In [None]:
def remove_units(df, col):
    """Strip currency and percentage formatting characters from a column.

    Removes every ``$``, ``,`` and ``%`` from the string values of
    ``df[col]`` (for example ``'$1,200.00'`` becomes ``'1200.00'``). The
    values stay strings; converting them to numbers is left to the caller.
    Missing values are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : hashable
        Label of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col`` cleaned; ``df`` itself is not modified.
    """
    # A raw-string character class matches the three symbols literally. The
    # previous non-raw '\$' pattern was an invalid escape sequence, which
    # Python reports as a DeprecationWarning/SyntaxWarning.
    stripped = df[col].replace(r'[$,%]', '', regex=True)
    # Write into a copy so the caller's frame is never mutated.
    result = df.copy()
    result[col] = stripped
    return result

In [None]:
# Strip '$', ',' and '%' from the rate and price columns before numeric conversion
for unit_col in ('host_response_rate', 'host_acceptance_rate', 'price'):
    listings = remove_units(listings, unit_col)

In [None]:
# Convert 'host_since' to datetime so it can be used in date arithmetic
listings['host_since'] = pd.to_datetime(listings['host_since'])

In [None]:
# Columns holding whole-number counts, cast to int
numeric_int_variables = [
    'maximum_nights',
    'minimum_nights',
    'bedrooms',
    'beds',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'calculated_host_listings_count',
]

# One astype call with a column -> dtype mapping replaces the per-column loop
listings = listings.astype(dict.fromkeys(numeric_int_variables, int))

In [None]:
# Columns holding prices, rates and review scores, cast to float
numeric_float_variables = [
    'price',
    'host_response_rate',
    'host_acceptance_rate',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

# One astype call with a column -> dtype mapping replaces the per-column loop
listings = listings.astype(dict.fromkeys(numeric_float_variables, float))

In [None]:
# Convert to String
# Drop the first and last character of 'amenities'
# (presumably the enclosing list brackets - confirm against the raw data)
listings['amenities_str'] = listings['amenities'].str[1:-1]

# Text columns: fill missing values with 'No Data', cast to str, remove any html tags
string_variables = ['name', 'description', 'neighborhood_overview',
                    'host_response_time', 'neighbourhood',
                    'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
                    'property_type', 'room_type', 'bathrooms_text',
                    'amenities_str']

for text_col in string_variables:
    # Fill real nulls *before* casting to str. Casting first and then
    # comparing with the text 'nan' misses other null representations
    # (None becomes 'None') and would overwrite a genuine 'nan' string.
    listings[text_col] = listings[text_col].fillna('No Data').astype(str)
    listings = remove_tags(listings, text_col)

In [None]:
# Cleaning completed: write the cleaned listings to disk without the row index
listings.to_csv('cleaned_listings.csv', index=False)

In [None]:
# Preview the cleaned frame (the rendered output is truncated to the first and last rows)
listings

Unnamed: 0,id,name,description,neighborhood_overview,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,amenities_str
1,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...","Enjoy 500 s.f. top floor in 1899 brownstone, w...",Just the right mix of urban center and local n...,2008-12-07,a few days or more,0.0,64.0,f,1.0,...,4.45,4.58,4.49,4.78,4.80,4.71,4.64,f,1,"""Hot water"", ""Iron"", ""Children\u2019s books an..."
2,5121,BlissArtsSpace!,The space HELLO EVERYONE AND THANKS FOR VISI...,No Data,2009-02-03,within an hour,100.0,100.0,f,1.0,...,4.52,4.22,4.09,4.91,4.91,4.47,4.52,f,2,"""Heating"", ""Air conditioning"", ""Long term stay..."
3,5136,"Spacious Brooklyn Duplex, Patio + Garden",We welcome you to stay in our lovely 2 br dupl...,No Data,2009-02-03,No Data,,25.0,f,1.0,...,5.00,5.00,5.00,5.00,5.00,4.50,5.00,f,1,"""Cable TV"", ""TV with standard cable"", ""Outdoor..."
4,5178,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,"Theater district, many restaurants around here.",2009-03-03,within a few hours,100.0,100.0,f,1.0,...,4.22,4.21,3.73,4.66,4.42,4.87,4.36,f,1,"""Hot water"", ""Body soap"", ""Iron"", ""Lock on bed..."
5,5203,Cozy Clean Guest Room - Family Apt,"Our best guests are seeking a safe, clean, spa...",Our neighborhood is full of restaurants and ca...,2009-02-05,No Data,,,f,1.0,...,4.91,4.83,4.82,4.97,4.95,4.94,4.92,f,1,"""Fire extinguisher"", ""Hot water"", ""Carbon mono..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38178,54146235,Micro Hostel in The Heart of Chelsea New York,Everything you want to explore is right outsid...,No Data,2017-10-20,within a day,100.0,100.0,f,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,t,4,"""Wifi"", ""Building staff"", ""Long term stays all..."
38180,54146364,Hermoso apartamento en el centro de williamsburg,Disfruta de la sencillez de este alojamiento t...,No Data,2018-12-12,within a few hours,100.0,72.0,f,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,f,2,"""Hot water"", ""Body soap"", ""Iron"", ""Window AC u..."
38181,54148518,Time Square New York -Share Room,New York!!! The trendiest city in the world. ...,The center of Manhattan! There are many uniqu...,2021-12-16,within a few hours,100.0,100.0,f,1.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,t,2,"""Hot water"", ""Ceiling fan"", ""Shower gel"", ""Ref..."
38182,54150715,Private PATIO in PRIVATE Room | 5mins to Manha...,No Data,No Data,2020-03-05,No Data,,,f,12.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,f,1,"""Fire extinguisher"", ""Hot water"", ""Carbon mono..."


In [None]:
# Final size of the cleaned listings as (rows, columns)
listings.shape

(33150, 45)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2a92d0af-cecf-4cde-96f5-c3db3a7f88f1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>