In [None]:
#importing necessary libraries 

import pandas as pd  
import numpy as np

# Dataset -> Listings detailed

In [None]:
# Load the detailed listings export from the working directory.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-dtype columns on wide CSV files.
listings_detailed = pd.read_csv('listings2.csv', low_memory=False)
listings_detailed.head()

In [None]:
listings_detailed.shape

In [None]:
# Number of columns in the raw listings table.
len(listings_detailed.columns)

In [None]:
# Print every raw column name on one line (a plain list is not truncated).
print(list(listings_detailed.columns))

In [None]:
# Flag numeric columns that carry almost no information: when the 90th
# percentile equals the minimum, at least 90% of the non-missing values are the
# same number.
numeric_cols = listings_detailed.select_dtypes(include="number")

# DataFrame.min() and DataFrame.quantile() skip missing values. The builtin
# min() and np.percentile() do not: np.percentile returns NaN as soon as a
# column contains one NaN, so such a column could never be flagged.
col_min = numeric_cols.min()
col_p90 = numeric_cols.quantile(0.9)

# Columns that are entirely NaN compare as NaN == NaN -> False and are not flagged.
low_variance = col_min.index[col_p90.eq(col_min).to_numpy()].tolist()

print(low_variance)


In [None]:
# Rationale recorded by the authors for the columns dropped below:
# - host_acceptance_rate: every value is N/A
# - country_code and country: all rows are PT
# - is_business_travel_ready: every value is 'f'
# - requires_license / license: a licence is mandatory in Lisbon, so these say
#   nothing about an individual listing or neighbourhood
# - availability_30/60/90/365: forward-looking availability
# - review_scores_* sub-scores: dropped; only 'review_scores_rating' is kept,
#   which the authors describe as a weighted average
# - cleaning_fee: treated as already included in price
# NOTE(review): the original note called neighbourhood_cleansed "too detailed",
# but that column is NOT in the list below; the raw 'neighbourhood' column is
# the one being dropped.
#
# The bare triple-quoted string that follows is an unused expression, not a
# comment: it lists the columns kept on purpose to build a flexibility scale.
'''
'instant_bookable',
 'cancellation_policy',
 'require_guest_profile_picture',
 'require_guest_phone_verification',

to buld a flexiblity scale

'''

# Columns removed from the raw table before any analysis.
columns_to_remove = ['neighbourhood',"scrape_id",'listing_url',"last_scraped","experiences_offered","notes","transit","access","interaction",'number_of_reviews_ltm',
                     "thumbnail_url","house_rules","medium_url","picture_url","xl_picture_url","host_name","host_about",
                     "host_acceptance_rate","host_url","host_thumbnail_url","host_picture_url","host_neighbourhood","host_verifications",
                     "state","market","smart_location","country",'country_code',"minimum_minimum_nights","maximum_minimum_nights",
                     "minimum_maximum_nights",'maximum_maximum_nights',"minimum_nights_avg_ntm","maximum_nights_avg_ntm","calendar_last_scraped",
                     "jurisdiction_names", "street", "calendar_updated", "has_availability", "is_location_exact", "city", "zipcode", 
                     "is_business_travel_ready", "weekly_price", "monthly_price",'maximum_nights', 'availability_30',
                     'availability_60','availability_90','availability_365','square_feet', 'latitude','longitude',
                     "host_id","host_since", "host_location", "host_response_time",'host_response_rate','host_listings_count',
                     'host_total_listings_count','host_has_profile_pic','host_identity_verified','summary','space',
                     'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication',
                     'review_scores_location','review_scores_value', 'cleaning_fee', 'security_deposit','extra_people',
                     'requires_license','license', 'calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms',
                     'calculated_host_listings_count_shared_rooms']

# drop() returns a new frame; listings_detailed itself keeps all its columns.
data = listings_detailed.drop(columns_to_remove, axis =1)

# Final Columns

In [None]:
print(data.columns.tolist())

In [None]:
# Count the columns that remain in `data`.

len(data.columns.tolist())

In [None]:
# General information about the dataframe (dtypes, non-null counts, memory).
# info() prints its summary itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
data.info()

In [None]:
data.describe()

In [None]:
### MISSING VALUES ###

# Null count per column, restricted to the columns that actually have gaps.
null_cols = data.isna().sum()
null_cols.loc[null_cols.gt(0)]

In [None]:
# last_review and reviews_per_month are expected to be empty for listings
# with number_of_reviews == 0. Pull the affected rows and look at their
# review counts to check that.
missing_review_info = data['last_review'].isna() | data['reviews_per_month'].isna()
null_review = data.loc[missing_review_info]
null_review['number_of_reviews']

# What to do with these missing values?
# last_review / reviews_per_month are null when the number of reviews is zero,
# which is consistent, so replacing those nulls with zeros is a sensible option.

In [None]:
data['neighbourhood_group_cleansed'].unique() 

# Lisbon city


In [None]:
# The analysis covers the 'Lisboa' neighbourhood group only.
in_lisbon = data['neighbourhood_group_cleansed'].eq('Lisboa')
data = data.loc[in_lisbon]

data.head()

In [None]:
#let's check the dataset shape

data.shape

#### Listings with no reviews

In [None]:
# Listings missing their first or last review date do not carry enough
# information for the quality analysis and will be excluded.
# Preview the ratings of the rows that are about to go.
no_review_dates = data['last_review'].isna() | data['first_review'].isna()
null_displ = data.loc[no_review_dates]
null_displ['review_scores_rating']


In [None]:
# Keep only the listings that have both review dates filled in.
has_both_dates = data[['last_review', 'first_review']].notna().all(axis=1)
data = data.loc[has_both_dates]
data.head()

In [None]:
data.shape

In [None]:
data['review_scores_rating']

### Final data cleaning

In [None]:
#let's check datatypes
data.dtypes

In [None]:
data['price']

In [None]:
# Convert price from text such as "$1,250.00" to float.
# regex=False makes both replacements literal. As a regular expression "$" is
# the end-of-string anchor rather than a dollar sign, and the default value of
# `regex` differs between pandas versions, so it must be stated explicitly.
price_as_number = (
    data['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# assign() returns a new frame with the column replaced in its original
# position, so nothing is written onto a filtered slice of an earlier frame
# (which is what triggers SettingWithCopyWarning).
data = data.assign(price=price_as_number)

data['price'].dtype  # float64 on success

In [None]:
# A price of zero is not a meaningful data point; list the rows that have one.
price_zero = data.loc[data['price'].eq(0)]
price_zero

In [None]:
## Remove the zero-price rows.
# Filter by condition instead of by hard-coded row labels: labels are tied to
# one particular export of the CSV, and drop() raises KeyError as soon as one of
# them is absent (for instance when this cell is run a second time).
data = data.loc[data['price'].ne(0)]

In [None]:
# Re-run the zero-price lookup to see what is left after the removal.
price_zero = data.query('price == 0')
price_zero

In [None]:
# Parse the first and last review columns into datetime values.
for review_col in ('last_review', 'first_review'):
    data[review_col] = pd.to_datetime(data[review_col])

In [None]:
# Remove the rows singled out as price outliers, identified by index label.
# NOTE(review): the rule used to pick these labels is not shown in this
# notebook; they are only valid for the export they were chosen from.
price_outlier_labels = [21642, 9172, 22572, 10845, 24314, 22929, 18812, 1181]
data = data.drop(index=price_outlier_labels)

In [None]:
# listing_duration = last_review - first_review (a timedelta at this point).
# Remove a previous copy first: the old call passed allow_duplicates=True, so
# re-running the cell silently added a second "listing_duration" column and
# data['listing_duration'] then returned a DataFrame instead of a Series.
if "listing_duration" in data.columns:
    data = data.drop(columns="listing_duration")

# Position 21 is kept from the original layout.
data.insert(21, "listing_duration", data['last_review'] - data["first_review"])

In [None]:
data['listing_duration'].unique() 

In [None]:
# Store the duration as a whole number of days instead of a timedelta.
review_span = data['last_review'] - data["first_review"]
data["listing_duration"] = review_span.dt.days

data['listing_duration'].unique()

In [None]:
data.dtypes

In [None]:
# Shorter, more intuitive names for the two location columns.
friendlier_names = {
    'neighbourhood_cleansed': 'neighbourhood',
    'neighbourhood_group_cleansed': 'city',
}
data = data.rename(columns=friendlier_names)

In [None]:
print(data.columns.tolist())

In [None]:
## Put the columns in reading order: identity and location first, then the
## listing description, capacity, review history, and finally host/booking rules.
column_order = [
    'id', 'name', 'city', 'neighbourhood', 'price',
    'description', 'neighborhood_overview',
    'property_type', 'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'bed_type',
    'amenities', 'guests_included', 'minimum_nights',
    'number_of_reviews', 'first_review', 'last_review',
    'listing_duration', 'review_scores_rating',
    'host_is_superhost', 'instant_bookable', 'cancellation_policy',
    'require_guest_profile_picture', 'require_guest_phone_verification',
    'calculated_host_listings_count', 'reviews_per_month',
]

# Selecting with a list both reorders the columns and drops any not listed.
data = data.loc[:, column_order]
data.head()

# A Neighborhood Analysis 

In [None]:
data['neighbourhood'].unique() 

In [None]:
# Number of listings per neighbourhood, largest first.
# (Open question from the authors: is this step necessary?)
per_neighbourhood = data.groupby('neighbourhood')['neighbourhood']
test_df = per_neighbourhood.count()
test_df.sort_values(ascending=False)

In [None]:
#### PRICE ANALYSIS ####
# Descriptive price statistics per neighbourhood, highest mean price first.
# The bare `.head()` and `.columns` expressions that used to sit in the middle
# of this cell were removed: only the last expression of a cell is displayed,
# so they had no effect.
neighbourhood_price = data.groupby(['neighbourhood'])['price'].describe().reset_index()
neighbourhood_price.sort_values(['mean'], ascending = False)

# Observations recorded by the authors:
# - The neighbourhood with the cheapest mean price is 'Beato'
# - The neighbourhood with the most expensive mean price is 'Parque das Nações'
# - Highest price variation in Marvila, Lumiar, Santa Maria Maior, Santa Clara and Arroios

In [None]:
# Which neighbourhoods hold the oldest listings? This hints at the areas that
# have been popular with tourists for the longest time.
duration_by_area = data.groupby('neighbourhood')['listing_duration']
neighbourhood_duration = duration_by_area.describe()
neighbourhood_duration.sort_values('max', ascending=False)

# Observations recorded by the authors:
# - Traditional Airbnb neighbourhoods: Avenidas Novas, Misericórdia, Estrela,
#   São Vicente and Santa Maria Maior
# - Most of them have listings spanning more than 9 years

In [None]:
## Which neighbourhoods get the best ratings from customers?
rating_by_area = data.groupby('neighbourhood')['review_scores_rating']
neighbourhood_reviews = rating_by_area.describe()
neighbourhood_reviews.sort_values('mean', ascending=False)

# Observations recorded by the authors:
# - 'Santa Clara' is the best rated neighbourhood
# - However, the authors noted only 17 ratings behind that mean

In [None]:
# Same rating summary, now ordered by `count`, i.e. by how many listings in the
# neighbourhood have a rating. Another view on the most visited areas.
rating_by_area = data.groupby('neighbourhood')['review_scores_rating']
neighbourhood_reviews = rating_by_area.describe()
neighbourhood_reviews.sort_values('count', ascending=False)

# Observations recorded by the authors:
# - 'Santa Maria Maior' is the most rated neighbourhood. It includes Baixa de
#   Lisboa, Cais do Sodré and Terreiro do Paço (TODO: add a picture to the slides).
# - These are very touristic areas, which supports the hypothesis that the most
#   reviewed areas are also the ones tourists tend to choose; the count is used
#   as a proxy for the level of tourism.
# - A backpacker might prefer a more "hipster" area such as 'Beato', which also
#   happens to be the cheapest.



In [None]:
# For a short weekend in Lisbon, which neighbourhood is easiest to book,
# i.e. asks for the fewest minimum nights?
min_nights_by_area = data.groupby('neighbourhood')['minimum_nights']
neighbourhood_minimum_nights = min_nights_by_area.describe()
neighbourhood_minimum_nights.sort_values('mean', ascending=True)

# Observations recorded by the authors:
# - Santa Maria Maior has, on average, the lowest minimum nights requested,
#   again pointing to an area focused on tourism and short-term visits.
# - For a long stay (e.g. 6 months) one would have to look at areas like
#   'Misericórdia', 'Santo António', 'Penha de França' and 'Olivais'.

## Value for Money

_"A utility derived from every purchase or every sum of money spent."_


.....How inspirational.....

In [None]:
# value_money = review_scores_rating / price
# Remove a previous copy first: the old call passed allow_duplicates=True, so
# re-running the cell silently added a second "value_money" column, after which
# data['value_money'] (and every groupby on it) returned a DataFrame instead of
# a Series.
if "value_money" in data.columns:
    data = data.drop(columns="value_money")

# Position 5 is kept from the original layout.
data.insert(5, "value_money", data["review_scores_rating"] / data["price"])

#### Value for money and Neighbourhood

In [None]:
# Value-for-money statistics per neighbourhood, best mean first.
neighbourhood_value_money = (
    data.groupby('neighbourhood')['value_money']
    .describe()
    .sort_values('mean', ascending=False)
)
neighbourhood_value_money

# Observations recorded by the authors:
# - Santa Clara seems to offer the best value for money.
# - Santa Clara is basically Ameixoeira, far from the tourist centre, so the
#   best value-for-money neighbourhood is in a clearly non-touristic area.
#   Worth exploring further.

In [None]:
# Best value for money among the well-represented neighbourhoods only.
# `count` here is the number of listings with a value_money score in the group.
well_represented = neighbourhood_value_money['count'].gt(200)
neighbourhood_value_money[well_represented]

# Observations recorded by the authors:
# - Above the 200 threshold, 'Areeiro' holds the best value for money.
# - Once again, not a touristic area.

In [None]:
# Raise the threshold to neighbourhoods whose `count` exceeds 1000.
heavily_represented = neighbourhood_value_money['count'].gt(1000)
neighbourhood_value_money[heavily_represented]

# Observations recorded by the authors:
# - Above the 1000 threshold, 'Arroios' holds the best value for money.
# - It is a "new", "hipster", growing area of Lisbon that includes sections
#   such as Anjos and Martim Moniz.

#### Room Type  - Price

In [None]:
# Price statistics per room type, cheapest mean first.
price_by_room = data.groupby('room_type')['price']
room_price = price_by_room.describe().reset_index()
room_price.sort_values('mean')

In [None]:
# Mean and median price for every (room type, neighbourhood) pair.
neighbourhood_rooms = data.groupby(['room_type','neighbourhood'])['price'].agg(['mean', 'median'])

# sort_values() returns a new frame. The result used to be thrown away, so the
# table displayed below was the unsorted one; keep it this time.
# 'room_type' is an index level here, which `by` accepts alongside column names.
neighbourhood_rooms = neighbourhood_rooms.sort_values(by=['room_type','mean'], ascending= True)
neighbourhood_rooms

#### Room type - Value for Money

In [None]:
# Value-for-money statistics per room type, lowest mean first.
# Note: this reuses (overwrites) the `room_price` name from the price summary.
value_by_room = data.groupby('room_type')['value_money']
room_price = value_by_room.describe().reset_index()
room_price.sort_values('mean')

In [None]:
# Mean and median value for money for every (room type, neighbourhood) pair,
# displayed sorted by room type and then by ascending mean.
value_by_room_and_area = data.groupby(['room_type', 'neighbourhood'])['value_money']
neighbourhood_rooms = value_by_room_and_area.agg(['mean', 'median'])
neighbourhood_rooms.sort_values(['room_type', 'mean'], ascending=True)

# Filter for  Personas

We believe that our three personas have different flexibility preferences when choosing their Airbnb.


### Which is the best neighbourhood for each persona?

Business: high flexibility, intermediate comfort, intermediate price, regardless of the tourist location

Backpacker: high flexibility, low comfort, low price, regardless of the tourist location

Family: moderate flexibility, high comfort, intermediate to high price, high score rating, cares about the tourist location

In [None]:
#an overall check
data.columns

### Flexibility

In [None]:
# Split the listings by cancellation policy.
flexible = data[data['cancellation_policy'] == 'flexible']  # Business / Backpacker
moderate = data[data['cancellation_policy'] == 'moderate']  # Family

# Only the last expression of a cell is displayed, so two separate len() lines
# hid the first count; return both as a tuple instead.
len(flexible), len(moderate)

In [None]:
# Among the flexible listings, keep those that can be booked instantly
# (Business / Backpacker).
bookable_now = flexible['instant_bookable'].eq('t')
instant_bookable = flexible.loc[bookable_now]
len(instant_bookable)

In [None]:
# Narrow the instantly bookable listings down to the ones with no guest
# requirements at all (Business / Backpacker).

# Step 1: no guest profile picture required.
no_picture = instant_bookable[instant_bookable['require_guest_profile_picture'] == 'f']

# Step 2: no guest phone verification required either.
the_most_flexible = no_picture[no_picture['require_guest_phone_verification'] == 'f']

# The cell used to end with len(no_phone), a name that is never defined and so
# raised NameError; the result of step 2 is `the_most_flexible`.
# Counts recorded by the authors for their export: 1688 and 1681.
len(no_picture), len(the_most_flexible)

### Comfort

In [None]:
# Filter each persona's candidate listings by room type.

# Business: entire home/apartment among the most flexible listings.
business_room_type = the_most_flexible[the_most_flexible['room_type'] == 'Entire home/apt']

# Backpacker: shared room among the most flexible listings.
backpacker_room_type = the_most_flexible[the_most_flexible['room_type'] == 'Shared room']

# Family: entire home/apartment among the moderate-cancellation listings.
family_room_type = moderate[moderate['room_type'] == 'Entire home/apt']

# Only the last expression of a cell is displayed, so the three separate len()
# lines showed just the family count; return all three as a tuple.
len(business_room_type), len(backpacker_room_type), len(family_room_type)


### Price/neighbourhood

In [None]:
# Business: mean and median price per neighbourhood, cheapest mean first.
business_neighbourhoods = (
    business_room_type
    .groupby('neighbourhood')['price']
    .agg(['mean', 'median'])
    .reset_index()
    .sort_values('mean', ascending=True)
)
business_neighbourhoods.head()

In [None]:
# Backpacker: mean and median price per neighbourhood, cheapest mean first.
backpacker_neighbourhoods = (
    backpacker_room_type
    .groupby('neighbourhood')['price']
    .agg(['mean', 'median'])
    .reset_index()
    .sort_values('mean', ascending=True)
)
backpacker_neighbourhoods.head()

In [None]:
# Family: mean and median price per neighbourhood, most expensive mean first.
family_neighbourhoods = (
    family_room_type
    .groupby('neighbourhood')['price']
    .agg(['mean', 'median'])
    .reset_index()
    .sort_values('mean', ascending=False)
)
family_neighbourhoods.head()

### Value for Money

In [None]:
# Business: mean and median value for money per neighbourhood, best mean first.
business_value_money = (
    business_room_type
    .groupby('neighbourhood')['value_money']
    .agg(['mean', 'median'])
    .reset_index()
    .sort_values('mean', ascending=False)
)
business_value_money.head()

In [None]:
# Backpacker: mean and median value for money per neighbourhood, best mean first.
backpacker_value_money = (
    backpacker_room_type
    .groupby('neighbourhood')['value_money']
    .agg(['mean', 'median'])
    .reset_index()
    .sort_values('mean', ascending=False)
)
backpacker_value_money.head()

In [None]:
# Family: mean and median value for money per neighbourhood, best mean first.
family_value_money = (
    family_room_type
    .groupby('neighbourhood')['value_money']
    .agg(['mean', 'median'])
    .reset_index()
    .sort_values('mean', ascending=False)
)
family_value_money.head()

# Categorize neighbourhoods by price

In [None]:

# Cut the price range into 5 equal-width intervals and name them from
# 'Very Low' to 'Very High'.
bins_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
neighbourhoods_bins = pd.cut(data['price'], bins=5, labels=bins_labels)

# Number of observations that fall in each category.
neighbourhoods_bins.value_counts()

In [None]:
# Quantile-based alternative: 5 bins (quintiles) holding roughly the same
# number of listings each.
bins_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
neighbourhoods_bins = pd.qcut(data['price'], q=5, labels=bins_labels)

# Number of observations that fall in each category.
neighbourhoods_bins.value_counts()

In [None]:
# Attach the quintile-based price category to every listing.
price_category = pd.qcut(data['price'], q=5, labels=bins_labels)
data['Price Categories'] = price_category
data.head()