In [None]:
#importing necessary libraries 

import pandas as pd  
import numpy as np

# Dataset -> Listings detailed

In [None]:
# Load the detailed listings export from the working directory.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids chunk-by-chunk dtype guessing on mixed-type columns.
listings_detailed = pd.read_csv('listings2.csv', low_memory=False)
# Preview the first rows.
listings_detailed.head()

In [None]:
# Dimensions of the raw table as (rows, columns).
listings_detailed.shape

In [None]:
# Number of columns in the raw table.
len(listings_detailed.columns.tolist())

In [None]:
# Print every column name of the raw table.
print(listings_detailed.columns.tolist())

In [None]:
#let's remove columns we think won't be useful
# A numeric column is flagged as "low variance" when its 90th percentile equals
# its minimum, i.e. at least 90% of its non-missing values share the smallest value.
#
# NaN-aware pandas reductions are used on purpose: the builtin min() and
# np.percentile() do not skip NaN, so a column containing a single missing
# value could never be flagged.
numeric_columns = listings_detailed.select_dtypes(include='number')
column_min = numeric_columns.min()
column_p90 = numeric_columns.quantile(0.90)

# NaN == NaN is False, so a column that is entirely missing is not flagged here.
low_variance = column_min.index[column_min.eq(column_p90)].tolist()

print(low_variance)


In [None]:
# Columns discarded before the analysis, with the reasoning for the main groups:
#   - host_acceptance_rate: every value is N/A
#   - country / country_code: always PT
#   - is_business_travel_ready: always 'f'
#   - requires_license / license: a licence is mandatory in Lisbon, so these say
#     nothing about a listing or its neighbourhood
#   - availability_30 / 60 / 90 / 365: availability in the future
#   - review_scores_* sub-scores: only review_scores_rating is kept, since it is
#     a weighted average of the others
#   - cleaning_fee: treated as included in price
#   - neighbourhood: dropped (an earlier note called neighbourhood_cleansed too
#     detailed, but neither cleansed neighbourhood column is dropped here)
#
# Kept on purpose, to build a flexibility scale later:
#   instant_bookable, cancellation_policy,
#   require_guest_profile_picture, require_guest_phone_verification

columns_to_remove = [
    'neighbourhood', 'scrape_id', 'listing_url', 'last_scraped', 'experiences_offered',
    'notes', 'transit', 'access', 'interaction', 'number_of_reviews_ltm',
    'thumbnail_url', 'house_rules', 'medium_url', 'picture_url', 'xl_picture_url',
    'host_name', 'host_about',
    'host_acceptance_rate', 'host_url', 'host_thumbnail_url', 'host_picture_url',
    'host_neighbourhood', 'host_verifications',
    'state', 'market', 'smart_location', 'country', 'country_code',
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_last_scraped',
    'jurisdiction_names', 'street', 'calendar_updated', 'has_availability',
    'is_location_exact', 'city', 'zipcode',
    'is_business_travel_ready', 'weekly_price', 'monthly_price', 'maximum_nights',
    'availability_30',
    'availability_60', 'availability_90', 'availability_365', 'square_feet',
    'latitude', 'longitude',
    'host_id', 'host_since', 'host_location', 'host_response_time',
    'host_response_rate', 'host_listings_count',
    'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified',
    'summary', 'space',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location', 'review_scores_value', 'cleaning_fee',
    'security_deposit', 'extra_people',
    'requires_license', 'license',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
]

# Working table for the rest of the notebook; the raw frame is left untouched.
data = listings_detailed.drop(columns=columns_to_remove)

# Final Columns

In [None]:
# Print the names of the columns that remain in `data`.
print(data.columns.tolist())

In [None]:
# Count the columns that remain in `data`.

len(data.columns.tolist())

In [None]:
# General information about the dataframe (dtypes, non-null counts, memory).
# info() writes its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
data.info()

In [None]:
# Summary statistics for the columns describe() covers by default.
data.describe()

In [None]:
### MISSING VALUES ###

# Missing-value count per column, showing only the columns that have any.
null_cols = data.isna().sum()
null_cols.loc[null_cols.gt(0)]

In [None]:
# last_review and reviews_per_month are expected to be missing when a listing
# has no reviews (number_of_reviews == 0). Look at number_of_reviews for the
# rows where either of them is missing to check that.

missing_review_info = data['last_review'].isna() | data['reviews_per_month'].isna()
null_review = data.loc[missing_review_info]
null_review['number_of_reviews']

# What to do with the missing values?
# last_review and reviews_per_month are null when the number of reviews is zero,
# which makes total sense. Therefore it makes sense to replace the null values
# with zeros.

In [None]:
# Distinct values of neighbourhood_group_cleansed (used below to keep 'Lisboa' only).
data['neighbourhood_group_cleansed'].unique() 

# Lisbon city only

In [None]:
#####  WE WANT TO ANALYSE DATA IN LISBON ONLY!! #######
# Keep only the rows whose neighbourhood group is exactly 'Lisboa'.
in_lisbon = data['neighbourhood_group_cleansed'].eq('Lisboa')
data = data[in_lisbon]

data.head()

In [None]:
# (rows, columns) after restricting to Lisboa.
data.shape

In [None]:
# Listings with no first or last review date do not carry enough information
# for the quality analysis; show their review_scores_rating.
no_review_dates = data['last_review'].isna() | data['first_review'].isna()
null_displ = data.loc[no_review_dates]
null_displ['review_scores_rating']


In [None]:
# Drop every listing that lacks a first or a last review date.
data = data.dropna(subset=['last_review','first_review'])
data

In [None]:
# (rows, columns) after dropping the listings without review dates.
data.shape

In [None]:
# Inspect the overall rating column.
data['review_scores_rating']

In [None]:
#let's check datatypes
data.dtypes

In [None]:
# Inspect the raw price column before it is converted (see the next cell).
data['price']

In [None]:
# Change price from a currency string (with '$' and ',' characters) to float.
#
# regex=False is essential: when the pattern is treated as a regular expression
# (the default before pandas 2.0), '$' is the end-of-string anchor, so the
# dollar sign is never removed and the float conversion fails.
#
# The numeric-dtype guard keeps the cell re-runnable (.str is not available once
# the column is float), and assign() builds a new frame instead of writing into
# a filtered slice, which avoids SettingWithCopyWarning.
if not pd.api.types.is_numeric_dtype(data['price']):
    price_as_float = (
        data['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    data = data.assign(price=price_as_float)

data['price'].dtypes  # float64 once the conversion has succeeded

In [None]:
# A listing with a price of zero does not make sense; list those rows.
is_free = data['price'].eq(0)
price_zero = data.loc[is_free]
price_zero

In [None]:
## remove those rows
# Select by condition instead of by hard-coded index labels: the labels only
# match one particular snapshot of the CSV, and dropping them raised KeyError
# whenever the cell was re-run or the input file changed.

data = data[data['price'] != 0]

In [None]:
# Re-check: this should now be empty.
price_zero = data.loc[data['price'] == 0]
price_zero

In [None]:
# Convert the first and last review columns to datetime.

for review_col in ('last_review', 'first_review'):
    data[review_col] = pd.to_datetime(data[review_col])

In [None]:
# listing_duration = (last_review - first_review) -> value to be evaluated in days
# (still a Timedelta here; it is turned into a day count in a later cell).

duration = data['last_review'] - data['first_review']

# The original call passed allow_duplicates=True, so every re-run of this cell
# silently added another column with the same name. Remove any previous copy
# first and let insert() reject duplicates.
data = data.drop(columns='listing_duration', errors='ignore')
data.insert(21, 'listing_duration', duration)

In [None]:
# Distinct listing_duration values.
data['listing_duration'].unique() 

In [None]:
# Express the duration as a number of days.
review_span = data['last_review'] - data['first_review']
data['listing_duration'] = review_span.dt.days
data['listing_duration'].unique()

In [None]:
# Column dtypes after the conversions above.
data.dtypes

# TO DO:

In [None]:
# We'll use 'instant_bookable', 'cancellation_policy',
# 'require_guest_profile_picture' and 'require_guest_phone_verification'
# to create a single variable for analysing the flexibility of a listing.

# Distinct cancellation policies present in the data.
data['cancellation_policy'].unique()

In [None]:
# Give the cleansed location columns shorter names.
shorter_names = {
    'neighbourhood_cleansed': 'neighbourhood',
    'neighbourhood_group_cleansed': 'city',
}
data = data.rename(columns=shorter_names)

In [None]:
# Print the column names after the rename.
print(data.columns.tolist())

In [None]:
## Order columns: identification and location first, then the listing
## description, review fields, booking-policy fields and host counts.

column_order = [
    'id', 'name', 'city', 'neighbourhood', 'price',
    'description', 'neighborhood_overview',
    'property_type', 'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'bed_type',
    'amenities', 'guests_included', 'minimum_nights',
    'number_of_reviews', 'first_review', 'last_review',
    'listing_duration', 'review_scores_rating', 'host_is_superhost',
    'instant_bookable', 'cancellation_policy',
    'require_guest_profile_picture', 'require_guest_phone_verification',
    'calculated_host_listings_count',
    'reviews_per_month',
]

# Selecting with the list both reorders the columns and drops any not listed.
data = data.loc[:, column_order]
data.head()

In [None]:
#create a flexibility variable with 'require_guest_profile_picture', 'require_guest_phone_verification','instant_bookable'

### labels = ['Low Flexibility', 'Moderate', 'High Flexibility']


# A Neighborhood Analysis 

In [None]:
# Distinct neighbourhoods present in the data.
data['neighbourhood'].unique() 

In [None]:
# A bare groupby only displays the repr of a lazy GroupBy object, which tells
# the reader nothing; show the number of listings per neighbourhood instead.
data.groupby(['neighbourhood']).size()

In [None]:
#### PRICE ANALYSIS ####
# Price statistics per neighbourhood, largest standard deviation first.
price_by_neighbourhood = data.groupby(['neighbourhood'])['price']
neighbourhood_price = price_by_neighbourhood.describe().reset_index()
neighbourhood_price.sort_values(by='std', ascending=False)
# Cheapest on average: Beato
# Most expensive on average: Parque das Nações
# Largest price variation: Marvila, Lumiar, Santa Maria Maior, Santa Clara and Arroios

In [None]:
# Which neighbourhoods have the oldest listings? This can hint at the most
# established tourist areas.
duration_by_neighbourhood = data.groupby(['neighbourhood'])['listing_duration']
neighbourhood_duration = duration_by_neighbourhood.describe()
neighbourhood_duration.sort_values(by='max', ascending=False)
# Traditional Airbnb neighbourhoods = Avenidas Novas, Misericordia, Estrela,
# São Vicente and Santa Maria Maior: listed for more than 9 years

In [None]:
## Which neighbourhoods get the best ratings from customers?
rating_by_neighbourhood = data.groupby(['neighbourhood'])['review_scores_rating']
neighbourhood_reviews = rating_by_neighbourhood.describe()
neighbourhood_reviews.sort_values(by='mean', ascending=False)

In [None]:
## For a short weekend in Lisbon, which neighbourhood is easiest to book
## (i.e. asks for the fewest minimum nights)?

min_nights_by_neighbourhood = data.groupby(['neighbourhood'])['minimum_nights']
neighbourhood_minimum_nights = min_nights_by_neighbourhood.describe()
neighbourhood_minimum_nights.sort_values(by='mean')

In [None]:
# Summary statistics of price across all remaining listings.
data['price'].describe()

In [None]:
#### VALUE FOR MONEY #######

# value_money = review_scores_rating / price
value_money = data['review_scores_rating'] / data['price']

# The original call passed allow_duplicates=True, so every re-run of this cell
# silently added another 'value_money' column. Remove any previous copy first
# and let insert() reject duplicates.
data = data.drop(columns='value_money', errors='ignore')
data.insert(5, 'value_money', value_money)

In [None]:

# value_money statistics per neighbourhood, best mean first.
value_by_neighbourhood = data.groupby(['neighbourhood'])['value_money']
neighbourhood_value_money = value_by_neighbourhood.describe()
neighbourhood_value_money.sort_values(by='mean', ascending=False)

Santa Clara seems to offer the best value for money.

In [None]:
# Price statistics per room type, cheapest mean first.
price_by_room_type = data.groupby(['room_type'])['price']
room_price = price_by_room_type.describe().reset_index()
room_price.sort_values(by='mean')

In [None]:
# Mean and median price for each (room type, neighbourhood) pair,
# ordered by room type and then by mean price.
price_by_room_and_area = data.groupby(['room_type', 'neighbourhood'])['price']
neighbourhood_rooms = price_by_room_and_area.agg(['mean', 'median'])
neighbourhood_rooms.sort_values(by=['room_type', 'mean'])

In [None]:
### FLEXIBILITY ###
# Mean and median price for each (cancellation policy, neighbourhood) pair,
# ordered by policy and then by mean price.
price_by_policy_and_area = data.groupby(['cancellation_policy', 'neighbourhood'])['price']
flexibility = price_by_policy_and_area.agg(['mean', 'median'])
flexibility.sort_values(by=['cancellation_policy', 'mean'])


In [None]:

############### Which is the best neighbourhood for each persona ########################
# Business: high flexibility, intermediate comfort, intermediate price
# Backpacker: high flexibility, low comfort, low price
# Family: low flexibility, high comfort, intermediate to high price, high score rating

# Filter apartments for each persona
# 'instant_bookable' == False
# 'cancellation_policy' != flexible
# 'require_guest_profile_picture' = True
# 'require_guest_phone_verification' = True


# condition = data(['instant_bookable'] == False)
# condition2 = data(['cancellation_policy'] != 'flexible')




In [None]:
# Drop the rows that do not match; for the flexible ones, apply the price filter?

In [None]:
# Categorize neighbourhoods by price
# Cheap, Intermediate, Expensive, Very Expensive