In [1]:
import pandas as pd 
import pandas_dq as dq

In [2]:
# Load the raw listings export (comma-separated) into the working frame.
df = pd.read_csv('listings.csv', sep=',')

# Accuracy

In [83]:
# Flag every row that repeats an earlier row across all columns,
# then show the repeats (the first occurrence is not flagged).
is_repeated_row = df.duplicated()
duplicateRowsDF = df[is_repeated_row]
print("Duplicate Rows except first occurrence based on all columns are :", duplicateRowsDF, sep="\n")


Duplicate Rows except first occurrence based on all columns are :
Empty DataFrame
Columns: [id, listing_url, host_id, host_name, host_url, scrape_id, last_scraped, source, name, description, neighborhood_overview, host_since, host_location, host_about, host_response_time, host_response_rate, host_acceptance_rate, host_is_superhost, host_neighbourhood, host_listings_count, host_total_listings_count, host_verifications, host_has_profile_pic, host_identity_verified, neighbourhood, neighbourhood_cleansed, neighbourhood_group_cleansed, latitude, longitude, property_type, room_type, accommodates, bathrooms, bathrooms_text, bedrooms, beds, amenities, price, minimum_nights, maximum_nights, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, minimum_nights_avg_ntm, maximum_nights_avg_ntm, calendar_updated, has_availability, availability_30, availability_60, availability_90, availability_365, calendar_last_scraped, number_of_reviews, number_of_reviews_

In [84]:
# Flag rows whose 'id' already appeared in an earlier row,
# then show them (the first occurrence of each id is not flagged).
is_repeated_id = df.duplicated(subset=['id'])
duplicateRowsDF = df[is_repeated_id]
print("Duplicate Rows except first occurrence based on ID column are :", duplicateRowsDF, sep="\n")

Duplicate Rows except first occurrence based on ID column are :
Empty DataFrame
Columns: [id, listing_url, host_id, host_name, host_url, scrape_id, last_scraped, source, name, description, neighborhood_overview, host_since, host_location, host_about, host_response_time, host_response_rate, host_acceptance_rate, host_is_superhost, host_neighbourhood, host_listings_count, host_total_listings_count, host_verifications, host_has_profile_pic, host_identity_verified, neighbourhood, neighbourhood_cleansed, neighbourhood_group_cleansed, latitude, longitude, property_type, room_type, accommodates, bathrooms, bathrooms_text, bedrooms, beds, amenities, price, minimum_nights, maximum_nights, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, minimum_nights_avg_ntm, maximum_nights_avg_ntm, calendar_updated, has_availability, availability_30, availability_60, availability_90, availability_365, calendar_last_scraped, number_of_reviews, number_of_reviews_lt

Use pattern matching to check if dates are formatted correctly

In [44]:
import re

# Columns that are expected to hold dates written as YYYY-MM-DD.
date_columns = [
    'host_since',
    'last_review',
    'first_review',
    'last_scraped',
    'calendar_last_scraped',
    'calendar_updated',
]

# Collect the values of all date columns, column after column.
# Missing values are skipped: NaN is a float and would make the regex call
# raise TypeError. Nulls are dealt with in the Completeness section.
all_texts = [
    value
    for column in date_columns
    for value in df[column].dropna().to_list()
]

# Regular expression for the only accepted layout: YYYY-MM-DD
date_pattern = r'\d{4}-\d{2}-\d{2}'

# Report every value that is not exactly a YYYY-MM-DD date.
for dat in all_texts:
    # fullmatch requires the whole value to be a date (search would also accept
    # text that merely contains one); str() guards against non-string values.
    if not re.fullmatch(date_pattern, str(dat)):
        print('No date found in: ', dat)

## Outliers

Delete rows whose values are extreme outliers: more than 1.5 times the 5th–95th percentile range below the 5th percentile or above the 95th percentile.

In [45]:
# Outlier detection for minimum_nights.
# Q1/Q3 here are the 5th and 95th percentiles (not quartiles); IQR is the
# spread between them. A value is an outlier when it lies more than
# 1.5 * IQR outside that band.
numerical_columns = df[['minimum_nights']]
Q1 = numerical_columns.quantile(0.05)
Q3 = numerical_columns.quantile(0.95)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((numerical_columns < lower_fence) | (numerical_columns > upper_fence)).any(axis=1)

# show the offending values, then keep only the remaining rows
outliers = df[is_outlier]
print(outliers['minimum_nights'])
df = df[~is_outlier]

1617     1000
2356     1000
10590     999
26118    1000
29605    1000
29950    1000
32273     999
33535     999
34027     999
Name: minimum_nights, dtype: int64


In [46]:
# number_of_reviews_l30d: flag values below 0 or above 30
# (a fixed range instead of the percentile fences used for minimum_nights).
numerical_columns = df[['number_of_reviews_l30d']]
is_outlier = ((numerical_columns < 0) | (numerical_columns > 30)).any(axis=1)

# show the offending listings, then keep only the remaining rows
outliers = df[is_outlier]
print(outliers['listing_url'],outliers['number_of_reviews_l30d'])
df = df[~is_outlier]

26815    https://www.airbnb.com/rooms/35145338
30682    https://www.airbnb.com/rooms/40194697
31423    https://www.airbnb.com/rooms/41020735
Name: listing_url, dtype: object 26815    69
30682    43
31423    34
Name: number_of_reviews_l30d, dtype: int64


In [47]:
# number_of_reviews_ltm: flag values below 0 or above 365.
numerical_columns = df[['number_of_reviews_ltm']]
is_outlier = ((numerical_columns < 0) | (numerical_columns > 365)).any(axis=1)

# show the offending listings, then keep only the remaining rows
outliers = df[is_outlier]
print(outliers['listing_url'],outliers['number_of_reviews_ltm'])
df = df[~is_outlier]


14711    https://www.airbnb.com/rooms/17222007
27792    https://www.airbnb.com/rooms/36063785
29684    https://www.airbnb.com/rooms/38891995
29727    https://www.airbnb.com/rooms/38899977
Name: listing_url, dtype: object 14711    852
27792    367
29684    503
29727    531
Name: number_of_reviews_ltm, dtype: int64


#### Convert price to numeric values (strip the dollar sign and thousands separators), then remove outliers based on quantile

In [48]:
# Strip the currency formatting ('$' and thousands separators) from price and
# convert the column to float.
# The guard makes the cell safe to re-run: once price is numeric the .str
# accessor is no longer available and the stripping step must be skipped.
if not pd.api.types.is_numeric_dtype(df['price']):
    # regex=False: '$' is a regex anchor, and the default of `regex` differs
    # between pandas versions, so request literal replacement explicitly.
    df['price'] = (
        df['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
df['price'] = df['price'].astype(float)

#### Remove top 0.05% priced listings 
* 26 listings

In [49]:
# Flag listings priced above the 99.95th percentile of price.
numerical_columns = df[['price']]
Q3 = numerical_columns.quantile(0.9995)
is_outlier = (numerical_columns > Q3).any(axis=1)

# show the offending prices, then keep only the remaining rows
outliers = df[is_outlier]
print(outliers['price'])
df = df[~is_outlier]

7272     30000.0
7563      9347.0
14765    10000.0
23264    12000.0
28841    10857.0
31891    12000.0
33499     9336.0
34035    10000.0
36492     9999.0
36493     9999.0
36496     9999.0
36499     9999.0
36502     9999.0
36504     9999.0
36506     9999.0
36508     9999.0
41117    11600.0
41573    10000.0
42509    11600.0
43083    11600.0
46986    61119.0
49014    21564.0
54103     9999.0
55312    63594.0
56231    15000.0
61702     9999.0
Name: price, dtype: float64


# Completeness
#### Check for null values

### 2 problematic rows found:
* one with id null and many more fields null
* one with multiple columns filled with null values
### deleted those

In [85]:
# Show, then drop, the rows that have no id.
missing_id = df['id'].isnull()
null_rows = df[missing_id]
print(null_rows)
df = df[~missing_id]

# Show, then drop, the rows that have no host response time.
missing_response_time = df['host_response_time'].isnull()
null_rows = df[missing_response_time]
print(null_rows)
df = df[~missing_response_time]


       id    listing_url   host_id host_name host_url scrape_id last_scraped  \
13080  NaN  within an hour   100%     100%       t      Nation       1.0       

      source                name               description  \
13080   1.0   ['email', 'phone', 'work_email']       t       

      neighborhood_overview           host_since          host_location  \
13080            t           Paris, Île-de-France, France     Reuilly     

      host_about host_response_time host_response_rate host_acceptance_rate  \
13080   Reuilly       48.84606            2.4035        Entire rental unit    

      host_is_superhost host_neighbourhood  host_listings_count  \
13080   Entire home/apt          2                  0.0           

      host_total_listings_count host_verifications host_has_profile_pic  \
13080           1 bath                  1.0                 2.0           

                                                                                                                      

In [14]:
# Collect the columns that contain at least one null value and report
# whether 'description' and 'price' are among them.
null_columns = df.columns[df.isnull().any()]
description_has_nulls = 'description' in null_columns
price_has_nulls = 'price' in null_columns
print(description_has_nulls, price_has_nulls)

True False


In [37]:
# Free-text columns: replace missing values with an explanatory placeholder.
text_placeholders = {
    'host_about': 'The host has not provided this information yet.',
    'description': 'The host has not provided a description yet.',
    'neighborhood_overview': 'The host has not provided a neighbourhood overview yet.',
}
for column, placeholder in text_placeholders.items():
    df[column] = df[column].fillna(placeholder)

In [38]:
# Fill the remaining missing values, grouped by fill strategy.
# Columns within one group are independent of each other; the only ordering
# constraint is that host_since (group 1) is filled before it is used as the
# fallback for first_review (group 3).

# Placeholder used for every missing date.
# NOTE(review): presumably the scrape date of this dataset — confirm.
FILL_DATE = '2023-06-06'

# 1) Constant placeholders.
constant_fills = {
    'host_name': '',
    'host_since': FILL_DATE,
    'host_is_superhost': 'f',
    'host_thumbnail_url': '',
    'host_picture_url': '',
    'host_verifications': '[]',
    'host_has_profile_pic': 'f',
    'host_identity_verified': 'f',
    'bathrooms': 0,
    'bedrooms': 0,
    'beds': 0,
    'bathrooms_text': '0 baths',
    'minimum_minimum_nights': 0,
    # all missing maximum-nights style columns are set to 1000
    'maximum_minimum_nights': 1000,
    'minimum_maximum_nights': 1000,
    'maximum_maximum_nights': 1000,
    # avg ntm filled with 3, per the original author's reading of Inside Airbnb's assumptions
    # NOTE(review): 3 looks low for maximum_nights_avg_ntm next to the 1000 used above — confirm.
    'minimum_nights_avg_ntm': 3,
    'maximum_nights_avg_ntm': 3,
    'calendar_updated': FILL_DATE,
    'last_review': FILL_DATE,
    # a missing license is recorded as 'f' (no license)
    'license': 'f',
    'reviews_per_month': 0,
}
for column, value in constant_fills.items():
    df[column] = df[column].fillna(value)

# 2) Most frequent value (mode) of the column itself.
for column in ['host_response_time', 'host_response_rate', 'host_acceptance_rate']:
    modes = df[column].mode()
    # mode() is empty when the whole column is null; leave the column untouched then
    if not modes.empty:
        df[column] = df[column].fillna(modes.iloc[0])

# 3) Value of another column in the same row.
fallback_columns = {
    # host location / neighbourhood fall back to the listing's neighbourhood
    'host_location': 'neighbourhood_cleansed',
    'host_neighbourhood': 'neighbourhood_cleansed',
    'host_listings_count': 'calculated_host_listings_count',
    'host_total_listings_count': 'calculated_host_listings_count',
    'neighbourhood': 'neighbourhood_cleansed',
    'neighbourhood_group_cleansed': 'neighbourhood_cleansed',
    # a missing first review date falls back to the (already filled) host_since date
    'first_review': 'host_since',
}
for column, source in fallback_columns.items():
    df[column] = df[column].fillna(df[source])

# 4) Mean of the column itself, for all review score columns.
review_score_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
for column in review_score_columns:
    df[column] = df[column].fillna(df[column].mean())


Further check for null values after first cleaning

In [39]:
# Per-column null counts, restricted to the columns that still contain nulls.
null_columns = df.columns[df.isna().any()]
remaining_null_counts = df[null_columns].isna().sum()
print(remaining_null_counts)

picture_url    1
dtype: int64


In [40]:
# Replace the missing picture_url value(s) with an empty string.
picture_url = df['picture_url']
df['picture_url'] = picture_url.where(picture_url.notna(), '')

In [41]:
# Count the missing values in every column (stored, not displayed).
missing_values_count = df.isna().sum()

## Changing the ID columns:
### Create the new id from the first 3 letters of the neighbourhood plus 10 random digits

In [50]:
import random
import string

def randomize_id(length=10):
    """Return a string of random decimal digits (used for listing ids).

    Args:
        length: number of digits to generate. Defaults to 10, the length
            used for listing ids.

    Returns:
        A str made of exactly ``length`` characters, each in '0'-'9'.

    Raises:
        ValueError: if ``length`` is negative.
    """
    if length < 0:
        raise ValueError('length must be non-negative')
    # Digits come from the `random` module: results are only reproducible if
    # the caller seeds it, and uniqueness is not guaranteed.
    return ''.join(random.choice(string.digits) for _ in range(length))

def randomize_id_8():
    """Return a string of 8 random decimal digits (used for host ids)."""
    digits = []
    for _ in range(8):
        digits.append(random.choice(string.digits))
    return ''.join(digits)

# Example usage: generate one random id and print it.
# NOTE(review): the variable name says host id, but randomize_id() produces the
# 10-digit form; randomize_id_8() is the 8-digit host-id generator.
anonymized_host_id = randomize_id()
print(anonymized_host_id)

9076056426


In [51]:
def get_neighbourhood_cleansed(neighbourhood_cleansed):
    """Return the first three characters of a neighbourhood name in uppercase.

    Names shorter than three characters are returned whole (uppercased).
    Accented characters are kept as they are.
    """
    prefix = neighbourhood_cleansed[0:3]
    return prefix.upper()

# Build the id prefix: the first 3 letters of neighbourhood_cleansed in
# uppercase, with the accented capitals É and Ô replaced by E and O.
neighbourhood_prefix = df['neighbourhood_cleansed'].apply(get_neighbourhood_cleansed)
neighbourhood_prefix = neighbourhood_prefix.str.replace('É', 'E')
neighbourhood_prefix = neighbourhood_prefix.str.replace('Ô', 'O')
df['anonymized_id'] = neighbourhood_prefix
df['anonymized_id'].head()

0    ENT
1    PAN
2    HOT
3    BUT
4    HOT
Name: anonymized_id, dtype: object

In [52]:
# Append 10 random digits to every prefix in anonymized_id.
# Note: running this cell again appends another 10 digits to each value.
def _with_random_digits(prefix):
    return prefix + randomize_id()

df['anonymized_id'] = df['anonymized_id'].map(_with_random_digits)
df['anonymized_id'].head()

0    ENT9911468079
1    PAN9295749920
2    HOT4153079899
3    BUT9265605214
4    HOT8992863305
Name: anonymized_id, dtype: object

In [53]:
# Check the new column for duplicate values: show every row whose
# anonymized_id already appeared in an earlier row.
is_repeated_new_id = df.duplicated(subset=['anonymized_id'])
duplicateRowsDF = df[is_repeated_new_id]
print(duplicateRowsDF)

Empty DataFrame
Columns: [id, listing_url, scrape_id, last_scraped, source, name, description, neighborhood_overview, picture_url, host_id, host_url, host_name, host_since, host_location, host_about, host_response_time, host_response_rate, host_acceptance_rate, host_is_superhost, host_thumbnail_url, host_picture_url, host_neighbourhood, host_listings_count, host_total_listings_count, host_verifications, host_has_profile_pic, host_identity_verified, neighbourhood, neighbourhood_cleansed, neighbourhood_group_cleansed, latitude, longitude, property_type, room_type, accommodates, bathrooms, bathrooms_text, bedrooms, beds, amenities, price, minimum_nights, maximum_nights, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, minimum_nights_avg_ntm, maximum_nights_avg_ntm, calendar_updated, has_availability, availability_30, availability_60, availability_90, availability_365, calendar_last_scraped, number_of_reviews, number_of_reviews_ltm, number_of_

In [54]:
# Build a two-column frame pairing each original id with its anonymized id.
id_map_columns = ['id', 'anonymized_id']
df_list_map = df[id_map_columns]
df_list_map.head()

Unnamed: 0,id,anonymized_id
0,153674,ENT9911468079
1,33114,PAN9295749920
2,5396,HOT4153079899
3,154292,BUT9265605214
4,7397,HOT8992863305


In [55]:
# Rename the columns to original_id / new_id and write the mapping to CSV.
# NOTE(review): this file links the anonymized ids back to the original ones —
# presumably it should not be shared together with the anonymized data.
map_column_names = {'id': 'original_id', 'anonymized_id': 'new_id'}
df_list_map = df_list_map.rename(columns=map_column_names)
df_list_map.to_csv('listings_map.csv', index=False)

In [56]:
# remove the original id column from df (in place)
df.drop(columns='id', inplace=True)

In [57]:
# the anonymized id takes over the name "id"
df = df.rename(columns=dict(anonymized_id='id'))
# move id to the front, keeping every other column in its current order
df = df[['id'] + df.columns[df.columns != 'id'].tolist()]
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,ENT9911468079,https://www.airbnb.com/rooms/153674,20230606220143,2023-06-07,city scrape,Rental unit in Paris · ★4.44 · 1 bedroom · 3 beds · 1 bath,"Ideally located in the heart of Paris, in a lively area popular with Parisians, this nice and modern flat can accommodate up to 4 people.<br /><br /><b>The space</b><br />Close to shops, transports and animations, this flat is perfect for a stay in the capital. <br /> <br />This beautiful and cosy flat of 29 sq. m is located on the 2nd floor (without elevator) and is composed of : <br />- a nice and pleasant living room with a comfortable sofa-bed, <br />- a bedroom with two simple beds, which can be set up as a double bed (thanks to specify when booking), <br />- a very well-equipped kitchen (coffee machine, kettle, toaster, oven, hotplates, dishwasher, fridge) <br />- a bathroom with shower, <br />- hoover, hair dryer, ironing facilities, baby bed, <br />- large closets, <br />- Wi-Fi, TV and fan. <br />→ Hotel quality cleaning is carried out before your arrival and after your departure. <br />→ Our guests will be provided with all the essentials for comfort and hygiene such as...","Staying in the 10th arrondissement, between the Canal de l'Ourcq and Canal Saint-Martin, you will be in the heart of one of the capital's liveliest neighbourhoods. Numerous designer boutiques, restaurants, bars, cafés, entertainment... are all within walking distance. You can also stroll along the Canal Saint-Martin, one of the favourite spots of Parisians, especially on sunny days, or walk to the Marais, the Montorgueil district, Montmartre or even Notre-Dame. You will only need to take the metro to discover the historic centre of Paris, the Père-Lachaise cemetery, the Place de la Bastille, but also the Champs-Elysées, the Eiffel Tower and all the great monuments that make Paris the most beautiful city in the world. 
Finally, many restaurants and terraces are close to your flat, we recommend you La Rotonde Stalingrad, Le Point Ephémère or La Guincheuse Restaurant.",https://a0.muscache.com/pictures/prohost-api/Hosting-153674/original/b899a455-c091-4e55-bff6-03aa2446d7eb.jpeg,739021,https://www.airbnb.com/users/show/739021,Catherine,2011-06-24,"Paris, France","Après des années à voyager pour mon travail dans des hôtels impersonnels, je suis sensible au fait de profiter d'un appartement, même le temps d'un court séjour. J'apprécie de pouvoir me faire à manger plutôt que d'aller chaque soir au restaurant.\n\nA Paris, je loue un petit deux pièces rue Chaudron dans le 10e.. Cet appartement me ressemble et j'espère que chacun s'y sentira comme chez lui.\n\nI used to travel a lot for my work and would have appreciated being in a flat instead of being in an hotel. I am happy if I can cook instead of going to restaurant everyday.\n\nI rent one flat which is located in the 10th arrondissement of Paris. I have chosen it because of the courtyard and also because it is close to my own flat (it is very practical when my family or some friends come to visit me). 
I hope that everyone will feel at home when renting it.",within an hour,100%,100%,f,https://a0.muscache.com/im/pictures/user/a70251f6-4a1d-4d24-a8f5-837479acd0ad.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/a70251f6-4a1d-4d24-a8f5-837479acd0ad.jpg?aki_policy=profile_x_medium,Gare du Nord - Gare de I'Est,1.0,3.0,"['email', 'phone']",t,t,"Paris, Île-de-France, France",Entrepôt,Entrepôt,48.88303,2.366308,Entire rental unit,Entire home/apt,4,0.0,1 bath,1.0,3.0,"[""Essentials"", ""Hot water kettle"", ""Refrigerator"", ""Radiant heating"", ""Toaster"", ""Smoke alarm"", ""Cooking basics"", ""Coffee maker"", ""Crib"", ""Hair dryer"", ""Dishes and silverware"", ""Kitchen"", ""TV"", ""Dishwasher"", ""Oven"", ""Bed linens"", ""Hangers"", ""Shampoo"", ""Iron"", ""Wifi"", ""Hot water"", ""Freezer""]",106.0,2,1125,2.0,2.0,30.0,1125.0,2.0,35.0,2023-06-06,t,0,16,34,301,2023-06-07,252,19,0,2011-07-02,2023-05-02,4.44,4.68,4.47,4.68,4.71,4.42,4.47,7511000063553,t,1,1,0,0,1.73
1,PAN9295749920,https://www.airbnb.com/rooms/33114,20230606220143,2023-06-08,previous scrape,Rental unit in Paris · ★4.60 · 1 bedroom · 1 bed · 1 bath,"A beautifully restored 1-bedroom, 1-bath apartment in a French Haussmann building, with a full kitchen near the historic Latin Quarter of Paris.<br /><br /><b>The space</b><br />We decorated this apartment for ourselves to live in, and we are happy to let travelers use it while we are living abroad for a couple of years!<br /><br />It is a beautifully restored 1-bedroom, 1-bath apartment in a French Haussmann building, with a full kitchen near the historic Latin Quarter of Paris. Near pedestrian street Rue Mouffetard, outdoor markets, Botanical Gardens, and the Mosque. 2 minute walk to metro (line 5 - St. Marcel). 20 minute walk to Islands and historic Latin Quarter. In quiet building with quiet residents, on a private courtyard. We ask that renters please respect that this is a building with permanent residents, and to be quiet and respectful. There is a queen-size bed. Sleeps 3 with the couch (not a sofa-bed - not meant to be a permanent bed). The apartment is fully furnished, wi...","The Latin Quarter is one of the oldest districts in Paris, and has (among its many features) old roman baths and ruins. There are many cobble-streets and great dining nearby, all perfect for one discovering Paris. Our favorite is Rue Mouffetard, which is about a 10 minute walk, and it has tons of little boutiques, markets, cheese and wine shops, restaurants, and other fun things to do.",https://a0.muscache.com/pictures/238829/9a7d51ed_original.jpg,143558,https://www.airbnb.com/users/show/143558,Axel,2010-06-13,"Salt Lake City, UT","I have lived in 3 countries and traveled to 20+ others. I love to discover new destinations around the world for nature and hikes, and like to simply meet people globally, because I love learning about different cultures. 
\r\n\r\n",a few days or more,33%,0%,f,https://a0.muscache.com/im/pictures/user/62319130-52a3-4b1f-862e-0cfb4426479f.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/62319130-52a3-4b1f-862e-0cfb4426479f.jpg?aki_policy=profile_x_medium,Panthéon,2.0,2.0,"['email', 'phone', 'work_email']",t,t,"Paris, Île-de-France, France",Panthéon,Panthéon,48.83936,2.35849,Entire rental unit,Entire home/apt,2,0.0,1 bath,1.0,1.0,"[""TV with standard cable"", ""Essentials"", ""Fire extinguisher"", ""Iron"", ""Hair dryer"", ""Dishes and silverware"", ""Paid parking off premises"", ""Wifi"", ""Long term stays allowed"", ""Kitchen"", ""Cooking basics"", ""Hot water"", ""Hangers"", ""First aid kit"", ""Heating""]",88.0,100,365,100.0,100.0,365.0,365.0,100.0,365.0,2023-06-06,t,0,0,0,66,2023-06-08,70,0,0,2011-09-16,2019-05-20,4.6,4.74,4.33,4.83,4.83,4.83,4.74,7510501191068,f,1,1,0,0,0.49
2,HOT4153079899,https://www.airbnb.com/rooms/5396,20230606220143,2023-06-08,previous scrape,Rental unit in Paris · ★4.55 · Studio · 1 bed · 1 bath,"Cozy, well-appointed and graciously designed studio apartment that mixes old and new aesthetics at the very heart of Paris, on the famous Isle St Louis that is a stone's throw from Le Marais, Quartier Latin, Notre Dame and Le Louvre.<br /><br /><b>The space</b><br />Small, well appointed studio apartment at the very heart of Paris, on the famous Isle St Louis.<br /><br />We are looking for quiet persons or couples to rent our fully equipped studio (about 30 square meters or 300 square feet) in central Paris on the famous Isle St. Louis.<br /><br />No smoking and parties are forbidden<br /><br />The location is superb in heart of Paris and to close to all amenities (buses, metro, velib)<br /><br />No street view or Seine view, the best is the localisation<br />The flat is at the 2nd floor (no lift), quiet because it's on courtyard<br /><br /><b>Guest access</b><br />The flat includes :<br /> - A redone bathroom <br />- A kitchenette fully equipped (oven, fridge, hob, kitchen utensil...","You are within walking distance to the Louvre, Notre Dame, Le Marais, Les Halles, Chatelet, St. Germain, Les Tuileries, le Jardin des Plantes, St. Michel, Sorbonne, Institut du Monde Arab, the Bastille and the Latin Quarter.",https://a0.muscache.com/pictures/52413/f9bf76f5_original.jpg,7903,https://www.airbnb.com/users/show/7903,Borzou,2009-02-14,"İstanbul, Turkey",The flat is owned by journalists who spend a lot of time traveling for work who understand what people need when they're away from home. Guillaume and his partners manage arrivals and checkins.. 
We all love Paris and try to make it easy for people to come and visit the city.,within an hour,100%,99%,f,https://a0.muscache.com/im/users/7903/profile_pic/1280002723/original.jpg?aki_policy=profile_small,https://a0.muscache.com/im/users/7903/profile_pic/1280002723/original.jpg?aki_policy=profile_x_medium,Saint-Paul - Ile Saint-Louis,1.0,1.0,"['email', 'phone']",t,t,"Paris, Ile-de-France, France",Hôtel-de-Ville,Hôtel-de-Ville,48.85247,2.35835,Entire rental unit,Entire home/apt,2,0.0,1 bath,0.0,1.0,"[""TV with standard cable"", ""Shower gel"", ""Essentials"", ""Fire extinguisher"", ""Clothing storage: closet"", ""Hot water kettle"", ""Refrigerator"", ""Extra pillows and blankets"", ""Toaster"", ""Smoke alarm"", ""Host greets you"", ""Cooking basics"", ""Body soap"", ""Cleaning products"", ""Hair dryer"", ""Dishes and silverware"", ""Washer"", ""Kitchen"", ""Oven"", ""Bed linens"", ""Hangers"", ""Heating"", ""Shampoo"", ""Dedicated workspace"", ""Long term stays allowed"", ""Wifi"", ""Hot water"", ""Stove"", ""Drying rack for clothing""]",117.0,15,1125,1.0,1.0,1125.0,1125.0,1.0,1125.0,2023-06-06,t,1,13,43,223,2023-06-08,340,43,4,2009-06-30,2023-05-19,4.55,4.59,4.52,4.78,4.83,4.95,4.53,7510402838018,f,1,1,0,0,2.0
3,BUT9265605214,https://www.airbnb.com/rooms/154292,20230606220143,2023-06-07,city scrape,Rental unit in Paris · ★4.62 · 1 bedroom · 1 bed · 1 bath,"Nice flat designed by an architect, in an area full of life with a lot of shops and restaurants. Located in a typically parisian neighborhood, only 10 mn from the very center of Paris (Châtelet/ Notre Dame).<br /><br />Careful for people allergic to cats, a cat lived in the apt a few months ago.<br /><br /><b>The space</b><br />You can enjoy a big living room, nicely decorated, an american (open) kitchen with all fournitures, and a quiet bedroom. Oven, wash machine, dish washer, Flat screen TV, WI-FI are at your disposal. <br />The apt is located in a charming neighborhood, which feels like a village with its numerous cafés and shops : everything you need can be find in less than 3 mn.<br /><br />Around the place there is 2 supermarkets, an excellent Bakery with lovely croissants, nice bars, and - because it's France - great cheese and wine shops ! For (french) lovers : you can even find flowers just next to the apt.<br /><br />A great experience in the old typical Paris, only 10 m...","Charming authentic parisian neighborhood, with a lot of shops and restaurants. 
You can find whatever you need, from good bakeries to excellent wineries and cheese.",https://a0.muscache.com/pictures/bfc84997-6491-4501-9ef1-c897142f4913.jpg,137719,https://www.airbnb.com/users/show/137719,Mattis,2010-06-03,"Paris, France",Je suis un journaliste français qui aime voyager et rencontrer de nouvelles personnes.,within an hour,100%,83%,t,https://a0.muscache.com/im/pictures/user/74a6aea2-e889-45a0-8831-e80304cef68d.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/74a6aea2-e889-45a0-8831-e80304cef68d.jpg?aki_policy=profile_x_medium,Buttes-Chaumont - Belleville,1.0,2.0,"['email', 'phone', 'work_email']",t,t,"Paris, Île-de-France, France",Buttes-Chaumont,Buttes-Chaumont,48.87417,2.38581,Entire rental unit,Entire home/apt,2,0.0,1 bath,1.0,1.0,"[""Essentials"", ""Private entrance"", ""Refrigerator"", ""Extra pillows and blankets"", ""Free washer \u2013 In unit"", ""Smoke alarm"", ""Host greets you"", ""HDTV with standard cable"", ""Cooking basics"", ""Coffee maker"", ""Game console"", ""Hair dryer"", ""Dishes and silverware"", ""Paid parking off premises"", ""Ethernet connection"", ""Kitchen"", ""Pocket wifi"", ""Dishwasher"", ""Oven"", ""Bed linens"", ""Hangers"", ""Luggage dropoff allowed"", ""Heating"", ""Shampoo"", ""Iron"", ""Dedicated workspace"", ""Long term stays allowed"", ""Wifi"", ""Hot water"", ""Stove""]",99.0,5,21,5.0,7.0,8.0,1125.0,5.1,1049.7,2023-06-06,t,4,5,8,52,2023-06-07,49,5,0,2011-07-20,2022-10-17,4.62,4.77,4.3,4.86,4.91,4.79,4.74,7511900942838,f,1,1,0,0,0.34
4,HOT8992863305,https://www.airbnb.com/rooms/7397,20230606220143,2023-06-08,city scrape,Rental unit in Paris · ★4.72 · 2 bedrooms · 2 beds · 1 bath,"VERY CONVENIENT, WITH THE BEST LOCATION !<br /><br /><b>The space</b><br />PLEASE ASK ME BEFORE TO DO A REQUEST !!!<br /><br />PLEASE ASK ME BEFORE TO DO A REQUEST !!!<br /><br /><br />HEART OF THE HISTORICAL PARIS<br /><br /> Marais, in a securated and classified building, 2d floor, charming 2/3Rooms apt 40 m2, Ideal location in the very heart of historical Paris, metro station: Hôtel-de-Ville, just a few minutes on foot to the Centre Pompidou, Place des Vosges, the Seine River, Notre-Dame Cathedral, the Saint-Louis island... and most of the monuments and museums in Paris. Local restaurants and shops cater to all tastes and budgets. Very charming, quiet, sunny, fully furnished, two bedrooms (one with a large bed, the other with a large and very good sofa/bed), Large bathroom with italian shower. Internet+tel+TV included. Electricity included for normal using. 
Perfect for one couple, two couples or one couple with one or two children.<br />The apart, at the second floor without lif...",The host has not provided a neighbourhood overview yet.,https://a0.muscache.com/pictures/67928287/330bd78c_original.jpg,2626,https://www.airbnb.com/users/show/2626,Franck,2008-08-30,"Paris, France","I am a writer,54, author of novels, books of linguistics...",within an hour,100%,69%,t,https://a0.muscache.com/im/pictures/user/ad6a9447-d6fa-4b6b-a820-1c5b52cd5359.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/ad6a9447-d6fa-4b6b-a820-1c5b52cd5359.jpg?aki_policy=profile_x_medium,Le Marais,2.0,9.0,"['email', 'phone']",t,t,Hôtel-de-Ville,Hôtel-de-Ville,Hôtel-de-Ville,48.85909,2.35315,Entire rental unit,Entire home/apt,4,0.0,1 bath,2.0,2.0,"[""TV with standard cable"", ""Essentials"", ""Refrigerator"", ""Extra pillows and blankets"", ""Free washer \u2013 In unit"", ""Smoke alarm"", ""Host greets you"", ""Cooking basics"", ""Coffee maker"", ""Microwave"", ""Hair dryer"", ""Dishes and silverware"", ""Paid parking off premises"", ""Kitchen"", ""Oven"", ""Bed linens"", ""Hangers"", ""Heating"", ""Shampoo"", ""Iron"", ""Dedicated workspace"", ""Wifi"", ""Hot water"", ""Dryer"", ""Stove""]",130.0,10,130,8.0,10.0,130.0,130.0,9.9,130.0,2023-06-06,t,0,15,24,226,2023-06-08,333,27,1,2011-04-08,2023-05-14,4.72,4.8,4.44,4.91,4.88,4.93,4.73,7510400829623,f,2,2,0,0,2.25


### Create new host_ids with 8 random digits

In [58]:
# collect the distinct host_id values and report how many there are
host_ids = df['host_id'].unique().tolist()
print(len(set(host_ids)))

# assign every distinct host_id a fresh value from randomize_id_8();
# `seen` records the values already handed out, so a draw that repeats
# an earlier one is discarded and redrawn until it is new
mapping_dict = {}
seen = set()
for original_host_id in host_ids:
    candidate = randomize_id_8()
    while candidate in seen:
        candidate = randomize_id_8()
    seen.add(candidate)
    mapping_dict[original_host_id] = candidate

46014


In [59]:
# the mapping must have as many distinct values as distinct keys
assert len(set(mapping_dict)) == len(set(mapping_dict.values()))

In [60]:
# create column in df with anonymized_host_id's where each anonymized_host_id is derived from the dictionary mapping_dict
# Series.map looks up each row's host_id in mapping_dict; per the pandas docs a
# host_id missing from the dictionary would come out as NaN
df['anonymized_host_id'] = df['host_id'].map(mapping_dict)

In [61]:
# confirm that the anonymized_host_id column holds 46014 distinct values
print(df['anonymized_host_id'].nunique(dropna=False))

46014


In [62]:
# keep one row per distinct (host_id, anonymized_host_id) pair and relabel the
# columns: host_id -> original_host_id, anonymized_host_id -> new_host_id
df_host_map = (
    df.loc[:, ['host_id', 'anonymized_host_id']]
    .drop_duplicates()
    .rename(columns=dict(host_id='original_host_id', anonymized_host_id='new_host_id'))
)
df_host_map.head()

Unnamed: 0,original_host_id,new_host_id
0,739021,60703157
1,143558,69933352
2,7903,63042574
3,137719,93008392
4,2626,97826507


In [63]:
# write the original_host_id -> new_host_id mapping to a CSV file, omitting the index
df_host_map.to_csv(path_or_buf='hosts_map.csv', index=False)

In [64]:
# remove the original host_id column from df (in place)
df.drop(columns='host_id', inplace=True)

In [65]:
# the anonymized host id takes over the name "host_id"
df = df.rename(columns=dict(anonymized_host_id='host_id'))

In [66]:
# number of distinct values in the renamed host_id column
df['host_id'].nunique(dropna=False)

46014

### GDPR concerns

Create new listing urls

In [67]:
# build new_listing_url by prefixing each listing id (as text) with "https://www.airbnb.com/rooms/"
df['new_listing_url'] = df['id'].astype(str).radd('https://www.airbnb.com/rooms/')

In [68]:
# swap the original listing_url for the regenerated one: remove the old column,
# then give new_listing_url the listing_url name
df.drop(columns='listing_url', inplace=True)
df = df.rename(columns=dict(new_listing_url='listing_url'))

In [69]:
# preview the regenerated listing urls
df.loc[:, 'listing_url'].head()

0    https://www.airbnb.com/rooms/ENT9911468079
1    https://www.airbnb.com/rooms/PAN9295749920
2    https://www.airbnb.com/rooms/HOT4153079899
3    https://www.airbnb.com/rooms/BUT9265605214
4    https://www.airbnb.com/rooms/HOT8992863305
Name: listing_url, dtype: object

Drop picture urls and thumbnail urls

In [70]:
# drop the host thumbnail, host picture and listing picture url columns in a
# single call, so a missing column raises before df has lost any of them
df = df.drop(columns=['host_thumbnail_url', 'host_picture_url', 'picture_url'])

Create new names and host urls

In [71]:
from faker import Faker
# Faker instance configured with the French (fr_FR) and US English (en_US) locales
fake = Faker(['fr_FR', 'en_US'])

# number the four Faker generators (male, female and nonbinary first names,
# plus company names) from 1 to 4 so one can be picked by a random integer
my_dict = dict(enumerate(
    [fake.first_name_male, fake.first_name_female, fake.first_name_nonbinary, fake.company],
    start=1,
))


def get_random_name():
    """Return a fake name produced by one of the four generators in my_dict.

    The generator is selected with random.randint(1, 4) and then called.
    """
    name_generator = my_dict[random.randint(1, 4)]
    return name_generator()

print(get_random_name())


Sharon


In [72]:
# every distinct host_id gets one random host_name and one host_url,
# so all rows sharing a host_id end up with the same name and url
host_ids = df['host_id'].unique().tolist()
# host_id -> random name from get_random_name()
mapping_dict = {host_id: get_random_name() for host_id in host_ids}

# host_id -> url made of a fixed prefix followed by that host_id
mapping_dict_url = {
    host_id: 'https://www.airbnb.com/randomuser/' + str(host_id)
    for host_id in host_ids
}


In [73]:
# look up each row's host_id in both dictionaries to fill new_host_name and new_host_url
host_id_column = df['host_id']
df['new_host_name'] = host_id_column.map(mapping_dict)
df['new_host_url'] = host_id_column.map(mapping_dict_url)

In [74]:
# remove the original host_name and host_url in one call (so a missing column
# cannot leave df half-modified), then let the regenerated columns take their names
df = (
    df.drop(columns=['host_name', 'host_url'])
    .rename(columns={'new_host_name': 'host_name', 'new_host_url': 'host_url'})
)

Reorder the columns

In [75]:
# put the identifying columns first (id, listing_url, then the host fields),
# followed by every remaining column in its existing order
leading_columns = ['id', 'listing_url', 'host_id', 'host_name', 'host_url']
df = df[leading_columns + [col for col in df.columns if col not in leading_columns]]
df.head()

Unnamed: 0,id,listing_url,host_id,host_name,host_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,ENT9911468079,https://www.airbnb.com/rooms/ENT9911468079,60703157,Jason,https://www.airbnb.com/randomuser/60703157,20230606220143,2023-06-07,city scrape,Rental unit in Paris · ★4.44 · 1 bedroom · 3 beds · 1 bath,"Ideally located in the heart of Paris, in a lively area popular with Parisians, this nice and modern flat can accommodate up to 4 people.<br /><br /><b>The space</b><br />Close to shops, transports and animations, this flat is perfect for a stay in the capital. <br /> <br />This beautiful and cosy flat of 29 sq. m is located on the 2nd floor (without elevator) and is composed of : <br />- a nice and pleasant living room with a comfortable sofa-bed, <br />- a bedroom with two simple beds, which can be set up as a double bed (thanks to specify when booking), <br />- a very well-equipped kitchen (coffee machine, kettle, toaster, oven, hotplates, dishwasher, fridge) <br />- a bathroom with shower, <br />- hoover, hair dryer, ironing facilities, baby bed, <br />- large closets, <br />- Wi-Fi, TV and fan. <br />→ Hotel quality cleaning is carried out before your arrival and after your departure. <br />→ Our guests will be provided with all the essentials for comfort and hygiene such as...","Staying in the 10th arrondissement, between the Canal de l'Ourcq and Canal Saint-Martin, you will be in the heart of one of the capital's liveliest neighbourhoods. Numerous designer boutiques, restaurants, bars, cafés, entertainment... are all within walking distance. You can also stroll along the Canal Saint-Martin, one of the favourite spots of Parisians, especially on sunny days, or walk to the Marais, the Montorgueil district, Montmartre or even Notre-Dame. You will only need to take the metro to discover the historic centre of Paris, the Père-Lachaise cemetery, the Place de la Bastille, but also the Champs-Elysées, the Eiffel Tower and all the great monuments that make Paris the most beautiful city in the world. 
Finally, many restaurants and terraces are close to your flat, we recommend you La Rotonde Stalingrad, Le Point Ephémère or La Guincheuse Restaurant.",2011-06-24,"Paris, France","Après des années à voyager pour mon travail dans des hôtels impersonnels, je suis sensible au fait de profiter d'un appartement, même le temps d'un court séjour. J'apprécie de pouvoir me faire à manger plutôt que d'aller chaque soir au restaurant.\n\nA Paris, je loue un petit deux pièces rue Chaudron dans le 10e.. Cet appartement me ressemble et j'espère que chacun s'y sentira comme chez lui.\n\nI used to travel a lot for my work and would have appreciated being in a flat instead of being in an hotel. I am happy if I can cook instead of going to restaurant everyday.\n\nI rent one flat which is located in the 10th arrondissement of Paris. I have chosen it because of the courtyard and also because it is close to my own flat (it is very practical when my family or some friends come to visit me). I hope that everyone will feel at home when renting it.",within an hour,100%,100%,f,Gare du Nord - Gare de I'Est,1.0,3.0,"['email', 'phone']",t,t,"Paris, Île-de-France, France",Entrepôt,Entrepôt,48.88303,2.366308,Entire rental unit,Entire home/apt,4,0.0,1 bath,1.0,3.0,"[""Essentials"", ""Hot water kettle"", ""Refrigerator"", ""Radiant heating"", ""Toaster"", ""Smoke alarm"", ""Cooking basics"", ""Coffee maker"", ""Crib"", ""Hair dryer"", ""Dishes and silverware"", ""Kitchen"", ""TV"", ""Dishwasher"", ""Oven"", ""Bed linens"", ""Hangers"", ""Shampoo"", ""Iron"", ""Wifi"", ""Hot water"", ""Freezer""]",106.0,2,1125,2.0,2.0,30.0,1125.0,2.0,35.0,2023-06-06,t,0,16,34,301,2023-06-07,252,19,0,2011-07-02,2023-05-02,4.44,4.68,4.47,4.68,4.71,4.42,4.47,7511000063553,t,1,1,0,0,1.73
1,PAN9295749920,https://www.airbnb.com/rooms/PAN9295749920,69933352,Michel,https://www.airbnb.com/randomuser/69933352,20230606220143,2023-06-08,previous scrape,Rental unit in Paris · ★4.60 · 1 bedroom · 1 bed · 1 bath,"A beautifully restored 1-bedroom, 1-bath apartment in a French Haussmann building, with a full kitchen near the historic Latin Quarter of Paris.<br /><br /><b>The space</b><br />We decorated this apartment for ourselves to live in, and we are happy to let travelers use it while we are living abroad for a couple of years!<br /><br />It is a beautifully restored 1-bedroom, 1-bath apartment in a French Haussmann building, with a full kitchen near the historic Latin Quarter of Paris. Near pedestrian street Rue Mouffetard, outdoor markets, Botanical Gardens, and the Mosque. 2 minute walk to metro (line 5 - St. Marcel). 20 minute walk to Islands and historic Latin Quarter. In quiet building with quiet residents, on a private courtyard. We ask that renters please respect that this is a building with permanent residents, and to be quiet and respectful. There is a queen-size bed. Sleeps 3 with the couch (not a sofa-bed - not meant to be a permanent bed). The apartment is fully furnished, wi...","The Latin Quarter is one of the oldest districts in Paris, and has (among its many features) old roman baths and ruins. There are many cobble-streets and great dining nearby, all perfect for one discovering Paris. Our favorite is Rue Mouffetard, which is about a 10 minute walk, and it has tons of little boutiques, markets, cheese and wine shops, restaurants, and other fun things to do.",2010-06-13,"Salt Lake City, UT","I have lived in 3 countries and traveled to 20+ others. I love to discover new destinations around the world for nature and hikes, and like to simply meet people globally, because I love learning about different cultures. 
\r\n\r\n",a few days or more,33%,0%,f,Panthéon,2.0,2.0,"['email', 'phone', 'work_email']",t,t,"Paris, Île-de-France, France",Panthéon,Panthéon,48.83936,2.35849,Entire rental unit,Entire home/apt,2,0.0,1 bath,1.0,1.0,"[""TV with standard cable"", ""Essentials"", ""Fire extinguisher"", ""Iron"", ""Hair dryer"", ""Dishes and silverware"", ""Paid parking off premises"", ""Wifi"", ""Long term stays allowed"", ""Kitchen"", ""Cooking basics"", ""Hot water"", ""Hangers"", ""First aid kit"", ""Heating""]",88.0,100,365,100.0,100.0,365.0,365.0,100.0,365.0,2023-06-06,t,0,0,0,66,2023-06-08,70,0,0,2011-09-16,2019-05-20,4.6,4.74,4.33,4.83,4.83,4.83,4.74,7510501191068,f,1,1,0,0,0.49
2,HOT4153079899,https://www.airbnb.com/rooms/HOT4153079899,63042574,Tiffany,https://www.airbnb.com/randomuser/63042574,20230606220143,2023-06-08,previous scrape,Rental unit in Paris · ★4.55 · Studio · 1 bed · 1 bath,"Cozy, well-appointed and graciously designed studio apartment that mixes old and new aesthetics at the very heart of Paris, on the famous Isle St Louis that is a stone's throw from Le Marais, Quartier Latin, Notre Dame and Le Louvre.<br /><br /><b>The space</b><br />Small, well appointed studio apartment at the very heart of Paris, on the famous Isle St Louis.<br /><br />We are looking for quiet persons or couples to rent our fully equipped studio (about 30 square meters or 300 square feet) in central Paris on the famous Isle St. Louis.<br /><br />No smoking and parties are forbidden<br /><br />The location is superb in heart of Paris and to close to all amenities (buses, metro, velib)<br /><br />No street view or Seine view, the best is the localisation<br />The flat is at the 2nd floor (no lift), quiet because it's on courtyard<br /><br /><b>Guest access</b><br />The flat includes :<br /> - A redone bathroom <br />- A kitchenette fully equipped (oven, fridge, hob, kitchen utensil...","You are within walking distance to the Louvre, Notre Dame, Le Marais, Les Halles, Chatelet, St. Germain, Les Tuileries, le Jardin des Plantes, St. Michel, Sorbonne, Institut du Monde Arab, the Bastille and the Latin Quarter.",2009-02-14,"İstanbul, Turkey",The flat is owned by journalists who spend a lot of time traveling for work who understand what people need when they're away from home. Guillaume and his partners manage arrivals and checkins.. 
We all love Paris and try to make it easy for people to come and visit the city.,within an hour,100%,99%,f,Saint-Paul - Ile Saint-Louis,1.0,1.0,"['email', 'phone']",t,t,"Paris, Ile-de-France, France",Hôtel-de-Ville,Hôtel-de-Ville,48.85247,2.35835,Entire rental unit,Entire home/apt,2,0.0,1 bath,0.0,1.0,"[""TV with standard cable"", ""Shower gel"", ""Essentials"", ""Fire extinguisher"", ""Clothing storage: closet"", ""Hot water kettle"", ""Refrigerator"", ""Extra pillows and blankets"", ""Toaster"", ""Smoke alarm"", ""Host greets you"", ""Cooking basics"", ""Body soap"", ""Cleaning products"", ""Hair dryer"", ""Dishes and silverware"", ""Washer"", ""Kitchen"", ""Oven"", ""Bed linens"", ""Hangers"", ""Heating"", ""Shampoo"", ""Dedicated workspace"", ""Long term stays allowed"", ""Wifi"", ""Hot water"", ""Stove"", ""Drying rack for clothing""]",117.0,15,1125,1.0,1.0,1125.0,1125.0,1.0,1125.0,2023-06-06,t,1,13,43,223,2023-06-08,340,43,4,2009-06-30,2023-05-19,4.55,4.59,4.52,4.78,4.83,4.95,4.53,7510402838018,f,1,1,0,0,2.0
3,BUT9265605214,https://www.airbnb.com/rooms/BUT9265605214,93008392,Melissa,https://www.airbnb.com/randomuser/93008392,20230606220143,2023-06-07,city scrape,Rental unit in Paris · ★4.62 · 1 bedroom · 1 bed · 1 bath,"Nice flat designed by an architect, in an area full of life with a lot of shops and restaurants. Located in a typically parisian neighborhood, only 10 mn from the very center of Paris (Châtelet/ Notre Dame).<br /><br />Careful for people allergic to cats, a cat lived in the apt a few months ago.<br /><br /><b>The space</b><br />You can enjoy a big living room, nicely decorated, an american (open) kitchen with all fournitures, and a quiet bedroom. Oven, wash machine, dish washer, Flat screen TV, WI-FI are at your disposal. <br />The apt is located in a charming neighborhood, which feels like a village with its numerous cafés and shops : everything you need can be find in less than 3 mn.<br /><br />Around the place there is 2 supermarkets, an excellent Bakery with lovely croissants, nice bars, and - because it's France - great cheese and wine shops ! For (french) lovers : you can even find flowers just next to the apt.<br /><br />A great experience in the old typical Paris, only 10 m...","Charming authentic parisian neighborhood, with a lot of shops and restaurants. 
You can find whatever you need, from good bakeries to excellent wineries and cheese.",2010-06-03,"Paris, France",Je suis un journaliste français qui aime voyager et rencontrer de nouvelles personnes.,within an hour,100%,83%,t,Buttes-Chaumont - Belleville,1.0,2.0,"['email', 'phone', 'work_email']",t,t,"Paris, Île-de-France, France",Buttes-Chaumont,Buttes-Chaumont,48.87417,2.38581,Entire rental unit,Entire home/apt,2,0.0,1 bath,1.0,1.0,"[""Essentials"", ""Private entrance"", ""Refrigerator"", ""Extra pillows and blankets"", ""Free washer \u2013 In unit"", ""Smoke alarm"", ""Host greets you"", ""HDTV with standard cable"", ""Cooking basics"", ""Coffee maker"", ""Game console"", ""Hair dryer"", ""Dishes and silverware"", ""Paid parking off premises"", ""Ethernet connection"", ""Kitchen"", ""Pocket wifi"", ""Dishwasher"", ""Oven"", ""Bed linens"", ""Hangers"", ""Luggage dropoff allowed"", ""Heating"", ""Shampoo"", ""Iron"", ""Dedicated workspace"", ""Long term stays allowed"", ""Wifi"", ""Hot water"", ""Stove""]",99.0,5,21,5.0,7.0,8.0,1125.0,5.1,1049.7,2023-06-06,t,4,5,8,52,2023-06-07,49,5,0,2011-07-20,2022-10-17,4.62,4.77,4.3,4.86,4.91,4.79,4.74,7511900942838,f,1,1,0,0,0.34
4,HOT8992863305,https://www.airbnb.com/rooms/HOT8992863305,97826507,Joseph,https://www.airbnb.com/randomuser/97826507,20230606220143,2023-06-08,city scrape,Rental unit in Paris · ★4.72 · 2 bedrooms · 2 beds · 1 bath,"VERY CONVENIENT, WITH THE BEST LOCATION !<br /><br /><b>The space</b><br />PLEASE ASK ME BEFORE TO DO A REQUEST !!!<br /><br />PLEASE ASK ME BEFORE TO DO A REQUEST !!!<br /><br /><br />HEART OF THE HISTORICAL PARIS<br /><br /> Marais, in a securated and classified building, 2d floor, charming 2/3Rooms apt 40 m2, Ideal location in the very heart of historical Paris, metro station: Hôtel-de-Ville, just a few minutes on foot to the Centre Pompidou, Place des Vosges, the Seine River, Notre-Dame Cathedral, the Saint-Louis island... and most of the monuments and museums in Paris. Local restaurants and shops cater to all tastes and budgets. Very charming, quiet, sunny, fully furnished, two bedrooms (one with a large bed, the other with a large and very good sofa/bed), Large bathroom with italian shower. Internet+tel+TV included. Electricity included for normal using. 
Perfect for one couple, two couples or one couple with one or two children.<br />The apart, at the second floor without lif...",The host has not provided a neighbourhood overview yet.,2008-08-30,"Paris, France","I am a writer,54, author of novels, books of linguistics...",within an hour,100%,69%,t,Le Marais,2.0,9.0,"['email', 'phone']",t,t,Hôtel-de-Ville,Hôtel-de-Ville,Hôtel-de-Ville,48.85909,2.35315,Entire rental unit,Entire home/apt,4,0.0,1 bath,2.0,2.0,"[""TV with standard cable"", ""Essentials"", ""Refrigerator"", ""Extra pillows and blankets"", ""Free washer \u2013 In unit"", ""Smoke alarm"", ""Host greets you"", ""Cooking basics"", ""Coffee maker"", ""Microwave"", ""Hair dryer"", ""Dishes and silverware"", ""Paid parking off premises"", ""Kitchen"", ""Oven"", ""Bed linens"", ""Hangers"", ""Heating"", ""Shampoo"", ""Iron"", ""Dedicated workspace"", ""Wifi"", ""Hot water"", ""Dryer"", ""Stove""]",130.0,10,130,8.0,10.0,130.0,130.0,9.9,130.0,2023-06-06,t,0,15,24,226,2023-06-08,333,27,1,2011-04-08,2023-05-14,4.72,4.8,4.44,4.91,4.88,4.93,4.73,7510400829623,f,2,2,0,0,2.25


### For licenses, replace any value containing a number with `t`; keep the other values (`f`, exemptions, mobility-lease notes) unchanged

In [80]:
# Collapse every license value that contains a digit (presumably a registration
# number) into the single flag 't'. Values without digits and missing values
# are left exactly as they are.
# A boolean mask swaps out the whole cell, so a multi-line value cannot keep
# leftover fragments the way a '.'-based regex substitution would ('.' does not
# match newlines).
has_number = df['license'].str.contains(r'\d', regex=True, na=False)
df['license'] = df['license'].mask(has_number, 't')

# Distribution after the cleanup; as the last expression of the cell it is
# what the notebook actually displays.
df['license'].value_counts()


license
t                                                         36720
f                                                         19362
Available with a mobility lease only ("bail mobilité")     4733
Exempt - hotel-type listing                                 849
Name: count, dtype: int64

# Export to new csv file

In [87]:
# Write the cleaned listings to disk (path is relative to the working
# directory); the row index is not included in the file.
cleaned_csv_path = 'cleaned_listings.csv'
df.to_csv(cleaned_csv_path, index=False)

# Neighbourhoods mapping

In [94]:
# Neighbourhood name -> arrondissement number (1-20).
# Insertion order is kept as-is because it determines the row order of the
# table built from this dictionary.
arrondissement_mapping = {
    "Entrepôt": 10,
    "Panthéon": 5,
    "Hôtel-de-Ville": 4,
    "Buttes-Chaumont": 19,
    "Observatoire": 14,
    "Palais-Bourbon": 7,
    "Reuilly": 12,
    "Opéra": 9,
    "Louvre": 1,
    "Popincourt": 11,
    "Buttes-Montmartre": 18,
    "Ménilmontant": 20,
    "Temple": 3,
    "Bourse": 2,
    "Batignolles-Monceau": 17,
    "Passy": 16,
    "Gobelins": 13,
    "Élysée": 8,
    "Vaugirard": 15,
    "Luxembourg": 6,
}


# Turn the mapping into a two-column table (one row per dictionary entry,
# columns 'name' and 'id') and save it as a CSV without the row index.
mapping_rows = list(arrondissement_mapping.items())
df_arrondissement_map = pd.DataFrame(mapping_rows, columns=['name', 'id'])
df_arrondissement_map.to_csv('arrondissement_map.csv', index=False)

In [95]:
# Check that the distinct neighbourhood_cleansed values are exactly the keys of
# arrondissement_mapping. On failure, the message lists the names found on only
# one side, so the mismatch can be fixed without re-inspecting the data.
listing_neighbourhoods = set(df['neighbourhood_cleansed'].unique().tolist())
mapped_neighbourhoods = set(arrondissement_mapping.keys())
assert listing_neighbourhoods == mapped_neighbourhoods, (
    "neighbourhood_cleansed does not match arrondissement_mapping: "
    f"missing from mapping={listing_neighbourhoods - mapped_neighbourhoods}, "
    f"missing from data={mapped_neighbourhoods - listing_neighbourhoods}"
)