# Clean the Airbnb data
### Clean listings data

In [13]:
import pandas as pd

# Load the raw listings export into a DataFrame.
# NOTE(review): in the preview below the `id` column renders as a float and, for
# the first row, differs from the number at the end of `listing_url`
# (44400740.0 vs .../44400735). The ids may have lost precision before this
# file was written — TODO confirm, since later cells match listings to reviews
# on this id.
listings_og = pd.read_csv('listings_og.csv')
# Preview the first rows (head() defaults to 5).
listings_og.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,44400740.0,https://www.airbnb.com/rooms/44400735,20220900000000.0,2022-09-29,city scrape,Dolce Nido Falcade,,,https://a0.muscache.com/pictures/4e8f96f4-cbfc...,204428387,...,4.0,2.0,4.0,,t,2,2,0,0,0.52
1,7.019696e+17,https://www.airbnb.com/rooms/701969602670827249,20220900000000.0,2022-09-29,city scrape,Affittimoderni Ponte di Legno Ski - Pdl12,The <b>apartment in Ponte di Legno</b> has 1 b...,,https://a0.muscache.com/pictures/prohost-api/H...,21763382,...,,,,,t,1,1,0,0,
2,34381720.0,https://www.airbnb.com/rooms/34381719,20220900000000.0,2022-09-29,city scrape,Ferienwohnung Papstlhof: Reinheit in Glas,Der Papstlhof liegt im schönen Weindorf Kalter...,500 Meter vom Papstlhof entfernt finden Sie di...,https://a0.muscache.com/pictures/b7343d0c-12aa...,250027698,...,,,,,f,4,4,0,0,
3,33209030.0,https://www.airbnb.com/rooms/33209027,20220900000000.0,2022-09-29,city scrape,Feienwohnung Papstlhof: Balance in Weiss,Der Papstlhof liegt im schönen Weindorf Kalter...,500 Meter vom Papstlhof entfernt finden Sie di...,https://a0.muscache.com/pictures/b72fe4ed-1c86...,250027698,...,,,,,f,4,4,0,0,
4,18988660.0,https://www.airbnb.com/rooms/18988656,20220900000000.0,2022-09-29,city scrape,Villa Vesta for 5 persons,Note: You can directly book the best price if ...,,https://a0.muscache.com/pictures/prohost-api/H...,131033815,...,,,,017082-CIM-00014,t,44,44,0,0,


In [27]:
import re
import numpy as np

# Columns carried forward from the raw listings export
kept_columns = [
    'id', 'name', 'description', 'picture_url',
    'host_since', 'host_response_time', 'host_is_superhost',
    'host_listings_count', 'host_identity_verified',
    'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds',
    'amenities', 'price', 'availability_90', 'number_of_reviews',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'instant_bookable',
]

# Narrow the raw frame and give the key column a more explicit name
listings = listings_og[kept_columns].rename(columns={'id': 'listing_id'})

# Tidy the free-text description. Values that are not strings become NaN so the
# missing-value filter below removes them.
# NOTE(review): the two literal space replacements only shorten runs of spaces;
# longer runs can still leave double spaces behind. Kept as-is on purpose: the
# de-duplication and the seeded sample below depend on the exact cleaned text,
# so changing the normalisation could change which 100 listings are drawn.
raw_text = listings['description']
tidy_text = (
    raw_text
    .where(raw_text.map(lambda value: isinstance(value, str)))
    .str.replace('<[^<]+?>', ' ', regex=True)   # html tags -> single space
    .str.replace('\n', ' ', regex=False)        # line breaks -> space
    .str.replace('   ', ' ', regex=False)       # triple space -> single
    .str.replace('  ', ' ', regex=False)        # double space -> single
)

listings = (
    listings
    .assign(description=tidy_text)
    # Drop rows with a missing value in any kept column
    .dropna()
    # Keep only the first listing for each distinct description
    .drop_duplicates(subset='description', keep='first')
    # Pare down to 100 listings; random_state makes the draw repeatable
    .sample(n=100, random_state=24)
    .reset_index(drop=True)
)
listings

Unnamed: 0,listing_id,name,description,picture_url,host_since,host_response_time,host_is_superhost,host_listings_count,host_identity_verified,room_type,...,availability_90,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,42433599.0,Be & Being (Milena) - next to Sabbioni Beach,Be & Being is a modern apartment located in a ...,https://a0.muscache.com/pictures/miso/Hosting-...,2020-02-20,within an hour,t,1.0,t,Entire home/apt,...,77,31,4.90,4.90,4.90,4.97,5.00,4.97,4.77,f
1,34200714.0,Casa Carina 022251-AT-330714,Guest access A disposizione ce un ampio balco...,https://a0.muscache.com/pictures/36f6b80b-0b1a...,2019-04-24,within an hour,f,3.0,t,Private room,...,0,81,4.63,4.84,4.91,4.89,4.84,4.52,4.65,t
2,53536257.0,"026 Bilocale con caminetto, Giustino","In the immediate vicinity of Pinzolo, in the r...",https://a0.muscache.com/pictures/prohost-api/H...,2016-10-04,within a few hours,f,54.0,t,Entire home/apt,...,79,1,5.00,5.00,5.00,5.00,5.00,3.00,3.00,t
3,21158645.0,Appartamento nel cuore della Val di Fiemme,Appartamento molto caldo e confortevole a 2 pa...,https://a0.muscache.com/pictures/2cb0b950-62e3...,2017-04-01,within an hour,f,2.0,t,Entire home/apt,...,63,17,4.41,4.59,4.47,4.76,4.71,4.59,4.59,t
4,30341842.0,Tonale -Vista mozzafiato a 300m da impianti.,Luminoso e grazioso bilocale d'angolo di 30m2 ...,https://a0.muscache.com/pictures/17885e5e-17fd...,2018-11-27,within an hour,f,1.0,f,Entire home/apt,...,90,54,4.44,4.43,4.57,4.78,4.78,4.67,4.43,t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,13594109.0,Garda Lakeside B&B Casa Sandra Bertolini,The highlights of Casa Sandra Bertolini are th...,https://a0.muscache.com/pictures/ba3a36f9-e5ed...,2015-03-26,within a few hours,t,7.0,t,Private room,...,0,7,4.71,4.86,4.86,4.86,4.86,4.86,4.86,f
96,14315038.0,Lago di Garda casa tipica in borgo medioevale,Tradizionale casa in pietra e legno con impare...,https://a0.muscache.com/pictures/076aa77a-14cb...,2015-09-15,within an hour,f,1.0,f,Entire home/apt,...,18,5,5.00,4.80,4.80,5.00,5.00,5.00,4.40,t
97,26556844.0,Mansarda StudioApartment in the center of Torbole,"The apartment is cozy ""Mansarda"" under the roo...",https://a0.muscache.com/pictures/miso/Hosting-...,2013-05-06,within an hour,f,8.0,t,Entire home/apt,...,30,15,4.80,4.80,4.73,5.00,4.93,5.00,4.73,t
98,22547035.0,Al Pescatore for 4 persons,Note: You can directly book the best price if ...,https://a0.muscache.com/pictures/prohost-api/H...,2017-12-06,within an hour,f,112.0,t,Entire home/apt,...,15,4,4.50,4.50,4.75,4.75,4.50,5.00,4.75,t


In [28]:
# Export of the cleaned listings sample is disabled; uncomment to write it to 'listings.csv'.
# listings.to_csv('listings.csv')

### Clean reviews data

In [16]:
# Load the raw reviews export into a DataFrame and preview the first 10 rows.
# NOTE(review): `listing_id` and `id` render as floats in the preview below
# (e.g. 37736.0). float64 cannot represent every integer above 2**53 exactly,
# so very large ids may not round-trip — TODO confirm how the ids were stored.
reviews_og = pd.read_csv('reviews_og.csv')
reviews_og.head(10)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,37736.0,105252.0,2010-09-26,180558,Esha,"Lovely quaint place in the mountains. Huge, sp..."
1,37736.0,109090.0,2010-10-01,179520,Jack,Expansively large apartment with lots of room ...
2,37736.0,177407.0,2011-02-01,358799,Valerie,"Adorable apartment, but watch your head on tho..."
3,37736.0,688419.0,2011-11-02,1334818,Marius & Katha,we had a beautiful stay at danieles house! Her...
4,37736.0,775479.0,2011-12-10,258681,Chiara,a really wonderful house! the photos don't lie...
5,37736.0,1215740.0,2012-05-02,685271,Maria,We loved this place and felt that we should ha...
6,37736.0,3034630.0,2012-12-06,3223988,Elmar & Tina,Beautiful little appartment in the mountains c...
7,37736.0,3521812.0,2013-02-11,1574462,Mariana,The apartment is new and very nice. The beds a...
8,37736.0,3555136.0,2013-02-16,13958507,Nenad,Sehr schön und sehr sauber !!!! Toller Ausblic...
9,37736.0,3891917.0,2013-03-25,4928271,Tim,"Die FeWo ist sehr hübsch eingerichtet, tolles ..."


In [17]:
# Start from the raw reviews without the reviewer id, restricted to reviews of
# listings that are present in the `listings` dataframe. `drop` and `.loc`
# return new frames, so `reviews_og` itself is left untouched.
reviews = (
    reviews_og
    .drop(columns='reviewer_id')
    .loc[lambda frame: frame['listing_id'].isin(listings['listing_id'])]
)

# Clean up comments:
#   * values that are not strings become NaN,
#   * html tags are replaced by a space,
#   * every run of whitespace (spaces, line breaks, tabs) collapses to one space,
#   * leading/trailing whitespace is removed,
#   * comments that are empty after cleaning become NaN.
raw_comments = reviews['comments']
cleaned_comments = (
    raw_comments
    .where(raw_comments.map(lambda value: isinstance(value, str)))
    .astype(object)                             # keeps .str usable if every value is NaN
    .str.replace(r'<[^<]+?>', ' ', regex=True)  # remove html
    .str.replace(r'\s+', ' ', regex=True)       # collapse whitespace
    .str.strip()
)
cleaned_comments = cleaned_comments.mask(cleaned_comments == '')

# Keep only the first whitespace-separated token of reviewer_name. A missing or
# blank name yields NaN, so the row is removed by dropna() below.
first_names = reviews['reviewer_name'].str.split().str[0]

reviews = (
    reviews
    # assign() builds a new frame, so no column is set on a filtered frame
    .assign(comments=cleaned_comments, reviewer_name=first_names)
    # Drop rows with a missing value in any remaining column
    .dropna()
    # Keep only the first review for each distinct comment text
    .drop_duplicates(subset='comments', keep='first')
)

reviews

Unnamed: 0,listing_id,id,date,reviewer_name,comments
96,1.854763e+06,3.628033e+07,2015-06-27,Alessandro,"Struttura a dir poco splendida, curata nei min..."
306,1.875390e+06,9.432904e+06,2013-12-28,Elisa,I giorni trascorsi allo chalet sono stati perf...
307,1.875390e+06,9.527083e+06,2014-01-01,Dario,Chalet perfetto per chi vuole stare tranquillo...
308,1.875390e+06,9.793940e+06,2014-01-10,Alexey,Paolo is a great host and the chalet is a fant...
309,1.875390e+06,1.033288e+07,2014-02-14,Andy,We came to Malga Ciappela after 2metres of sno...
...,...,...,...,...,...
105858,6.664468e+17,7.084708e+17,2022-09-04,Vladimir,"Очень понравилось расположение, красивый вид. ..."
105938,6.695624e+17,6.882574e+17,2022-08-07,Angel,"Posto molto bello e tranquillo, belle passeggi..."
105974,6.897558e+17,6.939997e+17,2022-08-15,Gloria,"La casa è davvero molto bella, fornita di tutt..."
105975,6.897558e+17,6.983271e+17,2022-08-21,Sandra,"bello e accogliente l'appartamento, Alessandra..."


In [18]:
# Export of the cleaned reviews is disabled; uncomment to write them to 'reviews.csv'.
# reviews.to_csv('reviews.csv')

### Find individual example

In [19]:
# Which listing has the most reviews?
# Count the non-null entries per column for each listing, largest review count first.
(
    reviews
    .groupby(by='listing_id')
    .count()
    .sort_values(by='id', ascending=False)
)

Unnamed: 0_level_0,id,date,reviewer_name,comments
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6.937576e+06,364,364,364,364
2.054723e+06,315,315,315,315
3.194015e+06,268,268,268,268
1.025536e+06,168,168,168,168
1.399423e+07,166,166,166,166
...,...,...,...,...
4.730395e+07,1,1,1,1
5.753042e+17,1,1,1,1
3.127352e+07,1,1,1,1
5.786564e+17,1,1,1,1


In [20]:
# Show every review of the listing used as the worked example (2054723).
# A cell renders only its last expression, so this is the single lookup kept here;
# other listings considered while choosing were 19500580, 9191785 and 6937576.
#
# Optional one-off exports of candidate listings (disabled):
# reviews[reviews['listing_id']==6937576].to_csv('sample1.csv')
# reviews[reviews['listing_id']==2054723].to_csv('sample2.csv')
reviews[reviews['listing_id'] == 2054723]

Unnamed: 0,listing_id,id,date,reviewer_name,comments
1310,2054723.0,1.154160e+07,2014-04-07,Gorazio,Хочу поблагодарить Роберто и всю его семью на ...
1316,2054723.0,1.200749e+07,2014-04-21,Sigrid,Sehr großzügige und neue Wohnung mit großer Te...
1327,2054723.0,1.214647e+07,2014-04-24,Mareen,Roberto is a great hosts. He was always availa...
1328,2054723.0,1.228703e+07,2014-04-28,Luís,Perfezione è la parola che esprime l'appartame...
1329,2054723.0,1.235246e+07,2014-04-29,Artem,"The apartments are wonderful, comfortable, cle..."
...,...,...,...,...,...
1744,2054723.0,7.055587e+17,2022-08-31,Silke,"What a beautifull place, especially the view! ..."
1745,2054723.0,7.157552e+17,2022-09-14,Sandra,Wir hatten eine schöne Zeit auf der tollen Ter...
1746,2054723.0,7.208100e+17,2022-09-21,Henryka,Wieder einmal alles perfekt! Danke Roberto
1747,2054723.0,7.237143e+17,2022-09-25,Kristin,Alles super! Kommunikation und Check in und ou...


In [21]:
# Reviews of the single example listing, excluding the Czech review
# (it won't work with text analytics).
# NOTE(review): the exclusion matches on reviewer name, so any review of this
# listing by a reviewer called 'Zuzana' is removed.
example_mask = (reviews['listing_id'] == 2054723) & (reviews['reviewer_name'] != 'Zuzana')

# Keep the three presentation columns, rename them, and order newest first
df = (
    reviews
    .loc[example_mask, ['date', 'reviewer_name', 'comments']]
    .rename(columns={'date': 'Date', 'reviewer_name': 'Name', 'comments': 'Reviews'})
    .sort_values(by='Date', ascending=False)
    .reset_index(drop=True)
)

# Write the result to disk, then display it
df.to_csv('TrentinoReviews.csv')
df

Unnamed: 0,Date,Name,Reviews
0,2022-09-27,Fabian,"Die Wohnung ist super gemütlich, sauber und sc..."
1,2022-09-25,Kristin,Alles super! Kommunikation und Check in und ou...
2,2022-09-21,Henryka,Wieder einmal alles perfekt! Danke Roberto
3,2022-09-14,Sandra,Wir hatten eine schöne Zeit auf der tollen Ter...
4,2022-08-31,Silke,"What a beautifull place, especially the view! ..."
...,...,...,...
309,2014-04-29,Artem,"The apartments are wonderful, comfortable, cle..."
310,2014-04-28,Luís,Perfezione è la parola che esprime l'appartame...
311,2014-04-24,Mareen,Roberto is a great hosts. He was always availa...
312,2014-04-21,Sigrid,Sehr großzügige und neue Wohnung mit großer Te...


In [22]:
# Show the listing record for the worked example (2054723).
# A cell renders only its last expression, so this is the single lookup kept here;
# other listings considered while choosing were 19500580, 9191785 and 6937576.
listings[listings['listing_id'] == 2054723]

Unnamed: 0,listing_id,name,description,picture_url,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,...,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
31,2054723.0,"Lake Garda, wide terrace and sun","Placed in a very sunny, quite and nice area cl...",https://a0.muscache.com/pictures/49170274-d1be...,2013-12-09,within an hour,100%,100%,t,2.0,...,32,5,4.95,4.98,4.97,4.98,4.97,4.77,4.93,t
