In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# import full dataset
listings = pd.read_csv('listings.csv')
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,6422,https://www.airbnb.com/rooms/6422,20221221170309,2022-12-22,city scrape,Nashville Charm,30 day or more rental during COVID. Show COVID...,Historic East Nashville is home to many new an...,https://a0.muscache.com/pictures/pro_photo_too...,12172,...,4.96,4.92,4.98,,f,1,0,1,0,4.06
1,59576,https://www.airbnb.com/rooms/59576,20221221170309,2022-12-21,city scrape,Large Main Suite near Lake *ladies only NS plz,non-smokers only please<br /><br />furnished r...,We live on a cul-de-sac in a small community o...,https://a0.muscache.com/pictures/4e4e34db-3155...,812128,...,5.0,5.0,5.0,,f,10,1,9,0,0.34
2,72906,https://www.airbnb.com/rooms/72906,20221221170309,2022-12-21,city scrape,Vandy/Belmont/10 mins to Broadway - Sunny 800 ...,Entire top floor. Private. 800 sq ft of bright...,Historic Belmont-Hillsboro neighborhood. Walk-...,https://a0.muscache.com/pictures/58602855/3788...,176117,...,4.99,4.96,4.89,,f,1,1,0,0,4.76
3,319705,https://www.airbnb.com/rooms/319705,20221221170309,2022-12-21,city scrape,"SuperSweetSTUDIO, jacuzzi, open Nov 23, 6 mo",Huge fully furnished private room /studio apar...,"I'm very near the Percy Priest Lake, natural ...",https://a0.muscache.com/pictures/3470453/c7d82...,22296,...,4.68,4.62,4.57,,f,8,0,7,1,0.36
4,289242,https://www.airbnb.com/rooms/289242,20221221170309,2022-12-21,city scrape,"MorningstarHouse, monthly room- open Aug 19",Morningstar House! Extended stay - price reduc...,The Morningstar House is in a quiet cul-de-sac...,https://a0.muscache.com/pictures/40a7ab72-20f7...,22296,...,4.78,4.47,4.69,,f,8,0,7,1,0.57


In [3]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [4]:
#subsetting the listings columns only by the features of the house 
features = listings[['name', 'price', 'neighbourhood_cleansed', 'room_type', 'minimum_nights', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds']]
features.head()


Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds
0,Nashville Charm,$40.00,District 6,Private room,30,2,1 private bath,2.0,3.0
1,Large Main Suite near Lake *ladies only NS plz,$45.00,District 12,Private room,30,1,1 private bath,1.0,1.0
2,Vandy/Belmont/10 mins to Broadway - Sunny 800 ...,$90.00,District 18,Entire home/apt,2,4,1 bath,2.0,2.0
3,"SuperSweetSTUDIO, jacuzzi, open Nov 23, 6 mo",$39.00,District 12,Private room,30,3,1 private bath,1.0,5.0
4,"MorningstarHouse, monthly room- open Aug 19",$33.00,District 12,Private room,30,1,1 shared bath,1.0,3.0


In [5]:
#unique room types
features['room_type'].unique()
#room types are categorical they are all needed 

array(['Private room', 'Entire home/apt', 'Shared room', 'Hotel room'],
      dtype=object)

In [6]:
#unique bathroom text 
features['bathrooms_text'].unique()

array(['1 private bath', '1 bath', '1 shared bath', '2.5 baths',
       '2 baths', '1.5 baths', '3 baths', '3.5 baths', '4 baths',
       '4.5 baths', '1.5 shared baths', '7 baths', '3.5 shared baths',
       '5.5 baths', '8 baths', '0 baths', '2.5 shared baths', 'Half-bath',
       '6 baths', '5 baths', '2 shared baths', '9.5 baths', '7.5 baths',
       'Private half-bath', '14 baths', '3 shared baths', '6.5 baths',
       '9 baths', '19 baths', nan, '12 baths', '16 baths', '17 baths',
       '10.5 baths', '4 shared baths', '12.5 baths', '18 baths',
       '13.5 baths', '8.5 baths', '17.5 baths'], dtype=object)

In [7]:
#what is a private bath? 
features[features['bathrooms_text'] == '1 private bath'].head(3)

#private baths seem to be in private rooms with 30 day minumum stays (rentals)

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds
0,Nashville Charm,$40.00,District 6,Private room,30,2,1 private bath,2.0,3.0
1,Large Main Suite near Lake *ladies only NS plz,$45.00,District 12,Private room,30,1,1 private bath,1.0,1.0
3,"SuperSweetSTUDIO, jacuzzi, open Nov 23, 6 mo",$39.00,District 12,Private room,30,3,1 private bath,1.0,5.0


In [8]:
#what is a shared bath?
features[features['bathrooms_text'] == '1 shared bath'].head(5)

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds
4,"MorningstarHouse, monthly room- open Aug 19",$33.00,District 12,Private room,30,1,1 shared bath,1.0,3.0
6,"MorningstarHouse, monthly single room- open SOON!",$32.00,District 12,Private room,30,3,1 shared bath,1.0,3.0
12,Cheap Vandy / Downtown Nashville CrashPad,$500.00,District 19,Shared room,1,3,1 shared bath,1.0,1.0
17,Cheap Downtown Crashpad (Solo Travelers Only),$104.00,District 19,Shared room,1,1,1 shared bath,1.0,1.0
52,"Walk to Vandy, Downtown & Music Row - FREE SHU...",$82.00,District 19,Private room,14,2,1 shared bath,1.0,1.0


In [9]:
#the text for baths is important since it varies with room_type we cannot just use the number avalable for the bathrooms field as a number field
#we would be able to split bathrooms_text into two different fields - bathroom_count and bathroom_type 
#bathroom types - shared, private, not categorized.
features[['bathroom_count','bathroom_type']] = features["bathrooms_text"].str.split(" ", 1, expand=True)
features.head(4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type
0,Nashville Charm,$40.00,District 6,Private room,30,2,1 private bath,2.0,3.0,1,private bath
1,Large Main Suite near Lake *ladies only NS plz,$45.00,District 12,Private room,30,1,1 private bath,1.0,1.0,1,private bath
2,Vandy/Belmont/10 mins to Broadway - Sunny 800 ...,$90.00,District 18,Entire home/apt,2,4,1 bath,2.0,2.0,1,bath
3,"SuperSweetSTUDIO, jacuzzi, open Nov 23, 6 mo",$39.00,District 12,Private room,30,3,1 private bath,1.0,5.0,1,private bath


In [10]:
features['bathroom_count'].unique()

array(['1', '2.5', '2', '1.5', '3', '3.5', '4', '4.5', '7', '5.5', '8',
       '0', 'Half-bath', '6', '5', '9.5', '7.5', 'Private', '14', '6.5',
       '9', '19', nan, '12', '16', '17', '10.5', '12.5', '18', '13.5',
       '8.5', '17.5'], dtype=object)

In [11]:
features.loc[features['bathroom_count'] == 'Half-bath', 'bathroom_count'] = '0.5'
features['bathroom_count'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


array(['1', '2.5', '2', '1.5', '3', '3.5', '4', '4.5', '7', '5.5', '8',
       '0', '0.5', '6', '5', '9.5', '7.5', 'Private', '14', '6.5', '9',
       '19', nan, '12', '16', '17', '10.5', '12.5', '18', '13.5', '8.5',
       '17.5'], dtype=object)

In [12]:
#dropping nan and 'Private' total rows
count_remove = ['Private', 'nan']
features = features[features.bathroom_count.isin(count_remove) == False]
features['bathroom_count'].unique()

array(['1', '2.5', '2', '1.5', '3', '3.5', '4', '4.5', '7', '5.5', '8',
       '0', '0.5', '6', '5', '9.5', '7.5', '14', '6.5', '9', '19', nan,
       '12', '16', '17', '10.5', '12.5', '18', '13.5', '8.5', '17.5'],
      dtype=object)

In [13]:
#making bathroom text data into float
features['bathroom_count'] = features['bathroom_count'].astype(float) 
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8126 entries, 0 to 8126
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    8126 non-null   object 
 1   price                   8126 non-null   object 
 2   neighbourhood_cleansed  8126 non-null   object 
 3   room_type               8126 non-null   object 
 4   minimum_nights          8126 non-null   int64  
 5   accommodates            8126 non-null   int64  
 6   bathrooms_text          8123 non-null   object 
 7   bedrooms                7787 non-null   float64
 8   beds                    8079 non-null   float64
 9   bathroom_count          8123 non-null   float64
 10  bathroom_type           8122 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 761.8+ KB


In [14]:
#bathroom_type cleaning 
features['bathroom_type'].unique()

array(['private bath', 'bath', 'shared bath', 'baths', 'shared baths',
       None, nan], dtype=object)

In [15]:
#missing values in features 
missing = pd.concat([features.isnull().sum(), 100 * features.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count')

Unnamed: 0,count,%
name,0,0.0
price,0,0.0
neighbourhood_cleansed,0,0.0
room_type,0,0.0
minimum_nights,0,0.0
accommodates,0,0.0
bathrooms_text,3,0.036919
bathroom_count,3,0.036919
bathroom_type,4,0.049225
beds,47,0.57839


In [16]:
features[features['bedrooms'].isnull()].head(5)

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type
41,Private Loft Apt Vandy / Belmont,$99.00,District 18,Entire home/apt,2,4,1 bath,,2.0,1.0,bath
61,Cozy Guesthouse in 12S Neighborhood,$96.00,District 18,Entire home/apt,2,3,1 bath,,2.0,1.0,bath
82,Beautiful Loft Apt East Nashville,$91.00,District 6,Entire home/apt,2,2,1 bath,,2.0,1.0,bath
101,Nashville Carriage House,$76.00,District 6,Entire home/apt,1,2,1 bath,,1.0,1.0,bath
119,Sunny treetop loft walk to 5 points,$139.00,District 6,Entire home/apt,1,2,1 bath,,1.0,1.0,bath


In [17]:
#if the bedrooms is NA and it is an entire home/ apt the row should be dropped as it is missing valuable data that cannot be assumed 

to_remove = features[features['bedrooms'].isnull()][features['room_type'] == 'Entire home/apt']
to_remove.head()


  to_remove = features[features['bedrooms'].isnull()][features['room_type'] == 'Entire home/apt']


Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type
41,Private Loft Apt Vandy / Belmont,$99.00,District 18,Entire home/apt,2,4,1 bath,,2.0,1.0,bath
61,Cozy Guesthouse in 12S Neighborhood,$96.00,District 18,Entire home/apt,2,3,1 bath,,2.0,1.0,bath
82,Beautiful Loft Apt East Nashville,$91.00,District 6,Entire home/apt,2,2,1 bath,,2.0,1.0,bath
101,Nashville Carriage House,$76.00,District 6,Entire home/apt,1,2,1 bath,,1.0,1.0,bath
119,Sunny treetop loft walk to 5 points,$139.00,District 6,Entire home/apt,1,2,1 bath,,1.0,1.0,bath


In [18]:
#list of labels for to_remove df 
to_remove_labels = list(to_remove.index)
features = features.drop(labels = to_remove_labels)
features[features['bedrooms'].isnull()][features['room_type'] == 'Entire home/apt']

  features[features['bedrooms'].isnull()][features['room_type'] == 'Entire home/apt']


Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type


In [19]:
#what is still remaining of bedrooms with null values? 
features[features['bedrooms'].isnull()]

#private rooms are remaining... should bedrooms automatically be assumed as 1? Does this logic fall short? 

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type
799,Remodeled Apart. within 15 minutes of main events,$80.00,District 16,Private room,1,2,1 private bath,,1.0,1.0,private bath
964,East Nashville Artists Bungalow - $0 Cleaning Fee,$135.00,District 6,Private room,2,2,1 private bath,,1.0,1.0,private bath
1070,East Nashville Musician's Loft--Walk to 5 points!,$80.00,District 6,Private room,2,5,1.5 baths,,1.0,1.5,baths
1073,"Quite Bordeaux house, < 5 miles to Downtown",$69.00,District 2,Private room,1,2,1 shared bath,,1.0,1.0,shared bath
1167,Eclipse room,$400.00,District 19,Private room,1,2,1 private bath,,1.0,1.0,private bath
1755,Big Comfy Cozy Family Room,$99.00,District 1,Private room,2,4,1 private bath,,4.0,1.0,private bath
1768,Studio 154 - Deluxe Posh Suite Two Queens,$239.00,District 19,Private room,1,4,1 private bath,,1.0,1.0,private bath
1770,Studio 154 - Posh Suite,$212.00,District 19,Private room,1,2,1 private bath,,1.0,1.0,private bath
1772,Studio 154 - Deluxe Posh Suite Two Queens w/ ...,$258.00,District 19,Private room,1,6,1 private bath,,3.0,1.0,private bath
1773,Studio 154 - Riverview Suite 2 King,$304.00,District 19,Private room,1,4,1 private bath,,2.0,1.0,private bath


In [20]:
#check beds and bedrooms 
features[features['bedrooms'].isnull()][features['beds'].isnull()]

  features[features['bedrooms'].isnull()][features['beds'].isnull()]


Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type
1774,Studio 154 - Riverview Suite 1 King,$291.00,District 19,Private room,1,2,1 private bath,,,1.0,private bath
4006,Hotel Preston,$0.00,District 13,Hotel room,1,0,,,,,


In [21]:
#replace private rooms bedrooms NaN with 1 since a private room entails one room  
features['bedrooms']= features['bedrooms'].fillna(1.0)
features[features['bedrooms'].isnull()]

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type


In [22]:
#missing values in features 
missing = pd.concat([features.isnull().sum(), 100 * features.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count')

Unnamed: 0,count,%
name,0,0.0
price,0,0.0
neighbourhood_cleansed,0,0.0
room_type,0,0.0
minimum_nights,0,0.0
accommodates,0,0.0
bedrooms,0,0.0
bathrooms_text,3,0.038241
bathroom_count,3,0.038241
bathroom_type,4,0.050988


In [23]:
#what rows contain missing beds?
features[features['beds'].isnull()]

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type
156,Beautiful Victorian in heart of 5 points/downtown,$306.00,District 6,Entire home/apt,2,8,2.5 baths,3.0,,2.5,baths
207,"Charming flat in heart of Nashville, Sylvan Park",$120.00,District 24,Entire home/apt,1,3,1 bath,1.0,,1.0,bath
785,Modern Space in Scenic Neighborhood,$75.00,District 23,Entire home/apt,30,4,1 bath,1.0,,1.0,bath
1774,Studio 154 - Riverview Suite 1 King,$291.00,District 19,Private room,1,2,1 private bath,1.0,,1.0,private bath
2266,Private guest room and bathroom in East Nash home,$71.00,District 5,Private room,1,2,1 private bath,1.0,,1.0,private bath
2810,Cozy Nashville Room/Newly Remodeled Near Opryland,$49.00,District 15,Private room,30,1,1 shared bath,1.0,,1.0,shared bath
2922,90% off hotel prices,$30.00,District 33,Private room,30,1,1 shared bath,1.0,,1.0,shared bath
2980,Master bedroom/ en suite,$35.00,District 33,Private room,30,1,3 shared baths,4.0,,3.0,shared baths
3307,Sonder Belcourt | Two-Bedroom Apartment w/ Bal...,$148.00,District 18,Entire home/apt,2,6,2 baths,2.0,,2.0,baths
3401,Sonder The Saddlery | Two-Bedroom Apartment,$302.00,District 19,Entire home/apt,2,6,1.5 baths,2.0,,1.5,baths


In [24]:
#drop all of the missing beds since it cannot be assumed at any level. 

#check to see if anything else is NA
features[features['bathrooms_text'].isnull()]

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bathrooms_text,bedrooms,beds,bathroom_count,bathroom_type
2472,Nashville Riverfront Lofts - Deluxe Penthouse #9,"$1,364.00",District 19,Entire home/apt,1,9,,3.0,4.0,,
2474,Downtown Nashville! Market Street Unit #4,$215.00,District 19,Entire home/apt,1,2,,1.0,1.0,,
4006,Hotel Preston,$0.00,District 13,Hotel room,1,0,,1.0,,,


In [25]:
#drop all of these as well.... drop all the rest of the na values in the features data frame
features = features.dropna()

In [26]:
#double checking
#missing values in features 
missing = pd.concat([features.isnull().sum(), 100 * features.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count')

Unnamed: 0,count,%
name,0,0.0
price,0,0.0
neighbourhood_cleansed,0,0.0
room_type,0,0.0
minimum_nights,0,0.0
accommodates,0,0.0
bathrooms_text,0,0.0
bedrooms,0,0.0
beds,0,0.0
bathroom_count,0,0.0


In [27]:
#drop the bathrooms_text since it is redundant 
features.drop(columns = ['bathrooms_text'])

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bedrooms,beds,bathroom_count,bathroom_type
0,Nashville Charm,$40.00,District 6,Private room,30,2,2.0,3.0,1.0,private bath
1,Large Main Suite near Lake *ladies only NS plz,$45.00,District 12,Private room,30,1,1.0,1.0,1.0,private bath
2,Vandy/Belmont/10 mins to Broadway - Sunny 800 ...,$90.00,District 18,Entire home/apt,2,4,2.0,2.0,1.0,bath
3,"SuperSweetSTUDIO, jacuzzi, open Nov 23, 6 mo",$39.00,District 12,Private room,30,3,1.0,5.0,1.0,private bath
4,"MorningstarHouse, monthly room- open Aug 19",$33.00,District 12,Private room,30,1,1.0,3.0,1.0,shared bath
...,...,...,...,...,...,...,...,...,...,...
8122,"3-BR Nashville Dream, close to Lower Broadway","$1,400.00",District 19,Entire home/apt,1,8,3.0,4.0,3.0,baths
8123,"Modern, Eclectic Entire Home minutes from down...","$1,400.00",District 24,Entire home/apt,4,8,4.0,5.0,3.5,baths
8124,Newly Built Home + Perfect Locations,$369.00,District 23,Entire home/apt,30,9,4.0,4.0,4.0,baths
8125,Walk to Broadway! Parking available!,$50.00,District 19,Entire home/apt,1,6,1.0,3.0,1.0,bath


In [30]:
# save the data to a new csv file

from library.sb_utils import save_file

datapath = '../data'
save_file(listings_data, 'listings_data_cleaned.csv', datapath)

ModuleNotFoundError: No module named 'library.sb_utils'

In [None]:
#Will amenities be warranted?

In [33]:
#seperate out amenities 
amenities = listings[['amenities']]
amenities.iloc[0]

amenities    ["Coffee maker: pour-over coffee", "First aid ...
Name: 0, dtype: object

In [45]:
#find every unique amenity

all_amenities = amenities['amenities'].tolist()


unique_values = [list(i) for i in set(tuple(i) for i in all_amenities)]


all_amenities

['["Coffee maker: pour-over coffee", "First aid kit", "Free street parking", "Fire extinguisher", "Wifi", "Washer \\u2013\\u00a0In building", "Mini fridge", "Hair dryer", "Shampoo", "Microwave", "Long term stays allowed", "Extra pillows and blankets", "Lock on bedroom door", "Children\\u2019s books and toys", "Heating", "Carbon monoxide alarm", "Hangers", "Ethernet connection", "Essentials", "Hot water", "Dishes and silverware", "Iron", "Host greets you", "Bed linens", "TV", "Backyard", "Portable fans", "Dryer \\u2013 In building", "Luggage dropoff allowed", "Smoke alarm", "Kitchen", "Air conditioning", "Bathtub"]',
 '["Books and reading material", "First aid kit", "Oven", "Free street parking", "Free parking on premises", "Fire extinguisher", "Wifi", "Hair dryer", "Body soap", "Outdoor furniture", "Cooking basics", "55\\" TV", "Laundromat nearby", "Hot water kettle", "Microwave", "Long term stays allowed", "Extra pillows and blankets", "Lock on bedroom door", "Refrigerator", "Heating"