In [2]:
#Pandas and numpy for data frame manipulation
import pandas as pd
import numpy as np

#Webscraping tools
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import time
import os

#Visualization
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

import datetime
import re

#Wordcloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

#Folium
import folium
from folium.plugins import HeatMap


# Plot styling: matplotlib's 'seaborn' style sheet plus seaborn's "husl" colour palette.
plt.style.use('seaborn')
sns.set_palette("husl")
# Render figures inline in the notebook, using the SVG figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

### Read in the data

The dataset was obtained from the [Inside Airbnb website](http://insideairbnb.com/get-the-data.html).

In [3]:
# Load the listings CSV; the path is relative to the notebook's working directory.
df_new = pd.read_csv('listings_september.csv')

In [4]:
df_new.shape

(7776, 96)

In [5]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7776 entries, 0 to 7775
Data columns (total 96 columns):
id                                  7776 non-null int64
listing_url                         7776 non-null object
scrape_id                           7776 non-null int64
last_scraped                        7776 non-null object
name                                7776 non-null object
summary                             7604 non-null object
space                               5983 non-null object
description                         7696 non-null object
experiences_offered                 7776 non-null object
neighborhood_overview               5524 non-null object
notes                               4127 non-null object
transit                             5699 non-null object
access                              5389 non-null object
interaction                         5140 non-null object
house_rules                         5439 non-null object
thumbnail_url                       0 no

In [6]:
df_new.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

### Wordcloud

In [None]:
def create_wordcloud(dataframe, columnName):
    '''
    Build a word cloud from the text in one dataframe column, draw it with
    matplotlib and save it as "<columnName>.png" in the working directory.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text column.
    columnName : str
        Name of the column to read; also used as the stem of the PNG file name.

    Returns
    -------
    Whatever ``WordCloud.to_file`` returns for the saved image.
    '''
    # Skip missing values: str(NaN) would otherwise add the token "nan"
    # to the cloud once per empty cell.
    text = " ".join(str(review) for review in dataframe[columnName].dropna())
    # Count whitespace-separated words; len(text) would count characters.
    print ("There are {} words in the combination of all {}.".format(len(text.split()), columnName))
    wordcloud = WordCloud(width=800, height=400,background_color="white").generate(text)
    plt.figure(figsize=(20,10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    file = wordcloud.to_file(columnName+".png")

    return file

In [None]:
# NOTE(review): `df` is not defined in the visible part of this notebook (the listings frame loaded above is `df_new`) - confirm which frame is intended.
create_wordcloud(df, 'name')

In [None]:
# NOTE(review): `df_wc` is not defined in the visible part of this notebook - confirm where it is created before running this cell.
create_wordcloud(df_wc, 'transit')

### Folium heat map

In [None]:
# Separate read of the listings CSV, so the heat map works from the raw file rather than from `df_new`.
for_chi = pd.read_csv('listings_september.csv')

In [None]:
### Not necessary if the price column has already been cleaned
# Strip "$" and the thousands separator "," before parsing: with only the
# leading "$" removed, values such as "$1,200.00" fail to parse and end up as 0.
# astype(str) keeps the cell re-runnable when the column is already numeric.
for_chi['price'] = for_chi['price'].astype(str).str.replace(r'[$,]', '', regex=True)
for_chi['price'] = pd.to_numeric(for_chi['price'], errors='coerce').fillna(0)

In [None]:
# Largest price in the data; passed to the heat map below as its max_val.
max_amount = float(for_chi['price'].max())

In [None]:
# Base map positioned at latitude 41.8, longitude -87.6.
hchi = folium.Map(location=[41.8, -87.6], zoom_start=10)

# Heat layer built from one (latitude, longitude, price) triple per listing,
# with price as the point weight.
hm_wide = HeatMap(
    list(
        zip(
            for_chi.latitude.values,
            for_chi.longitude.values,
            for_chi.price.values,
        )
    ),
    min_opacity=0.2,
    max_val=max_amount,
    radius=10,
    blur=18,
    max_zoom=1,
)

hchi.add_child(hm_wide)

# Write the map to a standalone HTML file in the working directory.
hchi.save('chi_map.html')

### Clean Zipcode

In [7]:
def clean_zipcode(dataframe):
    '''
    Normalise the zipcode column to a five-digit numeric code.

    The first five characters of each zipcode are kept (so "60614-1234"
    becomes 60614) and parsed as a float; values that are missing or not
    numeric become NaN instead of raising. The original column is kept as
    'zipcode_old' and the cleaned one takes over the name 'zipcode'.

    The frame is modified in place and also returned. Calling the function
    a second time on the same frame renames the cleaned column to
    'zipcode_old' again and therefore produces a duplicate column name.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'zipcode' column.

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'zipcode_old' (original) and 'zipcode' (float).
    '''
    # astype(str) lets .str work even when the CSV reader inferred a numeric
    # dtype for the column; strip() guards against padded values.
    five_chars = dataframe['zipcode'].astype(str).str.strip().str[:5]
    # errors='coerce' turns non-numeric leftovers (e.g. "nan", "n/a") into NaN.
    dataframe['zipcode_clean'] = pd.to_numeric(five_chars, errors='coerce').astype('float')
    dataframe.rename(columns = {'zipcode' : 'zipcode_old','zipcode_clean': 'zipcode'}, inplace=True)

    return dataframe

# clean_zipcode modifies df_new in place; the returned frame is shown as this cell's output.
clean_zipcode(df_new)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,zipcode
0,2384,https://www.airbnb.com/rooms/2384,20180914083817,2018-09-14,Hyde Park-Walk to UChicago or Theological Semi...,"As the sole guest in my quiet, vintage (1924) ...","The spacious bedroom has a queen size bed, che...","As the sole guest in my quiet, vintage (1924) ...",none,My building is located one block from beautifu...,...,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,strict_14_with_grace_period,f,f,1,2.94,60637.0
1,4505,https://www.airbnb.com/rooms/4505,20180914083817,2018-09-14,1 Great Apartment. 352 Great Reviews. 1 bad one.,Across the street from CTA train. Runs every 6...,"We travel a lot, we know what people need. We...",Across the street from CTA train. Runs every 6...,none,,...,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",t,f,moderate,f,f,1,3.09,60608.0
2,6715,https://www.airbnb.com/rooms/6715,20180914083817,2018-09-14,Lincoln Park Oasis - Unit 2 ONLY,Unit 1 & Unit 2 are rented separately. They ca...,License #: (Phone number hidden by Airbnb) Be...,Unit 1 & Unit 2 are rented separately. They ca...,none,Things To Do & Close to: - An awesome Children...,...,2114275,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,strict_14_with_grace_period,f,f,2,0.82,60614.0
3,7126,https://www.airbnb.com/rooms/7126,20180914083817,2018-09-14,Tiny Studio Apartment 94 Walk Score,,This is a very small studio apartment with a ...,This is a very small studio apartment with a ...,none,"Ukrainian Village was just named ""Hottest Neig...",...,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,moderate,f,f,1,2.78,60622.0
4,9811,https://www.airbnb.com/rooms/9811,20180914083817,2018-09-14,Barbara's Hideaway - Old Town,One-bedroom hideaway tucked into Old Town step...,"This lovely one bedroom ""hideaway"" is located ...",One-bedroom hideaway tucked into Old Town step...,none,Chicago’s Old Town neighborhood is squeezed be...,...,2079260,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",t,f,strict_14_with_grace_period,f,f,9,0.56,60614.0
5,10610,https://www.airbnb.com/rooms/10610,20180914083817,2018-09-14,3 Comforts of Cooperative Living,The condo is the 2nd floor in a lovely 1912 3-...,Newly furnished with queen bed and the comfort...,The condo is the 2nd floor in a lovely 1912 3-...,none,It's a 10 minute walk from the lakefront bike ...,...,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",t,f,moderate,f,f,6,0.65,60615.0
6,10945,https://www.airbnb.com/rooms/10945,20180914083817,2018-09-14,The Biddle House (#1),Beautiful first floor apartment in Historic Ol...,This 1st floor apartment has a queen size bed ...,Beautiful first floor apartment in Historic Ol...,none,,...,2120297,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",t,f,strict_14_with_grace_period,f,f,9,0.15,60614.0
7,12068,https://www.airbnb.com/rooms/12068,20180914083817,2018-09-14,Chicago GOLD COAST 1 Bedroom Condo,"Located in Chicago's Gold Coast / Old Town, st...",ONE MONTH MINIMUM. Parking available. Well-ap...,"Located in Chicago's Gold Coast / Old Town, st...",none,The condo is on Lasalle just off of Division; ...,...,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,strict_14_with_grace_period,f,f,2,0.19,60610.0
8,12140,https://www.airbnb.com/rooms/12140,20180914083817,2018-09-14,Lincoln Park Guest House,,Luxurious and modern accommodations in the hea...,Luxurious and modern accommodations in the hea...,none,The Guest House is in the heart of Chicago's m...,...,R17000022154,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,moderate,f,f,1,0.10,60614.0
9,22362,https://www.airbnb.com/rooms/22362,20180914083817,2018-09-14,*** Luxury in Chicago! 2BR/ 2Ba / Parking / BB...,Fantastic condo in a Great Location with cover...,Our home is Fully Furnished & Equipped!! Just...,Fantastic condo in a Great Location with cover...,none,"Our home overlooks the park and is on a quiet,...",...,City registration pending,"{""Illinois State"","" Cook County"","" IL"","" CHICA...",f,f,moderate,f,t,1,0.15,60642.0


### Clean Price

The cleaning process removes the `$` sign and converts each price column from `object` (string) dtype to `int64`.

In [8]:
def clean_price(dataframe, list_of_columns):
    '''
    Convert currency-formatted columns (e.g. "$1,250.00") to int64.

    Each column named in list_of_columns is overwritten in place. Values
    that are missing or cannot be parsed as a number become 0, and any
    fractional part is truncated by the integer cast.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the price columns.
    list_of_columns : list of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the listed columns as int64.
    '''

    for column in list_of_columns:
        # Remove "$" and the thousands separator ",". Dropping only the first
        # character leaves "1,250.00", which does not parse and would be
        # silently turned into 0. astype(str) also makes a re-run on already
        # numeric columns safe.
        stripped = dataframe[column].astype(str).str.replace(r'[$,]', '', regex=True)
        amounts = pd.to_numeric(stripped, errors='coerce').fillna(0)
        dataframe[column] = amounts.astype('int64')
    return dataframe

In [9]:
# Convert the four money columns to int64 (clean_price overwrites the columns of the frame it is given).
df_new = clean_price(df_new, ['price', 'security_deposit', 'cleaning_fee', 'extra_people'])

### Rent as a proxy for location

We used the rent data from Zillow to arrive at numerical proxies for the zipcode location.

In [10]:
# Load the rent data that is merged onto the listings by zipcode in the next cell.
df_rent = pd.read_csv("data/chicago-rent.csv")

In [11]:
# pd.merge defaults to an inner join: listings whose zipcode has no match in df_rent are dropped here.
df_new = pd.merge(df_new, df_rent, on ='zipcode')

### Datetime

In [12]:
def make_datetime_object(dataframe, list_of_columns):
    '''
    Convert the listed columns of a dataframe to pandas datetime values.

    Each column is overwritten in place. Missing values become NaT; a value
    that cannot be interpreted as a date raises, so bad data is not hidden.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the date columns.
    list_of_columns : list of str
        Names of the columns to convert (strings or date objects).

    Returns
    -------
    pandas.DataFrame
        The same frame, with the listed columns as datetime64.
    '''

    for column in list_of_columns:
        # No explicit format: a hard-coded '%m-%d-%Y' combined with
        # errors='ignore' returned ISO-formatted input ("2018-09-14")
        # unchanged, so pandas is left to infer the layout instead.
        dataframe[column] = pd.to_datetime(dataframe[column])
    return dataframe

In [13]:
# Reference date = the day the notebook is run, so anything derived from it changes between runs.
now = datetime.datetime.now().date()
print(now)

2018-10-15


In [14]:
# Store the run date on every row so it can be subtracted from other date columns.
df_new['now'] = now

In [16]:
# Convert the three date columns and the run-date column to datetime so they can be subtracted.
df_new = make_datetime_object(df_new, ['host_since', 'first_review', 'last_review', 'now'])

In [17]:
# Whole days between a listing's first and last review, stored as float.
# .dt.days is used because pandas 2.x rejects .astype('timedelta64[D]')
# (only s/ms/us/ns units are supported).
df_new['review_period'] = (df_new['last_review'] - df_new['first_review']).dt.days.astype('float64')

In [18]:
# Whole days between host_since and the run date, stored as float.
# .dt.days is used because pandas 2.x rejects .astype('timedelta64[D]')
# (only s/ms/us/ns units are supported).
df_new['number_of_days_as_host'] = (df_new['now'] - df_new['host_since']).dt.days.astype('float64')

### Clean amenities

Thanks, Ben!

In [19]:
def clean_amenities(dataframe):
    '''
    Replace the 'amenities' column with the number of amenities per row.

    Each value is expected to be a brace-wrapped, comma-separated string such
    as '{TV,Wifi,Kitchen}'. The first and last characters are removed and the
    remaining comma-separated items are counted. An empty set ('{}') and a
    missing value both count as 0.

    The frame is modified in place and also returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with an 'amenities' column.

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'amenities' holding integer counts.
    '''

    def _count_items(value):
        # Count the comma-separated items of one amenities string.
        if pd.isna(value):
            return 0
        inner = value[1:-1].strip()
        # ''.split(',') yields [''], which would report one amenity for '{}'.
        return len(inner.split(',')) if inner else 0

    dataframe['amenities'] = [_count_items(value) for value in dataframe['amenities']]

    return dataframe

# clean_amenities overwrites df_new['amenities'] in place; the returned frame is shown as this cell's output.
clean_amenities(df_new)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,zipcode,zvi,now,review_period,number_of_days_as_host
0,2384,https://www.airbnb.com/rooms/2384,20180914083817,2018-09-14,Hyde Park-Walk to UChicago or Theological Semi...,"As the sole guest in my quiet, vintage (1924) ...","The spacious bedroom has a queen size bed, che...","As the sole guest in my quiet, vintage (1924) ...",none,My building is located one block from beautifu...,...,strict_14_with_grace_period,f,f,1,2.94,60637.0,1400,2018-10-15,1311.0,3699.0
1,298321,https://www.airbnb.com/rooms/298321,20180914083817,2018-09-14,nice room with king size bed,,great room for rent near metro train station (...,great room for rent near metro train station (...,none,,...,flexible,f,f,1,0.53,60637.0,1400,2018-10-15,2275.0,2485.0
2,1810118,https://www.airbnb.com/rooms/1810118,20180914083817,2018-09-14,Huge Private 1BR/Full Bath near U of Chicago,HUGE Bedroom (22'x12' / 24.5m²) w/ private-acc...,"This quiet, sunny, spacious, private room is t...",HUGE Bedroom (22'x12' / 24.5m²) w/ private-acc...,none,Wake up and stop by the Robust Coffee shop for...,...,moderate,f,f,1,4.41,60637.0,1400,2018-10-15,1664.0,1824.0
3,2604454,https://www.airbnb.com/rooms/2604454,20180914083817,2018-09-14,Cozy Single-Family Home near University of Chi...,Comfortable House in Hyde Park: This beautiful...,The house is a wood-framed Eastlake Victorian ...,Comfortable House in Hyde Park: This beautiful...,none,The house is essentially on the University of ...,...,strict_14_with_grace_period,f,f,3,1.44,60637.0,1400,2018-10-15,1483.0,1670.0
4,3984383,https://www.airbnb.com/rooms/3984383,20180914083817,2018-09-14,The West Room near U of Chicago,Comfortable House in Hyde Park: This beautiful...,My house is a small old Victorian wood-framed ...,Comfortable House in Hyde Park: This beautiful...,none,,...,moderate,f,f,3,0.93,60637.0,1400,2018-10-15,1431.0,1670.0
5,4056327,https://www.airbnb.com/rooms/4056327,20180914083817,2018-09-14,The NorthEast Room near U of C,One of 3 private rooms listed in a comfy Eastl...,,One of 3 private rooms listed in a comfy Eastl...,none,,...,moderate,f,f,3,0.07,60637.0,1400,2018-10-15,364.0,1670.0
6,4409593,https://www.airbnb.com/rooms/4409593,20180914083817,2018-09-14,Comfy 2BR/1BA~Near Lakefront~Free Parking!,Our comfy and spacious condo is in a prime loc...,This is my fully furnished condo that I would ...,Our comfy and spacious condo is in a prime loc...,none,"Woodlawn, located in the South Side of the Cit...",...,moderate,f,f,1,0.45,60637.0,1400,2018-10-15,1196.0,2339.0
7,6576714,https://www.airbnb.com/rooms/6576714,20180914083817,2018-09-14,Private BR;Parking by UofC/HydePark,This comfortable condo is a stones throw away...,"Real bed; Room can accommodate 3, two in the b...",This comfortable condo is a stones throw away...,none,Woodlawn is in a blossoming area of Chicago. I...,...,moderate,f,f,2,2.26,60637.0,1400,2018-10-15,1126.0,1237.0
8,6724355,https://www.airbnb.com/rooms/6724355,20180914083817,2018-09-14,Cozy 2 Bedroom Private Level,Located a few blocks from the University of Ch...,This rehabbed condo is just a mile away from L...,Located a few blocks from the University of Ch...,none,"One of Chicago's best coffee shops ""Greenline ...",...,strict_14_with_grace_period,f,f,1,2.17,60637.0,1400,2018-10-15,1144.0,1265.0
9,6764230,https://www.airbnb.com/rooms/6764230,20180914083817,2018-09-14,Sofa surfing for an evening,Lovely room in an artistic home perfect for th...,,Lovely room in an artistic home perfect for th...,none,,...,flexible,f,f,2,0.33,60637.0,1400,2018-10-15,1156.0,1224.0


### Get dummies for the host_is_superhost column

In [21]:
# Indicator (dummy) encoding of host_is_superhost; drop_first=True omits the first category's column.
# NOTE(review): the dummy column is named after the raw category value (presumably 't') - confirm, and consider a descriptive prefix.
X = pd.get_dummies(df_new.host_is_superhost, drop_first=True)

In [22]:
# Attach the dummy column(s) to the listings; join aligns the two frames on their index.
df_clean4 = df_new.join(X)

In [24]:
# Save the final frame; index=False keeps the row index out of the CSV.
df_clean4.to_csv('data/listing_final.csv', index =False)