## Business + Data Understanding

Gather necessary data to answer your questions

Handle categorical and missing data

Provide insight into the methods you chose and why you chose them

In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline

# Read the Seattle listings, calendar and reviews CSV files (in that order)
# from the notebook's working directory into one DataFrame each.
listing_df, calendar_df, reviews_df = map(
    pd.read_csv,
    (
        './listings_seattle.csv',
        './calendar_seattle.csv',
        './reviews_seattle.csv',
    ),
)

### Understanding the data

We'll start with `listing_df`, the listings table loaded from `listings_seattle.csv`.

In [86]:
# Dimensions of the listings table as (rows, columns)
listing_df.shape

(3818, 92)

In [87]:
# Show every column label available in the listings table
listing_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', '

In [88]:
# Distinct raw values found in host_verifications (set() collapses repeats)
set(listing_df['host_verifications'])

{'None',
 "['email', 'facebook', 'google', 'linkedin', 'reviews', 'kba']",
 "['email', 'facebook', 'reviews', 'jumio']",
 "['email', 'facebook', 'reviews', 'kba']",
 "['email', 'facebook', 'reviews']",
 "['email', 'facebook']",
 "['email', 'linkedin', 'reviews', 'jumio']",
 "['email', 'phone', 'facebook', 'amex', 'reviews', 'jumio']",
 "['email', 'phone', 'facebook', 'amex', 'reviews', 'kba']",
 "['email', 'phone', 'facebook', 'google', 'amex', 'reviews', 'jumio']",
 "['email', 'phone', 'facebook', 'google', 'jumio']",
 "['email', 'phone', 'facebook', 'google', 'kba']",
 "['email', 'phone', 'facebook', 'google', 'linkedin', 'amex', 'reviews', 'jumio']",
 "['email', 'phone', 'facebook', 'google', 'linkedin', 'amex', 'reviews', 'kba']",
 "['email', 'phone', 'facebook', 'google', 'linkedin', 'kba']",
 "['email', 'phone', 'facebook', 'google', 'linkedin', 'reviews', 'jumio', 'kba']",
 "['email', 'phone', 'facebook', 'google', 'linkedin', 'reviews', 'jumio']",
 "['email', 'phone', 'facebook

### Feature Classification

In [None]:
# Thematic groups of listing columns. Each list only names columns; nothing is
# selected or modified here.

# Identifiers of a listing
gen_cols = [
    'id',
    'name',
]

# Free-text descriptions written for the listing
text_cols = [
    'summary',
    'space',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
]

# Everything describing the host
host_cols = [
    'host_id',
    'host_url',
    'host_name',
    'host_since',
    'host_location',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'host_listings_count',
    'host_total_listings_count',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
]

# Where the listing is located
loc_cols = [
    'street',
    'neighbourhood',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'city',
    'state',
    'zipcode',
    'smart_location',
    'latitude',
    'longitude',
    'is_location_exact',
]

# Physical characteristics of the property
apt_cols = [
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'amenities',
    'square_feet',
]

# Prices, fees and deposits
price_cols = [
    'price',
    'weekly_price',
    'monthly_price',
    'security_deposit',
    'cleaning_fee',
    'guests_included',
    'extra_people',
]

# Stay-length limits and availability counters
aval_cols = [
    'minimum_nights',
    'maximum_nights',
    'calendar_updated',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
]

# Review counts, dates and scores (guest satisfaction)
rev_cols = [
    'number_of_reviews',
    'first_review',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
]

# Booking rules and guest requirements
book_cols = [
    'instant_bookable',
    'cancellation_policy',
    'require_guest_profile_picture',
    'require_guest_phone_verification',
]


### Cleaning the data

In [63]:
# Find the columns that hold exactly one distinct value (NaN counts as a
# value) and remove them from listing_df; their names are kept in uni_cols.

uni_cols = [
    name
    for name in listing_df.columns
    if listing_df[name].nunique(dropna=False) == 1
]

listing_df = listing_df.drop(columns=uni_cols)

In [82]:
# Print one line for every column whose share of null entries exceeds 25%

null_share = listing_df.isnull().mean()

for col_name, share in null_share[null_share > 0.25].items():
    print("The column {} has {:.1f}% of missing values".format(col_name , 100*share))

The column neighborhood_overview has 27.0% of missing values
The column notes has 42.1% of missing values
The column square_feet has 97.5% of missing values
The column weekly_price has 47.4% of missing values
The column monthly_price has 60.3% of missing values
The column security_deposit has 51.1% of missing values
The column cleaning_fee has 27.0% of missing values


In [9]:
# Column labels currently present in listing_df
listing_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', '

In [50]:
test = lambda x: float(x.replace('$', '').replace(',', ''))

# Preview the converted prices; the result is only displayed here and is not
# assigned back, so listing_df['price'] itself is left unchanged
listing_df['price'].apply(test)

0        85.0
1       150.0
2       975.0
3       100.0
4       450.0
        ...  
3813    359.0
3814     79.0
3815     93.0
3816     99.0
3817     87.0
Name: price, Length: 3818, dtype: float64