In [1]:
import csv
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory holding the Airbnb CSV files. Set the AIRBNB_DATA_DIR environment
# variable to point somewhere else; when it is unset the original local path
# is used, so existing setups behave exactly as before.
data_dir = os.environ.get(
    'AIRBNB_DATA_DIR',
    '/Users/xzhou/github/project_archives/files_airbnb/sf_airbnb',
)

# Path template: '{}' is filled with a CSV file's base name.
file_url = os.path.join(data_dir, '{}.csv')

listings_summary = file_url.format('listings_summary')
reviews_file = file_url.format('reviews_details')
calendar_file = file_url.format('calendar_details')

In [3]:
# Load the listings summary CSV; 'last_review' is parsed into datetimes.
df_listings = pd.read_csv(
    listings_summary,
    parse_dates=['last_review'],
)

In [4]:
# (rows, columns) of the freshly loaded listings frame.
df_listings.shape

(6633, 16)

In [5]:
# Preview the first rows of the raw listings data.
df_listings.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,958,"Bright, Modern Garden Unit - 1BR/1B",1169,Holly,,Western Addition,37.76931,-122.433856,Entire home/apt,170,1,152,2018-07-29,1.38,1,122
1,5858,Creative Sanctuary,8904,Philip And Tania,,Bernal Heights,37.745112,-122.421018,Entire home/apt,235,30,112,2017-08-06,0.99,1,365
2,7918,A Friendly Room - UCSF/USF - San Francisco,21994,Aaron,,Haight Ashbury,37.76669,-122.452505,Private room,65,32,17,2016-11-21,0.16,9,365
3,8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,21994,Aaron,,Haight Ashbury,37.764872,-122.451828,Private room,65,32,7,2017-08-12,0.15,9,328
4,8339,Historic Alamo Square Victorian,24215,Rosy,,Western Addition,37.775249,-122.436374,Entire home/apt,675,7,26,2017-11-10,0.24,2,78


In [6]:
# List every column name so the ones to discard can be picked out.
df_listings.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [7]:
# Columns removed from the listings frame before modelling.
irrlevant_columns_to_drop = [
    'name',
    'host_id',
    'host_name',
    'neighbourhood_group',
]

In [8]:
# New frame without the discarded columns; df_listings itself is left intact.
df_listings_smaller = df_listings.drop(columns=irrlevant_columns_to_drop)

In [9]:
# Check the dtype of each remaining column.
df_listings_smaller.dtypes

id                                         int64
neighbourhood                             object
latitude                                 float64
longitude                                float64
room_type                                 object
price                                      int64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[ns]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
dtype: object

In [10]:
# Current calendar year; duration() below measures years relative to it.
datetime.now().year

2018

In [11]:
def duration(starting_time, now=None):
    """Count the calendar years from ``starting_time`` up to a reference time.

    Only the ``.year`` attribute of each argument is used, and the count
    includes both end years: a ``starting_time`` in the reference year gives
    1, one in the previous year gives 2, and so on.

    Parameters
    ----------
    starting_time : datetime-like
        Object exposing a ``.year`` attribute.
    now : datetime-like, optional
        Reference time. Defaults to ``datetime.now()`` evaluated at call
        time; pass a fixed value to get reproducible results.

    Returns
    -------
    number
        ``now.year - starting_time.year + 1``. If ``starting_time.year`` is
        NaN (as it is for a missing pandas timestamp), the result is NaN.
    """
    if now is None:
        now = datetime.now()
    return now.year - starting_time.year + 1

In [12]:
# Derive a per-listing year count from the last review date via duration().
df_listings_smaller['yrs_since_last_review'] = (
    df_listings_smaller['last_review'].apply(duration)
)

In [13]:
# Preview the frame to check the new yrs_since_last_review column.
df_listings_smaller.head()

Unnamed: 0,id,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,yrs_since_last_review
0,958,Western Addition,37.76931,-122.433856,Entire home/apt,170,1,152,2018-07-29,1.38,1,122,1.0
1,5858,Bernal Heights,37.745112,-122.421018,Entire home/apt,235,30,112,2017-08-06,0.99,1,365,2.0
2,7918,Haight Ashbury,37.76669,-122.452505,Private room,65,32,17,2016-11-21,0.16,9,365,3.0
3,8142,Haight Ashbury,37.764872,-122.451828,Private room,65,32,7,2017-08-12,0.15,9,328,2.0
4,8339,Western Addition,37.775249,-122.436374,Entire home/apt,675,7,26,2017-11-10,0.24,2,78,2.0


In [14]:
# yrs_since_last_review now carries the information, so the raw timestamp
# column is removed. Rebinding with errors='ignore' (instead of
# inplace=True) keeps the cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
df_listings_smaller = df_listings_smaller.drop(columns='last_review', errors='ignore')

In [15]:
# Count missing values per column.
df_listings_smaller.isnull().sum()

id                                   0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
reviews_per_month                 1142
calculated_host_listings_count       0
availability_365                     0
yrs_since_last_review             1142
dtype: int64

In [16]:
# Discard, in place, every row that has at least one missing value.
df_listings_smaller.dropna(axis=0, how='any', inplace=True)

In [17]:
# (rows, columns) remaining after rows with missing values were dropped.
df_listings_smaller.shape

(5491, 12)

In [18]:
# Re-check dtypes; the object-typed columns are the ones still to be encoded.
df_listings_smaller.dtypes

id                                  int64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
yrs_since_last_review             float64
dtype: object

In [19]:
# Categorical columns to be one-hot encoded.
categorical_data = [
    'neighbourhood',
    'room_type',
]

In [20]:
# One-hot encode the categorical columns; each indicator column is named
# '<original column>_<category value>'.
df_listings_cleaned = pd.get_dummies(
    data=df_listings_smaller,
    columns=categorical_data,
    prefix=categorical_data,
)

In [21]:
# Inspect the column names produced by the one-hot encoding.
df_listings_cleaned.columns

Index(['id', 'latitude', 'longitude', 'price', 'minimum_nights',
       'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365',
       'yrs_since_last_review', 'neighbourhood_Bayview',
       'neighbourhood_Bernal Heights', 'neighbourhood_Castro/Upper Market',
       'neighbourhood_Chinatown', 'neighbourhood_Crocker Amazon',
       'neighbourhood_Diamond Heights', 'neighbourhood_Downtown/Civic Center',
       'neighbourhood_Excelsior', 'neighbourhood_Financial District',
       'neighbourhood_Glen Park', 'neighbourhood_Golden Gate Park',
       'neighbourhood_Haight Ashbury', 'neighbourhood_Inner Richmond',
       'neighbourhood_Inner Sunset', 'neighbourhood_Lakeshore',
       'neighbourhood_Marina', 'neighbourhood_Mission',
       'neighbourhood_Nob Hill', 'neighbourhood_Noe Valley',
       'neighbourhood_North Beach', 'neighbourhood_Ocean View',
       'neighbourhood_Outer Mission', 'neighbourhood_Outer Richmond',
       'neighbourhood_O

In [22]:
# Preview the encoded frame.
df_listings_cleaned.head()

Unnamed: 0,id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,yrs_since_last_review,...,neighbourhood_Russian Hill,neighbourhood_Seacliff,neighbourhood_South of Market,neighbourhood_Twin Peaks,neighbourhood_Visitacion Valley,neighbourhood_West of Twin Peaks,neighbourhood_Western Addition,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,958,37.76931,-122.433856,170,1,152,1.38,1,122,1.0,...,0,0,0,0,0,0,1,1,0,0
1,5858,37.745112,-122.421018,235,30,112,0.99,1,365,2.0,...,0,0,0,0,0,0,0,1,0,0
2,7918,37.76669,-122.452505,65,32,17,0.16,9,365,3.0,...,0,0,0,0,0,0,0,0,1,0
3,8142,37.764872,-122.451828,65,32,7,0.15,9,328,2.0,...,0,0,0,0,0,0,0,0,1,0
4,8339,37.775249,-122.436374,675,7,26,0.24,2,78,2.0,...,0,0,0,0,0,0,1,1,0,0


In [23]:
# Absolute pairwise correlations between all columns.
corr_matrix = df_listings_cleaned.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair of columns is looked at once. The builtin ``bool`` is used because the
# ``np.bool`` alias was deprecated in NumPy 1.20 and removed in 1.24, where
# it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Columns whose absolute correlation with an earlier column exceeds 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [24]:
# Show which columns were flagged as highly correlated.
to_drop

['room_type_Private room']

In [26]:
# Drop the highly correlated features. Rebinding with errors='ignore'
# (instead of inplace=True) keeps the cell safe to re-run: columns that are
# already gone are skipped rather than raising a KeyError.
df_listings_cleaned = df_listings_cleaned.drop(columns=to_drop, errors='ignore')

In [27]:
# Check the remaining columns after the correlated features were removed.
df_listings_cleaned.columns

Index(['id', 'latitude', 'longitude', 'price', 'minimum_nights',
       'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365',
       'yrs_since_last_review', 'neighbourhood_Bayview',
       'neighbourhood_Bernal Heights', 'neighbourhood_Castro/Upper Market',
       'neighbourhood_Chinatown', 'neighbourhood_Crocker Amazon',
       'neighbourhood_Diamond Heights', 'neighbourhood_Downtown/Civic Center',
       'neighbourhood_Excelsior', 'neighbourhood_Financial District',
       'neighbourhood_Glen Park', 'neighbourhood_Golden Gate Park',
       'neighbourhood_Haight Ashbury', 'neighbourhood_Inner Richmond',
       'neighbourhood_Inner Sunset', 'neighbourhood_Lakeshore',
       'neighbourhood_Marina', 'neighbourhood_Mission',
       'neighbourhood_Nob Hill', 'neighbourhood_Noe Valley',
       'neighbourhood_North Beach', 'neighbourhood_Ocean View',
       'neighbourhood_Outer Mission', 'neighbourhood_Outer Richmond',
       'neighbourhood_O

In [59]:
# Columns excluded from the feature matrix (this includes the target, 'price').
# NOTE(review): despite its name, this list is not a test set.
test = [
    'id',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
]

In [60]:
# Target: the 'price' column.
y = df_listings_cleaned['price']
# Features: every column except those listed in `test`.
X = df_listings_cleaned.drop(columns=test)

In [61]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [62]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to the held-out features.
ss = StandardScaler()
X_train_trans, X_test_trans = ss.fit_transform(X_train), ss.transform(X_test)

In [63]:
# Fit an ordinary least-squares model on the scaled training features.
# fit() returns the estimator itself, so this one-liner binds the fitted model.
lm = linear_model.LinearRegression().fit(X_train_trans, y_train)
# Echo the fitted estimator as the cell output.
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [64]:
# Score the fitted model on the held-out, scaled features. For scikit-learn
# regressors, score() reports the coefficient of determination (R^2).
lm.score(X=X_test_trans, y=y_test)

0.15319359258089005