In [216]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error

from sklearn.model_selection import GridSearchCV

In [None]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [None]:
train.head(10).T

In [None]:
test.head(1)

In [None]:
# column names
train.columns

In [None]:
# null value count for each column
train.isnull().sum()

In [None]:
# unique values in each column
train.apply(pd.Series.nunique)

In [None]:
# overview of data ranges, min, max, mean, etc.
train.describe()

In [None]:
# data types of each column
train.dtypes

In [None]:
#features with too many unique variables / provides low value and should be dropped
# id, amenities, description, first_review, name, thumbnail_url
train_processed = train.drop(['id', 'amenities', 'description', 'first_review', 'name', 'thumbnail_url'], axis=1)

In [None]:
# change date to numeric data type
# host_since, last_review, 
train_processed['host_since'] = pd.to_datetime(train_processed['host_since'])
train_processed['last_review'] = pd.to_datetime(train_processed['last_review'])

In [215]:
# deal with NA
train_processed['bathrooms'] = train_processed['bathrooms'].fillna(train_processed['bathrooms'].mean())
train_processed['host_has_profile_pic'] = train_processed['host_has_profile_pic'].fillna('NA')
train_processed['host_identity_verified'] = train_processed['host_identity_verified'].fillna('NA')
train_processed['neighbourhood'] = train_processed['neighbourhood'].fillna('NA')

train_processed['review_scores_rating'] = train_processed['review_scores_rating'].fillna(train_processed['review_scores_rating'].mean())

# host response rate has percentages in text form, need to convert
train_processed['host_response_rate'] = train_processed['host_response_rate'].str.rstrip('%').astype('float') / 100.0
train_processed['host_response_rate'] = train_processed['host_response_rate'].fillna(train_processed['host_response_rate'].mean())


train_processed['host_since'] = train_processed['host_since'].fillna('NA')
train_processed['host_silast_reviewnce'] = train_processed['last_review'].fillna('NA')
train_processed['zipcode'] = train_processed['zipcode'].fillna('NA')
train_processed['host_since'] = train_processed['host_since'].fillna('NA')
train_processed['last_review'] = train_processed['last_review'].fillna('NA')

train_processed['bedrooms'] = train_processed['bedrooms'].fillna(train_processed['bedrooms'].mean())
train_processed['beds'] = train_processed['beds'].fillna(train_processed['beds'].mean())

train_processed.isnull().sum()


log_price                     0
property_type                 0
room_type                     0
accommodates                  0
bathrooms                     0
bed_type                      0
cancellation_policy           0
cleaning_fee                  0
city                          0
host_has_profile_pic          0
host_identity_verified        0
host_response_rate            0
host_since                    0
instant_bookable              0
last_review               15827
latitude                      0
longitude                     0
neighbourhood                 0
number_of_reviews             0
review_scores_rating          0
zipcode                       0
bedrooms                      0
beds                          0
host_silast_reviewnce         0
dtype: int64

In [None]:
# one-hot encoding for categorical varuables
# property_type, property_type, room_type, bed_type, cancellation_policy, 
# city, host_has_profile_pic, host_identity_verified, neighbourhood, 

#change categorical data into dummy variables, need to define a function so that 
#when new data comes into the pipeline, it can handle
def make_dummies(test_col, train_unique_vals, col_name):
    """
    Return a df containing len(train_unique_vals) columns for 
    each unique value in train_unique_vals. If the test_col has more 
    unique values that are not seen in train_unique_vals, value
    will be 0
    """
    dummies = {}
    for val in train_unique_vals:
        dummies[col_name + '_' + val] = (test_col == val).astype(int)
    return pd.DataFrame(dummies, index = test_col.index)

categories = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 
'city', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood' ]
data_transformed = train_processed
for category in categories:
    temp_df = make_dummies(train_processed[category], train_processed[category].unique(), category)
    data_transformed = pd.concat([train_processed, temp_df], axis = 1)
    train_processed = data_transformed

data_transformed.head()

In [None]:
def get_model_error(model, X, y, test_size):
    trainX, testX, trainY, testY = train_test_split(X, y, test_size = test_size)
    model.fit(trainX, trainY)
    pred = model.predict(testX)
    predX = model.predict(trainX)
    print(np.sqrt(mean_squared_log_error(trainY, predX)),
    np.sqrt(mean_squared_log_error(testY, pred)))
    return np.sqrt(mean_squared_log_error(trainY, predX)), np.sqrt(mean_squared_log_error(testY, pred))

In [None]:
y = train.loc[:, 'log_price']
X = train.drop(['log_price'], axis=1)

model_rf = RandomForestRegressor(n_jobs = -1)
get_model_error(model_rf, X, y, 0.3)