In the EDA notebook, we have made a few attempts at building a linear regression model, this time we will be introducing another feature to the dataset "amenities" and see if this improves our model

In [1]:
#import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import datasets

#Listings includes full descriptions and average review score
#Reviews includes unique id for each reviewer and detailed comments
#Calendar includes listing id and the price and availability for that day

df_listings = pd.read_csv('listings.csv')
df_reviews = pd.read_csv('reviews.csv')
df_calendar = pd.read_csv('calendar.csv')

In [3]:
df_listings_new = df_listings[['neighbourhood_cleansed','property_type','room_type','bathrooms','bedrooms', 'accommodates', 'beds', 
                               'bed_type','amenities', 'price' ]]

In [4]:
#let us perform some cleaning
#first we remove the comma in the price column, then split to remove the $ and finally convert to numeric

df_price = df_listings_new['price'].str.replace(',', '')
df_price = df_price.str.split('$', n=1, expand=True)
df_price_new = df_price.rename(columns = {1:'price_new'})
df_price_new = df_price_new['price_new'].apply(pd.to_numeric)

In [5]:
df_listings_newer = pd.concat([df_listings_new, df_price_new], axis=1)

In [6]:
# We have about 0.04% of missing data, we can drop them
df_listings_newer = df_listings_newer.dropna().reset_index(drop=True)

In [7]:
df_listings_newer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3554 entries, 0 to 3553
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   neighbourhood_cleansed  3554 non-null   object 
 1   property_type           3554 non-null   object 
 2   room_type               3554 non-null   object 
 3   bathrooms               3554 non-null   float64
 4   bedrooms                3554 non-null   float64
 5   accommodates            3554 non-null   int64  
 6   beds                    3554 non-null   float64
 7   bed_type                3554 non-null   object 
 8   amenities               3554 non-null   object 
 9   price                   3554 non-null   object 
 10  price_new               3554 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 305.5+ KB


In [8]:
#Using the z-score method for removing outliers and setting threshold as 3

z = np.abs(stats.zscore(df_listings_newer['price_new']))

threshold_z = 3

outlier_indices = np.where(z > threshold_z)[0]
df_no_outliers = df_listings_newer.drop(outlier_indices).reset_index(drop=True)

In [9]:
df_no_outliers['amenities']

0       {TV,"Wireless Internet",Kitchen,"Free Parking ...
1       {TV,Internet,"Wireless Internet","Air Conditio...
2       {TV,"Cable TV","Wireless Internet","Air Condit...
3       {TV,Internet,"Wireless Internet","Air Conditio...
4       {Internet,"Wireless Internet","Air Conditionin...
                              ...                        
3515    {Internet,"Wireless Internet","Air Conditionin...
3516    {TV,Internet,"Wireless Internet","Air Conditio...
3517    {"translation missing: en.hosting_amenity_49",...
3518    {Kitchen,Gym,"Family/Kid Friendly",Washer,Drye...
3519    {"Wireless Internet",Kitchen,Essentials,"trans...
Name: amenities, Length: 3520, dtype: object

In [10]:
# Now let us clean up amenities feature and split up into columns

# function to remove curly braces and double quotes in each cell
def remove_characters(s):
    return s.replace('{', '').replace('}', '').replace('"', '')

# Apply the function to each cell in the DataFrame
data = (pd.DataFrame(df_no_outliers['amenities'])).applymap(remove_characters)


#Next split text in each cell to a list of strings

# Iterate through each row in the DataFrame
for index, row in data.iterrows():
    # Iterate through each column in the row
    for col in data.columns:
        # Split the text using ',' as the delimiter and write the list back into the cell
        data.at[index, col] = row[col].split(',')


#Finally split amenities into several columns using each string as column name
        
# Initialize an empty DataFrame to store the results
result_df = pd.DataFrame()

# Iterate through each row in the DataFrame
for index, row in data.iterrows():
    # Iterate through each string in the list of strings
    for string in row['amenities']:
        # If the string is not already a column in result_df, create it and fill with 'No'
        if string not in result_df.columns:
            result_df[string] = 'No'
        # Fill 'Yes' in the cell for the current string
        result_df.at[index, string] = 'Yes'

# Fill remaining NaN values with 'No'
result_df.fillna('No', inplace=True)

data_new = pd.DataFrame(result_df)


In [11]:
data_new

Unnamed: 0,TV,Wireless Internet,Kitchen,Free Parking on Premises,Pets live on this property,Dog(s),Heating,Family/Kid Friendly,Washer,Dryer,...,Washer / Dryer,Smoking Allowed,Unnamed: 14,Suitable for Events,Wheelchair Accessible,Elevator in Building,Pool,Doorman,Paid Parking Off Premises,Free Parking on Street
0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,No,No,No,No,No,No,No,No,No,No
1,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,...,No,No,No,No,No,No,No,No,No,No
2,Yes,Yes,Yes,Yes,No,No,Yes,No,Yes,Yes,...,No,No,No,No,No,No,No,No,No,No
3,Yes,Yes,Yes,Yes,No,No,Yes,No,Yes,Yes,...,No,No,No,No,No,No,No,No,No,No
4,No,Yes,Yes,No,No,No,Yes,No,No,No,...,No,No,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3515,No,Yes,Yes,Yes,No,No,Yes,Yes,No,No,...,No,No,No,No,No,No,No,No,No,No
3516,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,...,No,Yes,No,No,No,Yes,No,No,No,No
3517,No,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
3518,No,No,Yes,No,No,No,No,Yes,Yes,Yes,...,No,No,No,No,No,No,No,No,No,No


In [12]:
#Again, we are dropping categorical features with levels > 10 and some amenities we can't deduce

df_listings_newest = pd.concat([df_no_outliers.drop(columns =['price', 'amenities', 'neighbourhood_cleansed', 'property_type'], axis=1), data_new.drop(columns = ['translation missing: en.hosting_amenity_49', 'translation missing: en.hosting_amenity_50', ''], axis = 1)], axis = 1)

In [13]:
# create dummy features

final_df = pd.get_dummies(df_listings_newest)

In [14]:
 #choose features
X = final_df.drop('price_new', axis=1)

#target
y = final_df['price_new']


#split into train and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 42) 

#Instantiate Model 
lr = LinearRegression()

#Fit
lr.fit(X_train, y_train)

#Predict and score the model
y_test_preds = lr.predict(X_test)
y_train_preds = lr.predict(X_train)

train_score = r2_score(y_train, y_train_preds)
test_score = r2_score(y_test, y_test_preds)

print("rsquared on the training data = {}".format(train_score))
print("rsquared on the test data = {}".format(test_score))

rsquared on the training data = 0.564640998932096
rsquared on the test data = 0.552074341754843


Much improved model after throwing in some additional features. I have made a few attempts using linear regression but our best r2 score on the test data is about 0.55.  Perhaps it is worth looking at another algorithm, Random Forest

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [16]:
#Check for and handle categorical variables, here we apply a slightly different approach to handling categorical variables

le = LabelEncoder()
x_cat = df_listings_newest.select_dtypes(include=['object']).apply(le.fit_transform)
x_num = df_listings_newest.select_dtypes(exclude=['object']).values
x = pd.concat([pd.DataFrame(x_num), x_cat], axis=1).values
 
# Instantiating the model
regressor = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)

#train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .30, random_state = 42) 
 
# Fit the regressor with x and y data
regressor.fit(x_train, y_train)

In [17]:
# Access the OOB Score
oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')
 
# Making predictions on the same data and new data
y_test_preds = regressor.predict(x_test)
y_train_preds = regressor.predict(x_train)

# Evaluating the model

mse_train = mean_squared_error(y_train, y_train_preds)
print(f'Mean Squared Error training data: {mse_train}') 

mse_test = mean_squared_error(y_test, y_test_preds)
print(f'Mean Squared Error test data: {mse_test}') 

r2_train = r2_score(y_train, y_train_preds)
print(f'R-squared for training data: {r2_train}')

r2 = r2_score(y_test, y_test_preds)
print(f'R-squared for test data: {r2}')



Out-of-Bag Score: 0.9570936311707934
Mean Squared Error training data: 0.17403814935064896
Mean Squared Error test data: 0.3802462121212097
R-squared for training data: 0.9999838009943174
R-squared for test data: 0.9999612620631679


Random Forest Algorithm gives us a much better result than Linear Regression