In [None]:
# This notebook answers Question #3:
# Which features influence the price? Predict the price using an ML model.

# In this notebook, we will build a Linear Regression model that predicts the price and then inspect its coefficients
# to see which features are most strongly related to the price.
# We will also perform data cleaning and data wrangling operations before the data is fed into the model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Read the listings dataset from 'listings.csv' into a dataframe called 'listing'.
listing = pd.read_csv('listings.csv')
# Display the first entries of the dataframe 'listing' as a quick sanity check.
listing.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,1944,cafeheaven Pberg/Mitte,2164,Lulah,Mitte,Brunnenstr. Nord,52.54425,13.39749,Private room,21,60,18,2018-11-11,0.24,1,251
1,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.53500,13.41758,Entire home/apt,90,62,145,2019-06-27,1.14,1,344
2,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.49885,13.34906,Private room,28,7,27,2019-05-31,0.35,1,317
3,6883,Stylish East Side Loft in Center with AC & 2 b...,16149,Steffen,Friedrichshain-Kreuzberg,Frankfurter Allee Süd FK,52.51171,13.45477,Entire home/apt,125,3,128,2019-10-21,1.08,1,20
4,7071,BrightRoom with sunny greenview!,17391,BrightRoom,Pankow,Helmholtzplatz,52.54316,13.41509,Private room,33,3,266,2019-11-09,2.13,2,30
5,9991,Geourgeous flat - outstanding views,33852,Philipp,Pankow,Prenzlauer Berg Südwest,52.53303,13.41605,Entire home/apt,180,6,7,2019-07-15,0.13,1,45
6,14325,Apartment in Prenzlauer Berg,55531,Chris + Oliver,Pankow,Prenzlauer Berg Nordwest,52.54785,13.40556,Entire home/apt,70,90,24,2019-07-01,0.21,4,215
7,16644,In the Heart of Berlin - Kreuzberg,64696,Rene,Friedrichshain-Kreuzberg,nördliche Luisenstadt,52.50479,13.43510,Entire home/apt,90,60,48,2017-12-14,0.42,2,87
8,17904,Beautiful Kreuzberg studio/WiFi (reg. pend.),68997,Matthias,Neukölln,Reuterstraße,52.49548,13.42182,Entire home/apt,49,5,252,2019-11-11,2.13,1,286
9,20858,Designer Loft in Berlin Mitte,71331,Marc,Pankow,Prenzlauer Berg Südwest,52.53695,13.40762,Entire home/apt,129,3,81,2019-11-04,0.93,1,167


In [32]:
def clean_data(df):
    '''
    Build the feature matrix X and the response vector y for the price model.

    INPUT
    df - pandas dataframe of listings. It must contain a 'price' column and
         every column listed in step 3 below. The caller's dataframe is not
         modified.

    OUTPUT
    X - A dataframe holding all of the features considered when predicting
        the response (numeric columns plus dummy columns)
    y - the corresponding response vector (the 'price' column)

    This function cleans df using the following steps to produce X and y:
    1. Drop all the rows with no price
    2. Create y as the price column
    3. Drop the 'id', 'name', 'host_id', 'host_name', 'neighbourhood',
       'latitude', 'longitude', 'price' and 'last_review' columns from X
       (every other column, e.g. 'calculated_host_listings_count' and
       'availability_365', is kept as a feature)
    4. For each numeric variable in X, fill missing values with the mean
       value of the column
    5. Create dummy columns for all the categorical variables in X and drop
       the original columns
    6. Return X and y
    '''

    #Drop all the rows with no price
    df = df.dropna(subset=['price'], axis=0)
    y = df['price']

    #Drop columns that we do not want to consider
    df = df.drop(['id', 'name', 'host_id', 'host_name',
                  'neighbourhood', 'latitude', 'longitude', 'price',
                  'last_review'], axis=1)

    #Fill the mean in the numeric columns where we have some NaN values.
    #fillna with a Series fills each column named in the Series index and
    #returns a new dataframe. This replaces the chained
    #`df[col].fillna(..., inplace=True)`, which does not update df when pandas
    #Copy-on-Write is active and would silently leave the NaNs in place.
    #NOTE: the means are computed on all rows, i.e. before any train/test split.
    num_vars = df.select_dtypes(include=['number']).columns
    df = df.fillna(df[num_vars].mean())

    #Create dummy variables for the categorical columns.
    #get_dummies on the whole dataframe encodes every categorical (string-like)
    #column, removes the original column and appends '<column>_<value>'
    #indicator columns after the untouched numeric columns.
    #All levels are kept (drop_first=False), matching the original behaviour.
    X = pd.get_dummies(df, prefix_sep='_', drop_first=False)
    return X, y
    
#Use the function to create X (feature matrix) and y (price) from the raw 'listing' dataframe
X, y = clean_data(listing) 

In [33]:
#Create Training and Testing data
# 30% of the rows are held out for testing; random_state=42 fixes the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [34]:
#Instantiate the Model
# The 'normalize' argument is intentionally omitted: it was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises a TypeError on current versions. Its old default was False, so the model is unchanged.
lm_model = LinearRegression()

In [35]:
#Fit the Model
# The model is fitted on the training split only (X_train, y_train); the test split is kept for scoring.

lm_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [36]:
# Generate price predictions for both splits so that each one can be scored
y_train_preds = lm_model.predict(X_train)
y_test_preds = lm_model.predict(X_test)

In [37]:
# Compute the R^2 score of the predictions on the training and on the test split

train_score = r2_score(y_train, y_train_preds)
test_score = r2_score(y_test, y_test_preds)

In [38]:
# Report the R^2 obtained on the training data and on the test data
print(
    "The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(
        train_score, test_score
    )
)

The rsquared on the training data was 0.13998550844860203.  The rsquared on the test data was 0.12438280041842009.


In [40]:
# From the 'rsquared' values above, we can see that our model does not predict the price well.
# An rsquared of roughly 0.12-0.14 (about 12-14%) on both the training and test datasets shows the limitations of our model. One of the main reasons for this
# is that I have used very few features to predict the price. If I had used more features, I could probably have achieved better results.
# The dataset I used for my project is the summarized version and contains only a few columns. During data cleaning, I also dropped
# certain columns that were not useful.

In [39]:
def coef_weights(coefficients, X_train):
    '''
    INPUT:
    coefficients - the coefficients of the linear model, one per column of
                   X_train and in the same order (e.g. lm_model.coef_)
    X_train - the training data, so the column names can be used
    OUTPUT:
    coefs_df - a dataframe with the columns 'est_int' (feature name),
               'coefs' (coefficient) and 'abs_coefs' (absolute value of the
               coefficient), sorted by 'abs_coefs' in descending order

    Provides a dataframe that can be used to understand the most influential coefficients
    in a linear model by providing the coefficient estimates along with the name of the
    variable attached to the coefficient.

    A ValueError is raised by pandas if the number of coefficients does not
    match the number of columns in X_train.
    '''
    # Use the coefficients that were passed in rather than the global lm_model,
    # so the function works for any fitted model. np.ravel also accepts the
    # (1, n_features) shape that a single-target model fitted on a 2-D y produces.
    coefs = np.ravel(coefficients)

    coefs_df = pd.DataFrame({
        'est_int': X_train.columns.tolist(),
        'coefs': coefs,
        'abs_coefs': np.abs(coefs),
    })
    # Largest absolute effect first
    return coefs_df.sort_values('abs_coefs', ascending=False)

#Use the function
# Build the coefficient table from the fitted model's coefficients and the training column names.
coef_df = coef_weights(lm_model.coef_, X_train)

#A quick look at the results
# The rows are sorted by the absolute value of the coefficient, largest first.
coef_df

Unnamed: 0,est_int,coefs,abs_coefs
18,room_type_Hotel room,542.686719,542.686719
20,room_type_Shared room,-204.264646,204.264646
19,room_type_Private room,-189.50201,189.50201
17,room_type_Entire home/apt,-148.920062,148.920062
13,neighbourhood_group_Spandau,-43.479768,43.479768
15,neighbourhood_group_Tempelhof - Schöneberg,32.421358,32.421358
5,neighbourhood_group_Charlottenburg-Wilm.,30.50882,30.50882
8,neighbourhood_group_Marzahn - Hellersdorf,-16.1391,16.1391
7,neighbourhood_group_Lichtenberg,-12.915212,12.915212
9,neighbourhood_group_Mitte,12.097601,12.097601


In [41]:
# From the above results, we can clearly see that the variable/feature 'room_type_Hotel room' has the highest influence on the price.
# This means that 'room_type' affects the price heavily, either in a positive or a negative manner; that is, the model will predict a higher or lower price
# depending on the type of the room. If the 'room_type' is Hotel room, our model will predict a higher price, whereas if the 'room_type' is Shared room, it will predict
# a lower price.
# Note that all dummy levels were kept together with the intercept, so the coefficients within one categorical variable are best read relative to each other rather than as absolute effects.
# Another factor that affects the price is the 'neighbourhood_group'. Depending on the 'neighbourhood_group', the price will be predicted either higher or lower.
# Neighbourhoods like 'neighbourhood_group_Tempelhof - Schöneberg' will have a positive effect on the prediction of the price and
# neighbourhoods like 'neighbourhood_group_Neukölln' will have a negative effect, and our model will predict a lower price.
# It can also be seen that other features like 'minimum_nights', 'availability_365' and 'number_of_reviews' have very little effect on the prediction.

# This is the benefit of looking into the coefficients of the model: it tells us which variables/features matter the most in predicting the outcome.
# Obviously, if our model had better rsquared results, the distribution of the coefficients would have been different, but the most important thing
# is to actually understand why the model behaves in a certain way and what factors can be modified to improve the performance of our model.