In [1]:
# Third-party libraries: numpy/pandas for data handling, scikit-learn for the model
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Let wide frames display up to 50 columns before pandas truncates them
pd.options.display.max_columns = 50

# Linear regression estimator with scikit-learn's default settings
model = LinearRegression()

In [3]:
# Load the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='data_predict.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,id,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,6627449,100,False,1,True,True,10464,Apartment,Entire home/apt,3,1.0,1,1,125,3,21,21,95,10,10,10,10,10,10
1,5557381,100,True,1,True,True,10464,House,Private room,4,1.0,1,1,69,3,1125,94,97,10,10,10,10,10,10
2,19609887,100,False,1,True,False,10464,Apartment,Entire home/apt,7,1.0,2,5,150,2,1125,3,100,10,10,10,10,10,10
3,7949480,100,True,1,True,True,10464,House,Private room,2,1.0,1,1,101,1,7,31,100,10,10,10,10,10,10
4,9147025,100,True,1,True,False,10464,House,Entire home/apt,2,1.0,0,1,125,2,28,60,97,10,10,10,10,10,10


In [5]:
# Cleaning Data
# Parse `price` into float32. The `.str` accessor only exists on string
# columns, so the currency symbol and thousands separators are stripped only
# when pandas did not already read the column as numeric.
price_raw = df['price']
if not pd.api.types.is_numeric_dtype(price_raw):
    price_raw = price_raw.str.replace(r'[$,]', '', regex=True)
df['price'] = price_raw.astype(np.float32)

# Convert the boolean flags to int (True -> 1, False -> 0)
arr_mapping = {True: 1, False: 0}
for flag_col in ('host_is_superhost', 'host_has_profile_pic', 'host_identity_verified'):
    df[flag_col] = df[flag_col].map(arr_mapping)

# Create New Column
# A zero or missing minimum stay would put inf/NaN into the regression target,
# so fail here with a clear message instead of later inside the model fit.
assert (df['minimum_nights'] > 0).all(), 'minimum_nights must be a positive number for every row'
# NOTE(review): dividing by minimum_nights assumes `price` is the total for the
# minimum stay rather than a nightly rate -- confirm against the data source.
df['Price_daily'] = df['price'] / df['minimum_nights']
# The raw price is no longer needed once the per-night value exists
df = df.drop('price', axis=1)

In [8]:
# Create X and Y to predict Listing Price
# Features: every column except the target and the id / zipcode columns, with
# object/category-typed columns expanded into dummy (indicator) variables.
df_x = df.drop(columns=['Price_daily', 'id', 'zipcode'])
df_x_dummy = pd.get_dummies(data=df_x)
x = np.array(df_x_dummy)

# Target: the Price_daily column as a plain array
y = np.array(df['Price_daily'])

In [9]:
# Fit the model
# Regress the target y on the dummy-encoded feature matrix x
model.fit(X=x, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Predict the price
# In-sample predictions: x is the same matrix the model is fitted on
Y_pred = model.predict(X=x)

In [11]:
# Add the predicted price as a new column, one value per row of df
df['predicted_price'] = Y_pred

In [12]:
# Calculate the diff
# Residual = actual - predicted: negative when the actual daily price is below
# the model's prediction. The column name's spelling ('diif_price') is kept
# as-is because later cells look it up by that exact name.
df['diif_price'] = df['Price_daily'].sub(df['predicted_price'])

In [13]:
# Preview the first five rows, now including the prediction and residual columns
df.head(n=5)

Unnamed: 0,id,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,Price_daily,predicted_price,diif_price
0,6627449,100,0,1,1,1,10464,Apartment,Entire home/apt,3,1.0,1,1,3,21,21,95,10,10,10,10,10,10,41.666667,80.203275,-38.536608
1,5557381,100,1,1,1,1,10464,House,Private room,4,1.0,1,1,3,1125,94,97,10,10,10,10,10,10,23.0,103.9207,-80.9207
2,19609887,100,0,1,1,0,10464,Apartment,Entire home/apt,7,1.0,2,5,2,1125,3,100,10,10,10,10,10,10,75.0,115.790172,-40.790172
3,7949480,100,1,1,1,1,10464,House,Private room,2,1.0,1,1,1,7,31,100,10,10,10,10,10,10,101.0,60.785434,40.214566
4,9147025,100,1,1,1,0,10464,House,Entire home/apt,2,1.0,0,1,2,28,60,97,10,10,10,10,10,10,62.5,53.505134,8.994866


In [15]:
# Show the five largest negative differences between actual price and predicted price
# These are the listings for which we can propose a price increase
df.sort_values(by='diif_price', ascending=True).iloc[:5]

Unnamed: 0,id,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,Price_daily,predicted_price,diif_price
6712,7089310,100,0,1,1,1,11432,House,Entire home/apt,16,5.5,5,5,3,28,15,80,9,8,9,9,9,9,132.666667,591.340913,-458.674247
4948,11708688,100,0,1,1,0,10456,Apartment,Entire home/apt,15,1.0,2,3,1,1125,6,95,10,10,8,10,10,10,75.0,394.741657,-319.741657
2455,512209,100,1,1,1,1,11201,House,Entire home/apt,16,5.5,5,12,5,99,47,99,10,10,10,10,10,10,160.0,455.508014,-295.508014
6917,867220,100,0,1,1,1,11226,House,Entire home/apt,16,4.0,5,5,3,1125,13,88,9,10,9,9,9,9,246.666667,530.750826,-284.084159
7378,16923912,100,0,2,1,0,11375,House,Entire home/apt,10,4.5,6,8,5,57,2,100,10,10,10,10,10,9,80.0,354.688546,-274.688546


In [16]:
# Show the ten largest positive differences between actual price and predicted price
# (ascending sort, so the biggest residual is the last row)
df.sort_values(by='diif_price', ascending=True).iloc[-10:]

Unnamed: 0,id,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,Price_daily,predicted_price,diif_price
7123,3585309,100,0,2,1,0,11358,House,Entire home/apt,9,3.0,4,5,1,1125,9,100,10,10,10,10,10,10,950.0,292.877986,657.122014
7221,18854380,100,0,1,1,0,11354,Condominium,Entire home/apt,2,1.0,0,1,365,365,6,88,8,9,9,8,9,8,0.452055,-672.09941,672.551464
2378,12591059,50,0,1,1,0,11225,Apartment,Entire home/apt,4,1.0,2,2,1,1125,4,80,9,7,9,10,10,8,800.0,114.606462,685.393538
7054,3181834,100,0,1,1,1,10011,Apartment,Entire home/apt,10,2.0,4,5,1,1125,15,92,9,10,9,9,10,9,990.0,295.041393,694.958607
3857,19178296,100,0,1,1,0,10001,Apartment,Entire home/apt,10,1.5,4,5,1,1125,3,93,9,9,9,9,9,9,990.0,248.739828,741.260172
3586,1056256,100,0,17,1,1,11231,House,Entire home/apt,12,2.5,5,5,1,365,46,94,9,9,10,10,10,9,1395.0,352.723361,1042.276639
3814,12469563,100,0,2,1,1,10001,Apartment,Entire home/apt,10,2.0,4,4,1,1125,11,94,9,9,9,9,9,9,1500.0,297.732107,1202.267893
5638,2939700,0,0,1,1,0,10128,Apartment,Entire home/apt,8,3.5,5,5,1,1125,1,100,10,10,10,10,10,10,2500.0,322.629584,2177.370416
4141,2952861,100,1,11,1,1,11205,House,Entire home/apt,16,6.0,5,5,1,1125,3,100,10,9,9,10,10,10,4500.0,659.818948,3840.181052
4143,2953058,100,1,11,1,1,11205,House,Entire home/apt,16,6.0,5,5,1,1125,1,100,10,10,6,6,10,10,8000.0,764.232931,7235.767069


In [20]:
# Confirm the correlation
# Correlate only numeric/boolean columns: the frame still holds text columns
# (property_type, room_type), which DataFrame.corr() rejects in pandas >= 2.0.
numeric_cols = df.select_dtypes(include=['number', 'bool'])
# Keep the Price_daily column as a one-column frame, strongest positive correlation first
numeric_cols.corr()[['Price_daily']].sort_values('Price_daily', ascending=False)

Unnamed: 0,Price_daily
Price_daily,1.0
diif_price,0.891564
predicted_price,0.452894
accommodates,0.367932
bedrooms,0.288813
beds,0.267686
bathrooms,0.25997
review_scores_location,0.083667
host_is_superhost,0.023434
review_scores_cleanliness,0.018885
