In [526]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns

from pprint import pprint
%matplotlib inline

# df = pd.read_csv('./survey_results_public.csv')
# df.head()

In [527]:
import statsmodels.api as sm

## Step 1
Only keep variables of interest that can be modeled with logistic regression (i.e. exclude ID numbers, long text fields, and other fields which have little to no variation (such as country)

In [528]:
df = pd.read_csv('C:/Users/Helm/Desktop/UdacityDesktop/listings.csv')
df2 = df[[
'host_response_rate',
'host_acceptance_rate',
'host_is_superhost',
'host_total_listings_count',
'host_has_profile_pic',
'host_identity_verified',
'neighbourhood_group_cleansed',
'property_type',
'room_type',
'accommodates',
'bathrooms',
'bedrooms',
'beds',
'bed_type',
'amenities',
'square_feet',
'price',
'weekly_price',
'monthly_price',
'security_deposit',
'cleaning_fee',
'guests_included',
'extra_people',
'minimum_nights',
'maximum_nights',
'availability_30',
'number_of_reviews',
'review_scores_rating',
'requires_license',
'instant_bookable',
'cancellation_policy',
'require_guest_profile_picture',
'require_guest_phone_verification',
'calculated_host_listings_count',
'reviews_per_month']].copy()
df2.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,property_type,room_type,accommodates,...,availability_30,number_of_reviews,review_scores_rating,requires_license,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,96%,100%,f,3.0,t,t,Queen Anne,Apartment,Entire home/apt,4,...,14,207,95.0,f,f,moderate,f,f,2,4.07
1,98%,100%,t,6.0,t,t,Queen Anne,Apartment,Entire home/apt,4,...,13,43,96.0,f,f,strict,t,t,6,1.48
2,67%,100%,f,2.0,t,t,Queen Anne,House,Entire home/apt,11,...,1,20,97.0,f,f,strict,f,f,2,1.15
3,,,f,1.0,t,t,Queen Anne,Apartment,Entire home/apt,3,...,0,0,,f,f,flexible,f,f,1,
4,100%,,f,2.0,t,t,Queen Anne,House,Entire home/apt,6,...,30,38,92.0,f,f,strict,f,f,1,0.89


## Step 2 Data Clean Up

First I Determine whether any variable has too many missing values to be relevant to modeling.  After checking for null values below, I determine that only square feet should be removed as over 97% of the values are missing.  This is unfortunate, in my opinion, as square footage would likely be very relevant for predicting things such as nightly price.

Weekly_price, monthly_price, security_deposit, and cleaning_fee also have high percentages of missing values.  However, in those cases, the missing values can be set to 0 because a missing likely just means that there is no extra fee for those items.  

In the case of host_acceptance_rate, review_per_month, review_scores_rating, and host_response_rate, an average of the column will be used in their place instead of a null value.

In [529]:
percent_missing = (df2.isnull()/df2.shape[0]).sum()
percent_missing[percent_missing > 0.01].sort_values(ascending=False)


square_feet             0.974594
monthly_price           0.602672
security_deposit        0.511262
weekly_price            0.473808
cleaning_fee            0.269775
host_acceptance_rate    0.202462
review_scores_rating    0.169460
reviews_per_month       0.164222
host_response_rate      0.136983
dtype: float64

I convert host_response_rate and host_acceptance_rate to float rather than objects.

In [530]:
df2.dtypes

host_response_rate                   object
host_acceptance_rate                 object
host_is_superhost                    object
host_total_listings_count           float64
host_has_profile_pic                 object
host_identity_verified               object
neighbourhood_group_cleansed         object
property_type                        object
room_type                            object
accommodates                          int64
bathrooms                           float64
bedrooms                            float64
beds                                float64
bed_type                             object
amenities                            object
square_feet                         float64
price                                object
weekly_price                         object
monthly_price                        object
security_deposit                     object
cleaning_fee                         object
guests_included                       int64
extra_people                    

In [531]:
df2['host_response_rate'] = df2['host_response_rate'].str.strip('%').astype(float)


In [532]:
df2['host_acceptance_rate'] = df2['host_acceptance_rate'].str.strip('%').astype(float)

In [533]:
df2.loc[(df2['price']=='$1,000.00'),'price'] = '$1000.00'
# df2['price'] = df2['price'].replace(',', '')
df2['price'] = df2['price'].str.strip('$')
df2['price'].head(1000)
df2['price'] = df2['price'].astype(float)

In [534]:
df2.dtypes

host_response_rate                  float64
host_acceptance_rate                float64
host_is_superhost                    object
host_total_listings_count           float64
host_has_profile_pic                 object
host_identity_verified               object
neighbourhood_group_cleansed         object
property_type                        object
room_type                            object
accommodates                          int64
bathrooms                           float64
bedrooms                            float64
beds                                float64
bed_type                             object
amenities                            object
square_feet                         float64
price                               float64
weekly_price                         object
monthly_price                        object
security_deposit                     object
cleaning_fee                         object
guests_included                       int64
extra_people                    

In [535]:
df2 = df2.drop(['square_feet'], axis=1)


In [536]:
df2.dtypes

host_response_rate                  float64
host_acceptance_rate                float64
host_is_superhost                    object
host_total_listings_count           float64
host_has_profile_pic                 object
host_identity_verified               object
neighbourhood_group_cleansed         object
property_type                        object
room_type                            object
accommodates                          int64
bathrooms                           float64
bedrooms                            float64
beds                                float64
bed_type                             object
amenities                            object
price                               float64
weekly_price                         object
monthly_price                        object
security_deposit                     object
cleaning_fee                         object
guests_included                       int64
extra_people                         object
minimum_nights                  

I drop any missing minimum night rows and then convert it to a binary variable for modeling

In [537]:
df2['one_night_option'] = np.where(df2['minimum_nights']==1,1,0)



In [538]:
df2['house_or_other'] = np.where(df2['property_type'] == 'House','House','Non-House')

In [539]:
df2['shared_room'] = np.where(df2['room_type']== 'Entire home/apt ', 'No ','Yes')

In [540]:
df2 = df2.drop(['minimum_nights', 'property_type', 'room_type'], axis=1)
df2.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,accommodates,bathrooms,bedrooms,...,requires_license,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,one_night_option,house_or_other,shared_room
0,96.0,100.0,f,3.0,t,t,Queen Anne,4,1.0,1.0,...,f,f,moderate,f,f,2,4.07,1,Non-House,Yes
1,98.0,100.0,t,6.0,t,t,Queen Anne,4,1.0,1.0,...,f,f,strict,t,t,6,1.48,0,Non-House,Yes
2,67.0,100.0,f,2.0,t,t,Queen Anne,11,4.5,5.0,...,f,f,strict,f,f,2,1.15,0,House,Yes
3,,,f,1.0,t,t,Queen Anne,3,1.0,0.0,...,f,f,flexible,f,f,1,,1,Non-House,Yes
4,100.0,,f,2.0,t,t,Queen Anne,6,2.0,3.0,...,f,f,strict,f,f,1,0.89,1,House,Yes


In [541]:
df2['has_weekly_price'] = np.where(df2['weekly_price'].isnull(),0,1)
df2['has_monthly_price'] = np.where(df2['monthly_price'].isnull(),0,1)
df2['has_security_deposit'] = np.where(df2['security_deposit'].isnull(),0,1)
df2['has_extra_people_fee'] = np.where(df2['extra_people']=='$0.00',0,1)
df2['has_cleaning_fee'] = np.where(df2['cleaning_fee'].isnull(),0,1)
df2 = df2.drop(['weekly_price','monthly_price','security_deposit','extra_people','cleaning_fee'], axis=1)

In [542]:
float_int_df = df2.select_dtypes(include=['float64', 'int64'])
object_df = df2.select_dtypes(include=['object'])
object_df = object_df.drop(['amenities'], axis = 1)
object_df2 = object_df.drop(['host_has_profile_pic','bed_type','host_identity_verified','neighbourhood_group_cleansed'], axis=1)
for_mode_df = float_int_df.drop(['reviews_per_month','review_scores_rating'], axis=1)
for_mean_df = float_int_df[['reviews_per_month','review_scores_rating']]


for_mean_df.head()


Unnamed: 0,reviews_per_month,review_scores_rating
0,4.07,95.0
1,1.48,96.0
2,1.15,97.0
3,,
4,0.89,92.0


In [543]:
object_df.head()

Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,bed_type,requires_license,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,house_or_other,shared_room
0,f,t,t,Queen Anne,Real Bed,f,f,moderate,f,f,Non-House,Yes
1,t,t,t,Queen Anne,Real Bed,f,f,strict,t,t,Non-House,Yes
2,f,t,t,Queen Anne,Real Bed,f,f,strict,f,f,House,Yes
3,f,t,t,Queen Anne,Real Bed,f,f,flexible,f,f,Non-House,Yes
4,f,t,t,Queen Anne,Real Bed,f,f,strict,f,f,House,Yes


In [544]:
object_df.head(20)

Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,bed_type,requires_license,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,house_or_other,shared_room
0,f,t,t,Queen Anne,Real Bed,f,f,moderate,f,f,Non-House,Yes
1,t,t,t,Queen Anne,Real Bed,f,f,strict,t,t,Non-House,Yes
2,f,t,t,Queen Anne,Real Bed,f,f,strict,f,f,House,Yes
3,f,t,t,Queen Anne,Real Bed,f,f,flexible,f,f,Non-House,Yes
4,f,t,t,Queen Anne,Real Bed,f,f,strict,f,f,House,Yes
5,f,t,t,Queen Anne,Real Bed,f,f,strict,f,f,House,Yes
6,t,t,t,Queen Anne,Real Bed,f,f,moderate,f,f,House,Yes
7,t,t,t,Queen Anne,Real Bed,f,f,strict,t,t,Non-House,Yes
8,f,t,t,Queen Anne,Real Bed,f,f,strict,f,f,Non-House,Yes
9,t,t,t,Queen Anne,Real Bed,f,f,strict,t,t,Non-House,Yes


In [545]:


fill_mode = lambda col: col.fillna(col.mode()[0])
fill_mean = lambda col: col.fillna(col.mean())

for_mode_df_filled = for_mode_df.apply(fill_mode, axis = 1)
for_mode_df_filled.head()

for_mean_df_filled = for_mean_df.apply(fill_mean, axis = 1)


    




In [546]:
for_mean_df_filled.shape

(3818, 2)

I use the create_dummy_df that was created in the Udacity project

In [547]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating 
    '''
    for col in  cat_cols:
        try:
            # for each cat add dummy var, drop original column
            df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)], axis=1)
        except:
            continue
    return df


obj_dummies = create_dummy_df(object_df2, object_df.columns, dummy_na = False)

In [548]:
obj_dummies.mean().sort_values(ascending=False)

house_or_other_Non-House              0.546097
cancellation_policy_strict            0.371137
cancellation_policy_moderate          0.327658
host_is_superhost_t                   0.203772
instant_bookable_t                    0.154793
require_guest_phone_verification_t    0.098219
require_guest_profile_picture_t       0.084075
dtype: float64

In [549]:
X = pd.concat([obj_dummies, for_mean_df_filled, for_mode_df_filled], axis=1)
y = df2['one_night_option']

In [550]:
Model_Dataset = pd.concat([obj_dummies, for_mean_df_filled, for_mode_df_filled, df2['one_night_option']], axis=1)
Model_Dataset2 = Model_Dataset.dropna()
Model_Dataset2.shape

(3191, 23)

In [551]:


#Split into explanatory and response variables
X = Model_Dataset2.drop(['one_night_option'], axis=1)
X = X.drop(['require_guest_phone_verification_t','bathrooms','bedrooms','beds','guests_included'], axis=1)
y = Model_Dataset2['one_night_option']

#Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 


lm_model = LogisticRegression() # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
#Predict and score the model
y_test_preds = lm_model.predict(X_test) 


# "The r-squared score for your model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))



In [552]:
X_Corr = X.corr()

X_Corr.to_csv('C:/Users/Helm/Desktop/UdacityDesktop/Xcorr.csv')

In [553]:
# lm_model.get_params(deep=True)

# pd.concat([pd.Series(y_test_preds),y_test], axis=1)


test2 = pd.concat([pd.Series(y_test_preds),y_test.reset_index(drop=True)], axis=1, ignore_index=True)
pd.crosstab(index=test2[0], columns=test2[1])




1,0,1
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,468,221
1,92,177


https://sweetcode.io/easy-scikit-logistic-regression/

In [554]:
# logistic_regression = sm.Logit(X_train,sm.add_constant(train_data.age))
# result = logistic_regression.fit()
# print(result.summary())

log_reg = sm.Logit(y_train, X_train).fit()
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.589947
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:       one_night_option   No. Observations:                 2233
Model:                          Logit   Df Residuals:                     2216
Method:                           MLE   Df Model:                           16
Date:                Mon, 09 May 2022   Pseudo R-squ.:                  0.1187
Time:                        07:50:50   Log-Likelihood:                -1317.4
converged:                       True   LL-Null:                       -1494.7
                                        LLR p-value:                 1.079e-65
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
host_is_superhost_t                -0.5832      0.123     -4.733  