In [59]:
import pandas as pd
import numpy as np

In [60]:
df = pd.read_csv('AB_NYC_2019.csv')
df.head(3)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365


In [61]:
base = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]

df = df[base]

In [62]:
# fill in the missing values with 0.
df = df.fillna(0)

In [63]:
### Question 1
"""
What is the most frequent observation(mode) for the column 'neighbourhood_group'?
"""
df['neighbourhood_group'].mode()
# ANSWER to Question 1
# Manhattan

0    Manhattan
dtype: object

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [66]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [67]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values


In [68]:
del df_train['price']
del df_val['price']
del df_test['price']

In [69]:
df_train.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [70]:
numerical = list(set(list(df_train.columns)) - set(['neighbourhood_group', 'room_type']))
numerical


['availability_365',
 'latitude',
 'number_of_reviews',
 'calculated_host_listings_count',
 'longitude',
 'reviews_per_month',
 'minimum_nights']

In [71]:
### Question 2
"""
What are the two features that have the biggest correlation in this dataset?
"""
train_corr_mtx = df_train.corr()

In [72]:
train_corr_mtx.unstack().abs().sort_values(ascending=False)
# ANSWER to Question 2
# (number_of_reviews, reviews_per_month) AND (availability_365, calculated_host_listings_count)

latitude                        latitude                          1.000000
longitude                       longitude                         1.000000
calculated_host_listings_count  calculated_host_listings_count    1.000000
reviews_per_month               reviews_per_month                 1.000000
minimum_nights                  minimum_nights                    1.000000
number_of_reviews               number_of_reviews                 1.000000
availability_365                availability_365                  1.000000
number_of_reviews               reviews_per_month                 0.590374
reviews_per_month               number_of_reviews                 0.590374
availability_365                calculated_host_listings_count    0.225913
calculated_host_listings_count  availability_365                  0.225913
availability_365                number_of_reviews                 0.174477
number_of_reviews               availability_365                  0.174477
availability_365         

In [73]:
above_average = (y_train >= 152)

In [74]:
from sklearn.metrics import mutual_info_score

In [75]:
def mutual_info_price_score(series):
    return mutual_info_score(series, above_average)

In [76]:
categorical = list(df_train.dtypes[df_train.dtypes == 'object'].index)
categorical

['neighbourhood_group', 'room_type']

In [77]:
### Question 3

# Calculate the mutual information score with the (binarized) price
# for the two categorical variables that we have. Use the training set only.
mi = df_train[categorical].apply(mutual_info_price_score)
mi.sort_values(ascending=False).round(2)

# ANSWER to Question 3
# `room_type`

room_type              0.14
neighbourhood_group    0.05
dtype: float64

In [78]:
### Question 4
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [79]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
train_dicts[0]

{'neighbourhood_group': 'Brooklyn',
 'room_type': 'Entire home/apt',
 'availability_365': 50,
 'latitude': 40.7276,
 'number_of_reviews': 29,
 'calculated_host_listings_count': 13,
 'longitude': -73.94495,
 'reviews_per_month': 0.7,
 'minimum_nights': 3}

In [80]:
dv = DictVectorizer(sparse=False)

In [81]:
X_train = dv.fit_transform(train_dicts)
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

In [82]:
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [83]:
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42, max_iter=10000)

In [84]:
model.fit(X_train, above_average)

LogisticRegression(max_iter=10000, random_state=42, solver='liblinear')

In [85]:
model.intercept_[0]

-0.09177657605990823

In [86]:
model.coef_[0].round(3)

array([ 3.000e-03,  4.000e-03, -5.814e+00, -3.164e+00, -1.100e-02,
       -8.400e-02,  1.250e-01,  1.575e+00, -3.000e-02, -1.677e+00,
       -3.000e-03, -4.200e-02,  1.960e+00, -8.170e-01, -1.235e+00])

In [89]:
y_pred = model.predict_proba(X_val)[:,1]

In [90]:
(y_pred >= 152).mean()

0.0