In [1]:
# !wget 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the listings CSV from the working directory (the commented-out wget in
# the first cell shows where the file can be downloaded from).
df = pd.read_csv('AB_NYC_2019.csv')

In [4]:
# Columns used throughout the notebook. `price` is the prediction target;
# the remaining columns are model inputs.
features = [
    # categorical
    'neighbourhood_group',
    'room_type',
    # location
    'latitude',
    'longitude',
    # target
    'price',
    # listing / host statistics
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

In [5]:
# Keep only the columns listed in `features`, in that order.
df = df[features]

In [6]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [7]:
# Replace every missing value with 0 (the preview above shows a missing
# `reviews_per_month` for a listing with 0 reviews).
df = df.fillna(0)

In [8]:
# Sanity check: number of missing values left in each column.
df.isna().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [9]:
# Most frequent value of `neighbourhood_group`.
df['neighbourhood_group'].mode()

0    Manhattan
dtype: object

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
# Split the remaining 80% into train and validation: 25% of 80% is 20% of the
# full data, giving a 60/20/20 train/validation/test split overall.
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [13]:
# Row counts of the train, validation and test partitions.
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [14]:
# Target arrays: the raw `price` column of the train and validation partitions.
y_train = df_train['price'].to_numpy()
y_val = df_val['price'].to_numpy()

In [15]:
# Remove the target column from the feature frames (in place) so that it
# cannot be used as a model input.
# NOTE: this cell is not idempotent - running it a second time raises KeyError.
del df_train['price']
del df_val['price']

### Question 2

In [16]:
# Pairwise correlation between the numeric features of the training set.
# The numeric columns are selected explicitly: `df_train` still contains the
# string columns `neighbourhood_group` and `room_type`, and since pandas 2.0
# `DataFrame.corr()` no longer drops non-numeric columns silently but raises.
df_train.select_dtypes(include='number').corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [17]:
# Absolute correlation between numeric features; the sign is irrelevant when
# looking for the most strongly related pair.
# Numeric columns are selected explicitly because `DataFrame.corr()` raises on
# the string columns under pandas >= 2.0.
correlations = df_train.select_dtypes(include='number').corr().abs()

In [261]:
# Feature pairs ranked by absolute correlation, strongest first.
# Self-pairs (the diagonal, always 1.0) are filtered out by comparing the two
# index levels rather than by skipping a hard-coded number of leading rows, so
# the cell stays correct if the number of numeric columns changes.
# Each pair appears twice, as (a, b) and (b, a), because the matrix is symmetric.
(
    correlations.unstack()
    .sort_values(ascending=False)
    .loc[lambda s: s.index.get_level_values(0) != s.index.get_level_values(1)]
    .head(11)
)

number_of_reviews               reviews_per_month                 0.590374
reviews_per_month               number_of_reviews                 0.590374
availability_365                calculated_host_listings_count    0.225913
calculated_host_listings_count  availability_365                  0.225913
availability_365                number_of_reviews                 0.174477
number_of_reviews               availability_365                  0.174477
availability_365                reviews_per_month                 0.165376
reviews_per_month               availability_365                  0.165376
availability_365                minimum_nights                    0.138901
minimum_nights                  availability_365                  0.138901
longitude                       reviews_per_month                 0.134642
dtype: float64

In [199]:
# Rebuild the dataset from scratch for the classification task.
df = pd.read_csv('AB_NYC_2019.csv')[features].fillna(0)

# Binary target: 1 when the price is at least 152, otherwise 0.
df['above_average'] = df['price'].ge(152).astype(int)

# Same 60/20/20 split (and seed) as used earlier in the notebook.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

y_train = df_train['above_average'].values
y_val = df_val['above_average'].values

# Drop the target and the column it was derived from so neither leaks into
# the model inputs.
df_train = df_train.drop(columns=['above_average', 'price'])
df_val = df_val.drop(columns=['above_average', 'price'])


### Question 3

In [109]:
# Column dtypes; the `object` columns are the categorical ones.
df.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
above_average                       int64
dtype: object

In [110]:
from sklearn.metrics import mutual_info_score

In [111]:
# The two string-valued (object dtype) columns of the dataset.
categorical = ['neighbourhood_group', 'room_type']

In [112]:
def calculate_mi(series):
    """Return the mutual information between `series` and the training target.

    Reads the notebook-level `y_train`, so `series` must be a column of the
    training frame (same length and row order as `y_train`).
    """
    score = mutual_info_score(labels_true=series, labels_pred=y_train)
    return score

# Mutual information of each categorical column with the target, highest
# first, rounded to two decimals.
df_mi = (
    df_train[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
    .round(2)
)

display(df_mi.head())

Unnamed: 0,MI
room_type,0.14
neighbourhood_group,0.05


### Question 4

In [172]:
from sklearn.feature_extraction import DictVectorizer

In [200]:
# Convert each row into a {column: value} dict, the input passed to the
# DictVectorizer below.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

In [201]:
# sparse=False makes the vectorizer return dense arrays.
# The vectorizer is fitted on the training rows only; string-valued fields
# become one `column=value` indicator feature per category.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [202]:
# Encode both partitions with the vectorizer fitted on the training data, so
# train and validation matrices share the same column layout.
X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

In [203]:
# Names of the columns of the encoded feature matrix, in column order.
# `DictVectorizer.get_feature_names()` was deprecated in scikit-learn 1.0 and
# removed in 1.2; the fitted `feature_names_` attribute holds the same list
# and is available in old and new versions alike.
dv.feature_names_

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

In [204]:
from sklearn.linear_model import LogisticRegression

In [205]:
# Logistic regression classifier for the binary `above_average` target; the
# solver, C and random_state are fixed so results are reproducible.
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)

In [206]:
# Train on the encoded training features and the binary target.
model.fit(X_train, y_train)

LogisticRegression(random_state=42, solver='liblinear')

In [207]:
# Hard class predictions (0/1) for the validation partition.
y_pred = model.predict(X_val)

In [208]:
# Fitted coefficients, one per column of the encoded feature matrix.
model.coef_

array([[ 3.03835166e-03,  3.58398154e-03, -5.81758480e+00,
        -3.16648720e+00, -1.13465576e-02, -8.24381729e-02,
         1.25268304e-01,  1.57569036e+00, -2.94348756e-02,
        -1.68067150e+00, -3.24680539e-03, -4.19338057e-02,
         1.95687790e+00, -8.20007372e-01, -1.22845641e+00]])

In [211]:
# Validation accuracy: share of predictions that match the true labels,
# rounded to two decimals.
round(np.mean(y_pred == y_val), 2)

0.79

### Question 5

In [223]:
import operator

features = ['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']

# Rebuild the classification dataset from scratch so this cell does not
# depend on state left behind by earlier cells.
df = pd.read_csv('AB_NYC_2019.csv')
df = df[features]
df = df.fillna(0)
df['above_average'] = (df['price'] >= 152).astype(int)
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)
y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
del df_train['above_average']
del df_val['above_average']
del df_train['price']
del df_val['price']
# `price` is the source of the target, not a model input.
features.remove('price')


def validation_accuracy(train_frame, val_frame, train_target, val_target):
    """Fit the logistic regression on `train_frame` and score it on `val_frame`.

    Both frames are one-hot encoded with a DictVectorizer fitted on the
    training frame only. Returns the fraction of validation rows whose
    predicted class equals `val_target`.

    Everything is kept local so that repeated calls do not overwrite the
    notebook-level `dv`, `X_train`, `X_val`, `model` or `y_pred`.
    """
    train_records = train_frame.to_dict(orient='records')
    val_records = val_frame.to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit(train_records)

    classifier = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    classifier.fit(vectorizer.transform(train_records), train_target)
    predictions = classifier.predict(vectorizer.transform(val_records))
    return (predictions == val_target).mean()


# Accuracy with every feature included, used as the reference point.
baseline = validation_accuracy(df_train, df_val, y_train, y_val)

# For each feature: absolute change in validation accuracy when the model is
# retrained without that feature.
accuracy = {}
for feature in features:
    reduced_accuracy = validation_accuracy(
        df_train.drop(columns=feature),
        df_val.drop(columns=feature),
        y_train,
        y_val,
    )
    accuracy[feature] = abs(reduced_accuracy - baseline)

# Smallest difference first: the least useful features come out on top.
print(sorted(accuracy.items(), key=operator.itemgetter(1)))

[('reviews_per_month', 0.0), ('number_of_reviews', 0.0006135596686777101), ('minimum_nights', 0.0008180795582369838), ('calculated_host_listings_count', 0.0011248593925760053), ('longitude', 0.0037836179568463413), ('latitude', 0.003988137846405615), ('availability_365', 0.009305654974946398), ('neighbourhood_group', 0.04090397791185196), ('room_type', 0.06207178648123535)]


In [217]:
# Contents of `features` after the Question 5 cell removed 'price' from it.
features

['neighbourhood_group',
 'room_type',
 'latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

### Question 6

In [260]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Question 6 setup: regression on log1p(price), rebuilt from the raw CSV.
features = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
df = pd.read_csv('AB_NYC_2019.csv')[features].fillna(0)

# Same 60/20/20 split and seed as in the earlier questions.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# The target is log1p(price); it is removed from the feature frames afterwards.
y_train = np.log1p(df_train['price'].to_numpy())
y_val = np.log1p(df_val['price'].to_numpy())
df_train = df_train.drop(columns='price')
df_val = df_val.drop(columns='price')

# One-hot encode using a vectorizer fitted on the training rows only.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Compare validation RMSE (on the log1p(price) scale) across regularisation
# strengths.
for alpha in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # RMSE is computed as sqrt(MSE) rather than with `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, while this
    # form works on every version. Arguments follow the documented
    # (y_true, y_pred) order.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'alpha = {alpha}: {round(rmse, 3)}')

alpha = 0: 0.497
alpha = 0.01: 0.497
alpha = 0.1: 0.497
alpha = 1: 0.497
alpha = 10: 0.498


In [245]:
# First 20 validation targets; these are log1p(price) values, not raw prices.
y_val[:20]

array([4.18965474, 4.49980967, 5.30330491, 4.79579055, 6.61873898,
       4.66343909, 5.3981627 , 5.01727984, 4.38202663, 5.25227343,
       5.22574667, 4.51085951, 4.83628191, 6.32972091, 4.65396035,
       5.420535  , 5.52545294, 4.26267988, 4.87519732, 5.24702407])

In [255]:
# Column names of the encoded regression feature matrix, in column order.
# `DictVectorizer.get_feature_names()` was removed in scikit-learn 1.2; the
# fitted `feature_names_` attribute returns the same list on all versions.
dv.feature_names_

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']