In [41]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [42]:
import plotly.io as pio

# Register a "custom" theme that starts from plotly_white, then tweak it through a
# single handle to the registered template instead of repeated registry lookups.
pio.templates["custom"] = pio.templates["plotly_white"]
custom = pio.templates["custom"]

# Fixed-size figures (no autosizing) with tight margins
custom.layout.autosize = False
custom.layout.width = 450
custom.layout.height = 300
custom.layout.margin = {'b': 25, 'l': 25, 'r': 25, 't': 50}

# Typography: Arial everywhere, a heavier face for the horizontally centred title
custom.layout.font.family = "Arial"
custom.layout.title.update({"x":0.5, "xref":"paper", "font_family":"Arial Black"})

# Draw a dark gray axis line on both axes
custom.layout.xaxis.update({"showline":True, "linecolor":"darkgray"})
custom.layout.yaxis.update({"showline":True, "linecolor":"darkgray"})

# Ten-colour cycle used for successive traces
custom.layout.colorway = [
    '#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD',
    '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF',
]

# Make the theme the default for every figure created from here on
pio.templates.default = "custom"

In [43]:
# URL of the raw AB_NYC_2019.csv dataset file hosted on GitHub
data = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv"

In [44]:
!curl -o AB_NYC_2019.csv $data 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 90 6912k   90 6240k    0     0  10.1M      0 --:--:-- --:--:-- --:--:-- 10.2M
100 6912k  100 6912k    0     0  10.9M      0 --:--:-- --:--:-- --:--:-- 11.0M


## Data preparation

In [45]:
# Only these columns are read from the CSV; everything else is skipped at parse time
usecols = [
    'room_type',
    'neighbourhood_group',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

df = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv', usecols=usecols)

In [46]:
# Replace missing reviews_per_month values with 0
# (presumably listings that have no reviews yet; verify against the data)
df['reviews_per_month'] = df['reviews_per_month'].fillna(value=0)

In [47]:
# Binarize the target: True when the price is at or above the threshold.
# NOTE(review): 152 is presumably close to the mean price of the dataset; confirm.
PRICE_THRESHOLD = 152

# Guard on dtype so re-running this cell is a no-op. Without it, a second run
# would compare the booleans with 152 and silently turn every label into False.
if not pd.api.types.is_bool_dtype(df['price']):
    df['price'] = df['price'] >= PRICE_THRESHOLD

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
# 60/20/20 split: hold out 20% for testing, then take 25% of the remaining 80%
# (i.e. 20% of the whole) for validation. Both splits use the same fixed seed.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Drop the shuffled original index so each part is numbered from 0
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the target out of each frame: pop() returns the column and removes it,
# so the feature frames no longer contain the label
y_train = df_train.pop('price').values
y_val = df_val.pop('price').values
y_test = df_test.pop('price').values

In [50]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [51]:
# Categorical feature columns
cat = [
    'neighbourhood_group',
    'room_type',
]

# Numerical feature columns
num = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

## Training the model

Fitting logistic regression on the unscaled features produces a convergence warning:

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Baseline preprocessing: one-hot encode the categorical columns (dense output)
# and forward the numerical columns unchanged, i.e. without any scaling
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(sparse_output=False), cat),
        ('num', 'passthrough', num),
    ]
)

# Learn the encoding on the training frame and build its feature matrix
X_train = preprocessor.fit_transform(df_train[[*cat, *num]])

# Apply the already-fitted preprocessing to the validation frame
X_val = preprocessor.transform(df_val[[*cat, *num]])

In [53]:
# Logistic regression fitted on the current (unscaled) training matrix
model = LogisticRegression(C=1.0, solver='lbfgs')
model.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
# Second column of predict_proba: probability of the positive (True) class
y_pred = model.predict_proba(X_val)[:, 1]

# Turn probabilities into hard labels at 0.5 and compare with the validation targets
accuracy_score(y_true=y_val, y_pred=(y_pred >= 0.5))

0.7862767154105736

We can fix this model by using a scaler. You can read more about scalers
[here](https://scikit-learn.org/stable/modules/preprocessing.html).

Also, we'll show you how to use `OneHotEncoder` instead of `DictVectorizer`.

## Feature scaling + OHE

In [55]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [56]:
# Scaled preprocessing: one-hot encode the categorical columns (dense output)
# and standardize the numerical columns with StandardScaler
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(sparse_output=False), cat),
        ('num', StandardScaler(), num),
    ]
)

# Learn encoding and scaling statistics on the training frame only
X_train = preprocessor.fit_transform(df_train[[*cat, *num]])

# Reuse the fitted statistics for the validation frame
X_val = preprocessor.transform(df_val[[*cat, *num]])

In [57]:
# List the output column names of the fitted transformer, prefixed by transformer name
preprocessor.get_feature_names_out()

array(['cat__neighbourhood_group_Bronx',
       'cat__neighbourhood_group_Brooklyn',
       'cat__neighbourhood_group_Manhattan',
       'cat__neighbourhood_group_Queens',
       'cat__neighbourhood_group_Staten Island',
       'cat__room_type_Entire home/apt', 'cat__room_type_Private room',
       'cat__room_type_Shared room', 'num__latitude', 'num__longitude',
       'num__minimum_nights', 'num__number_of_reviews',
       'num__reviews_per_month', 'num__calculated_host_listings_count',
       'num__availability_365'], dtype=object)

And now let's train the model:

In [60]:
# Logistic regression fitted on the current (scaled) training matrix, with a fixed seed
model = LogisticRegression(C=1.0, solver='lbfgs', random_state=42)
model.fit(X=X_train, y=y_train)

We can check its accuracy:

In [61]:
# Second column of predict_proba: probability of the positive (True) class
y_pred = model.predict_proba(X_val)[:, 1]

# Turn probabilities into hard labels at 0.5 and compare with the validation targets
accuracy_score(y_true=y_val, y_pred=(y_pred >= 0.5))

0.7978320891706718

It's a little bit better than the version without scaled features.