In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the housing dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="kc_house_data.csv")
# Preview the first rows (head() returns five rows by default).
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


Выберем столбцы, с которыми будем работать, а также выделим целевой признак на основе bedrooms (бинарная метка: 0, если спален не больше 3, и 1 в противном случае), после чего удалим столбец bedrooms из данных.

In [3]:
# Restrict the frame to the columns used in this analysis.
data2 = data[['price', 'bedrooms', 'view', 'condition', 'grade', 'sqft_living15']]
# Binary target: 1 when the home has more than 3 bedrooms, 0 otherwise.
# A vectorized comparison replaces the element-by-element apply(lambda ...).
y = data2['bedrooms'].gt(3).astype('int64')
# Remove the column the target was derived from, so it is not used as a feature.
data2 = data2.drop(columns=['bedrooms'])
y.head(10), data2.columns

(0    0
 1    0
 2    0
 3    1
 4    0
 5    1
 6    0
 7    0
 8    0
 9    0
 Name: bedrooms, dtype: int64,
 Index(['price', 'view', 'condition', 'grade', 'sqft_living15'], dtype='object'))

Предобработка данных.

In [5]:
# Count missing values in each feature column (isnull is an alias of isna).
data2.isnull().sum()

price            0
view             0
condition        0
grade            0
sqft_living15    0
dtype: int64

In [6]:
# There are no missing values in the data.

In [7]:
# Show the dtype of every feature column.
data2.dtypes

price            float64
view               int64
condition          int64
grade              int64
sqft_living15      int64
dtype: object

Стандартизируем числовые признаки (приведём каждый к нулевому среднему и единичному стандартному отклонению).

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column with StandardScaler.
norm = StandardScaler()
scaled_values = norm.fit_transform(data2)
# Wrap the scaled values in a DataFrame that carries the original column names.
X = pd.DataFrame(scaled_values, columns = data2.columns)
X.describe(), X.head()

(              price          view     condition         grade  sqft_living15
 count  2.161300e+04  2.161300e+04  2.161300e+04  2.161300e+04   2.161300e+04
 mean  -3.682080e-17 -1.052023e-17 -2.577456e-16  2.524855e-16   8.153176e-17
 std    1.000023e+00  1.000023e+00  1.000023e+00  1.000023e+00   1.000023e+00
 min   -1.266860e+00 -3.057595e-01 -3.702668e+00 -5.663344e+00  -2.316325e+00
 25%   -5.941897e-01 -3.057595e-01 -6.291869e-01 -5.588357e-01  -7.244971e-01
 50%   -2.453924e-01 -3.057595e-01 -6.291869e-01 -5.588357e-01  -2.138280e-01
 75%    2.857709e-01 -3.057595e-01  9.075535e-01  2.919156e-01   5.448802e-01
 max    1.950299e+01  4.914129e+00  2.444294e+00  4.545672e+00   6.162239e+00,
       price      view  condition     grade  sqft_living15
 0 -0.866717 -0.305759  -0.629187 -0.558836      -0.943355
 1 -0.005688 -0.305759  -0.629187 -0.558836      -0.432686
 2 -0.980849 -0.305759  -0.629187 -1.409587       1.070140
 3  0.174090 -0.305759   2.444294 -0.558836      -0.914174
 4

Разделим полученный набор данных на тестовую и обучающую выборку.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 33% of the rows for validation; random_state fixes the shuffle.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.33, random_state = 1)

# X was standardized with statistics of the full dataset, which leaks information
# about the validation rows into preprocessing. Re-standardize with statistics of
# the training rows only. Standardization is invariant to an earlier per-column
# affine rescaling, so this equals fitting the scaler on the raw training rows.
train_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(train_scaler.transform(X_train), columns = X_train.columns, index = X_train.index)
X_validation = pd.DataFrame(train_scaler.transform(X_validation), columns = X_validation.columns, index = X_validation.index)

Построим классификатор типа логистическая регрессия.

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

# Fit a logistic regression with default hyperparameters on the training split.
lr = LogisticRegression()
lr.fit(X = X_train, y = y_train)

Оценим качество построенного классификатора на тестовой выборке с помощью метрик accuracy, precision, recall и F1.

In [10]:
# Accuracy of the logistic regression on the held-out split.
accuracy_score(y_true = y_validation, y_pred = lr.predict(X_validation))

0.7004065610542549

In [11]:
# Precision of the logistic regression on the held-out split.
precision_score(y_true = y_validation, y_pred = lr.predict(X_validation))

0.6937561942517344

In [12]:
# Recall of the logistic regression on the held-out split.
recall_score(y_true = y_validation, y_pred = lr.predict(X_validation))

0.47961630695443647

In [13]:
# F1 score of the logistic regression on the held-out split.
f1_score(y_true = y_validation, y_pred = lr.predict(X_validation))

0.5671460401053271

Accuracy, доля правильных ответов - 0.7 

Precision, точность предсказания - 0.69

Recall, полнота модели - 0.48

f1_score, гармоническое среднее полноты и точности - 0.57


Построим классификатор типа случайный лес

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees using the entropy split criterion.
# random_state seeds the forest's internal randomness so that repeated runs
# of the notebook produce the same model and the same metrics.
rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 1)
rf.fit(X_train, y_train)

Оценим качество построенного классификатора на тестовой выборке с помощью метрик accuracy, precision, recall и F1.

In [17]:
# Accuracy of the random forest on the held-out split.
accuracy_score(y_true = y_validation, y_pred = rf.predict(X_validation))

0.6685826440487873

In [18]:
# Precision of the random forest on the held-out split.
precision_score(y_true = y_validation, y_pred = rf.predict(X_validation))

0.6022099447513812

In [19]:
# Recall of the random forest on the held-out split.
recall_score(y_true = y_validation, y_pred = rf.predict(X_validation))

0.5601233299075026

In [20]:
# F1 score of the random forest on the held-out split.
f1_score(y_true = y_validation, y_pred = rf.predict(X_validation))

0.5804046858359957

Accuracy, доля правильных ответов - 0.67

Precision, точность предсказания - 0.6

Recall, полнота модели - 0.56

f1_score, гармоническое среднее полноты и точности - 0.58

С помощью GridSearchCV переберём значения гиперпараметра n_estimators (число деревьев) для случайного леса.

In [21]:
from sklearn.model_selection import GridSearchCV

# Seed the forest so every candidate is trained with the same randomness;
# without it the cross-validated scores, and therefore the selected
# n_estimators, vary from run to run.
rf_grid = RandomForestClassifier(random_state = 1)

# Coarse grid over the number of trees.
param_grid = {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500]}

grid_search = GridSearchCV(estimator=rf_grid, param_grid=param_grid)
grid_search.fit(X_train, y_train)

In [24]:
# Hyperparameter combination selected by the grid search.
grid_search.best_params_

{'n_estimators': 450}

In [25]:
# Keep the estimator selected by the grid search for evaluation below.
rf_best = grid_search.best_estimator_

In [26]:
# Accuracy of the grid-searched forest on the held-out split.
accuracy_score(y_true = y_validation, y_pred = rf_best.predict(X_validation))

0.6676012897798963

In [27]:
# Precision of the grid-searched forest on the held-out split.
precision_score(y_true = y_validation, y_pred = rf_best.predict(X_validation))

0.6

In [28]:
# Recall of the grid-searched forest on the held-out split.
recall_score(y_true = y_validation, y_pred = rf_best.predict(X_validation))

0.5632065775950668

In [29]:
# F1 score of the grid-searched forest on the held-out split.
f1_score(y_true = y_validation, y_pred = rf_best.predict(X_validation))

0.5810213818695883

In [30]:
# GridSearchCV and RandomForestClassifier are already imported by earlier cells,
# so the duplicate import is dropped here.
# Seed the forest so the comparison between nearby n_estimators values is
# reproducible rather than dominated by run-to-run noise.
rf_grid = RandomForestClassifier(random_state = 1)

# Finer grid over the number of trees, in steps of 10 from 400 to 490.
param_grid = {'n_estimators': [400, 410, 420, 430, 440, 450, 460, 470, 480, 490]}

grid_search = GridSearchCV(estimator=rf_grid, param_grid=param_grid)
grid_search.fit(X_train, y_train)

In [32]:
# Hyperparameter combination selected by the second grid search.
grid_search.best_params_

{'n_estimators': 470}

In [33]:
# Keep the estimator selected by the second grid search for evaluation below.
rf_best2 = grid_search.best_estimator_

In [34]:
# Accuracy of the second grid-searched forest on the held-out split.
accuracy_score(y_true = y_validation, y_pred = rf_best2.predict(X_validation))

0.6661993551100519

In [35]:
# Precision of the second grid-searched forest on the held-out split.
precision_score(y_true = y_validation, y_pred = rf_best2.predict(X_validation))

0.5982468955441929

In [37]:
# Recall of the second grid-searched forest on the held-out split.
recall_score(y_true = y_validation, y_pred = rf_best2.predict(X_validation))

0.5611510791366906

In [38]:
# F1 score of the second grid-searched forest on the held-out split.
f1_score(y_true = y_validation, y_pred = rf_best2.predict(X_validation))

0.5791055329680043

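Таким образом, по метрикам accuracy (0.70 против 0.67) и precision (0.69 против 0.60) наиболее точным классификатором можно назвать классификатор типа логистическая регрессия. При этом случайный лес даёт более высокий recall (0.56 против 0.48) и немного больший F1 (0.58 против 0.57), а подбор n_estimators с помощью GridSearchCV заметного улучшения не дал.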