In [1]:
import pandas as pd
import numpy as np


In [2]:
rooms = pd.read_csv('cian.csv')
rooms.sample(10)


Unnamed: 0,Name,Price,all_sqr,live_sqr,kitch,floor,year,pledge,comis,looks,prepayment
432,"1-комн. квартира, 40 м²",40 000,40,20,10,23,2019,40 000,50,361,1
1799,"1-комн. квартира, 38 м²",80 000,38,22,9,2,1968,80 000,комисси,121,1
591,"1-комн. квартира, 46 м²",45 000,46,18,12,7,2018,45 000,25,621,1
390,"1-комн. квартира, 38 м²",37 000,38,19,6,4,1960,37 000,70,1 0,1
743,"1-комн. квартира, 42 м²",95 000,42,26,10,5,2021,95 000,30,43,1
1648,"1-комн. квартира, 38 м²",50 000,38,20,9,3,1988,50 000,комисси,5 3,1
1700,"1-комн. квартира, 31 м²",60 000,31,15,6,2,2020,60 000,комисси,521,1
635,"1-комн. квартира, 27 м²",50 000,27,12,10,14,2019,55 000,50,655,1
1196,"1-комн. квартира, 38 м²",80 000,38,22,9,2,1968,80 000,комисси,94,1
1708,"1-комн. квартира, 35 м²",43 000,35,18,8,1,1960,43 000,50,79,1


# Подготовка данных

In [3]:
print(len(rooms))
rooms = rooms.drop_duplicates()
print(len(rooms))


1903
1458


In [4]:
rooms['Price'] = rooms['Price'].map(lambda x: np.int64(x.split()[0]))
rooms['pledge'] = rooms['pledge'].map(lambda x: np.int64(x.split()[0]) if x[0].isdigit() else np.nan)
rooms.pledge = rooms.pledge.map(lambda x: np.int64(x) if not np.isnan(x) else np.nan)
rooms['looks'] = rooms['looks'].map(lambda x: np.int64('0'.join(x.split())) if ' ' in x and 'п' not in x else np.int64(x.split()[0]))
rooms['comis'] = rooms['comis'].map(lambda x: np.float64('0.' + str(x)) if str(x).isdigit() else np.nan)
rooms.live_sqr = rooms.live_sqr.map(lambda x: np.int64(x) if ',' not in x else np.int64(x[:-1]))
rooms.kitch = rooms.kitch.map(lambda x: np.int64(x) if ',' not in x else np.int64(x[:-1]))
#rooms.pledge = rooms.pledge.astype(np.int64)
rooms = rooms.drop('prepayment', axis=1)



In [9]:
print(rooms.isna().sum())
rooms.dropna(inplace=True)
print(rooms.isna().sum())


Name          0
Price         0
all_sqr       0
live_sqr      0
kitch         0
floor         0
year          0
pledge       73
comis       244
looks         0
dtype: int64
Name        0
Price       0
all_sqr     0
live_sqr    0
kitch       0
floor       0
year        0
pledge      0
comis       0
looks       0
dtype: int64


# Разделение выборки

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# np.set_printoptions(suppress=True)
Y = pd.cut(rooms.Price.rank(pct=True), bins=[0, 0.33, 0.66, 1], labels=['low', 'middle', 'high'])
X = rooms.drop(['Name', 'Price'], axis=1).values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Логистическая регрессия

In [11]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression() # random_state=42, solver='lbfgs', max_iter=100)
logr.fit(X_train, Y_train)
logr_pred = logr.predict(X_test)
accuracy_score(Y_test, logr_pred)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9135802469135802

# Дерево решений

In [12]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier() # max_depth = 4, random_state=42)
clf = clf.fit(X_train, Y_train)
clf_pred = clf.predict(X_test)
accuracy_score(Y_test, clf_pred)


0.9711934156378601

# Ансамбль

In [13]:
from sklearn.ensemble import VotingClassifier

ans = VotingClassifier(estimators=[('lr', logr), ('dtc', clf)], voting='hard')
ans = ans.fit(X_train, Y_train)
ans_pred = ans.predict(X_test)
accuracy_score(Y_test, ans_pred)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9629629629629629

# KNC

In [14]:
from sklearn.neighbors import KNeighborsClassifier

knc = KNeighborsClassifier()
knc.fit(X_train, Y_train)
knc_pred = knc.predict(X_test)
accuracy_score(Y_test, knc_pred)


0.8024691358024691

# GridSearch

In [15]:
from sklearn.model_selection import GridSearchCV
params = {
    'n_neighbors' : np.arange(1, 11),
    'metric' : ['manhattan', 'euclidean'],
    'weights' : ['uniform', 'distance']
}

knc_grid = GridSearchCV(knc, params, cv=5, scoring='accuracy', n_jobs=-1)
knc_grid.fit(X_train, Y_train)
accuracy_score(Y_test, knc_grid.best_estimator_.predict(X_test))


0.8518518518518519