In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('house_prices/house_price_train.csv')
data = data.drop(columns=['id'])
print(data.shape)
data.head()

(1226, 14)


Unnamed: 0,author,author_type,location,deal_type,accommodation_type,floor,floors_count,rooms_count,total_meters,district,street,underground,residential_complex,price
0,BARNES International Realty,real_estate_agent,Москва,sale,flat,3,7,4,118.0,Таганский,Серебрянический переулок,Китай-город,Титул на Серебрянической,104.5
1,Гранит Инвест,real_estate_agent,Санкт-Петербург,sale,flat,6,14,2,55.9,Василеостровский,набережная Реки Смоленки,Приморская,Айно,15.2048
2,А101,developer,Москва,sale,flat,15,18,3,52.5,,,Прокшино,Прокшино ЖК,19.557878
3,Арсенал-Недвижимость,developer,Санкт-Петербург,sale,flat,2,12,1,53.48,Приморский,проспект Авиаконструкторов,Комендантский проспект,Modum,12.033
4,Contact Real Estate,real_estate_agent,Москва,sale,flat,14,16,4,136.5,Пресненский,Костикова,Улица 1905 года,Lucky,175.0


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

categorical = ['author', 'author_type', 'location', 'deal_type', 'accommodation_type', 'district', 'street', 'underground', 'residential_complex']
numeric = ['floor', 'floors_count', 'rooms_count', 'total_meters']
# other = ['price']

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('scaling', StandardScaler(), numeric),
    # ('other',  'passthrough', other)
])

prices = data.iloc[:, [-1]].to_numpy()
data = column_transformer.fit_transform(data.iloc[:, :-1]).toarray()

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, prices, train_size=0.8)

In [7]:
from sklearn.metrics import mean_absolute_error

def calc_score(true, pred):
    return (60 - mean_absolute_error(true, pred)) / 3.75

In [8]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [9]:
model = Ridge()
model.fit(X_train, y_train)
pred = model.predict(X_test)
calc_score(y_test, pred)

7.735807900332273

In [10]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
pred = model.predict(X_test)
calc_score(y_test, pred)

  model.fit(X_train, y_train)


9.855100995862328

In [11]:
model = Lasso()
model.fit(X_train, y_train)
pred = model.predict(X_test)
calc_score(y_test, pred)

7.288733249764073

In [12]:
features = {
    'iterations': 1000,
    'random_seed': 0,
    'eval_metric': 'MAE',
    'verbose': 100
}
model = CatBoostRegressor(
    **features
)
model.fit(X_train, y_train, eval_set=(X_test, y_test))
pred = model.predict(X_test)
calc_score(y_test, pred)

Learning rate set to 0.05076
0:	learn: 73.6757838	test: 70.0004842	best: 70.0004842 (0)	total: 59.2ms	remaining: 59.2s
100:	learn: 27.4170186	test: 31.4503100	best: 31.4466499 (99)	total: 202ms	remaining: 1.79s
200:	learn: 20.6357280	test: 27.9555226	best: 27.9555226 (200)	total: 346ms	remaining: 1.38s
300:	learn: 17.7761399	test: 26.9370922	best: 26.9370922 (300)	total: 491ms	remaining: 1.14s
400:	learn: 15.9145738	test: 26.2793065	best: 26.2793065 (400)	total: 636ms	remaining: 950ms
500:	learn: 14.6403040	test: 25.7060604	best: 25.7060604 (500)	total: 780ms	remaining: 777ms
600:	learn: 13.4863763	test: 25.3742112	best: 25.3688193 (598)	total: 923ms	remaining: 613ms
700:	learn: 12.1791147	test: 25.0798038	best: 25.0685010 (695)	total: 1.07s	remaining: 455ms
800:	learn: 11.3548646	test: 24.7316241	best: 24.7316241 (800)	total: 1.21s	remaining: 301ms
900:	learn: 10.7567318	test: 24.5028530	best: 24.5028530 (900)	total: 1.36s	remaining: 149ms
999:	learn: 9.9535414	test: 24.2944007	best: 

9.522044265386

In [13]:
test = pd.read_csv('house_prices/house_price_test.csv')
ids = test['id']
test = test.drop(columns=['id'])
print(test.shape)
test.head()

(303, 13)


Unnamed: 0,author,author_type,location,deal_type,accommodation_type,floor,floors_count,rooms_count,total_meters,district,street,underground,residential_complex
0,ГлавстройСПб,developer,Санкт-Петербург,sale,flat,19,24,1,34.5,Приморский,Ивинская,Комендантский проспект,Юнтолово
1,Основа,developer,Москва,sale,flat,11,13,2,71.4,Замоскворечье,Большой Строченовский переулок,Павелецкая,Резиденции Замоскворечье
2,ID 112819123,developer,Москва,sale,flat,-1,-1,-1,202.0,Щукино,проезд 4-й Красногорский,Стрешнево,Moments
3,Сенатор,developer,Санкт-Петербург,sale,flat,3,12,1,47.4,Курортный,Приморское шоссе,,МФК Морская ривьера
4,MR Group,developer,Москва,sale,flat,3,36,1,40.8,Беговой,Ленинградский проспект,Белорусская,Slava


In [14]:
test = column_transformer.transform(test)

In [15]:
pred = model.predict(test.toarray())
res = pd.DataFrame({
    'id': ids,
    'price': pred
})
res.to_csv('price_pred.csv' ,index=False)