In [132]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [133]:
df = pd.read_csv("astana.csv")
df

Unnamed: 0,index,location,price,floor,area,is_furnished,rooms
0,681098403,Алматы,280000,4,55.0,полностью,2
1,688558724,Сарыарка,165000,10,60.0,полностью,2
2,687779417,Есильский,200000,12,40.0,полностью,1
3,681851810,Алматы,300000,9,55.0,полностью,2
4,688418774,Сарыарка,180000,8,40.0,полностью,1
...,...,...,...,...,...,...,...
4227,685451375,Алматы,250000,10,72.0,частично,4
4228,681736449,Есильский,290000,7,69.0,полностью,5
4229,682406566,Есильский,200000,4,38.0,полностью,3
4230,675517453,Есильский,270000,2,73.0,полностью,3


In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4232 entries, 0 to 4231
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         4232 non-null   int64  
 1   location      4232 non-null   object 
 2   price         4232 non-null   int64  
 3   floor         4232 non-null   int64  
 4   area          4232 non-null   float64
 5   is_furnished  4232 non-null   object 
 6   rooms         4232 non-null   int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 231.6+ KB


In [135]:
numerical_features = ["price", "floor", "area", "rooms"]
categorical_features = ["location", "is_furnished"]

In [136]:
correlation_matrix = df[numerical_features].corr()
correlation_matrix

Unnamed: 0,price,floor,area,rooms
price,1.0,0.216194,0.787738,0.062492
floor,0.216194,1.0,0.214294,0.015775
area,0.787738,0.214294,1.0,0.062954
rooms,0.062492,0.015775,0.062954,1.0


As we saw in EDA `area` has the highest correlation with `price`

In [137]:
df_train = df.drop(["price"], axis=1)

In [138]:
# target = np.log1p(df.price)
target = df.price
target

0       280000
1       165000
2       200000
3       300000
4       180000
         ...  
4227    250000
4228    290000
4229    200000
4230    270000
4231    400000
Name: price, Length: 4232, dtype: int64

In [139]:
X_train_full, X_test, y_train_full, y_test = train_test_split(df_train, target, test_size=0.2, random_state=118)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=118)

In [140]:
X_train.shape, X_val.shape, X_test.shape

((2538, 6), (847, 6), (847, 6))

In [141]:
dv = DictVectorizer(sparse=False)
X_train_matrix = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_matrix = dv.transform(X_val.to_dict(orient="records"))

In [142]:
model = LinearRegression()
model.fit(X_train_matrix, y_train)

# Predicting on validation set
y_val_pred = model.predict(X_val_matrix)

# Calculating RMSE
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
rmse

150524.09915459223

In [143]:
sample_entity = {
    "location": "Есильский",
    "floor": 5,
    "area": 42,
    "is_furnished": "полностью",
    "rooms": 2
}
X = dv.transform(sample_entity)
round(model.predict(X)[0], 0)

86034.0

In [144]:
X_test.drop(["index"], axis=1).iloc[13].to_dict()

{'location': 'Алматы',
 'floor': 10,
 'area': 70.0,
 'is_furnished': 'частично',
 'rooms': 3}

In [145]:
y_test.iloc[13]

180000

In [146]:
X = dv.transform(X_test.drop(["index"], axis=1).iloc[13].to_dict())
round(model.predict(X)[0], 0)

170979.0

Close enough!

In [147]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
dv = DictVectorizer(sparse=True)
X_train_matrix = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_matrix = dv.transform(X_val.to_dict(orient="records"))

In [148]:
dt = DecisionTreeRegressor(max_depth=1)

In [149]:
dt.fit(X_train_matrix, y_train)

In [150]:
# Predicting on validation set
y_val_pred = dt.predict(X_val_matrix)

# Calculating RMSE
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
rmse

186748.30161706512

In [151]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

In [152]:
rf.fit(X_train_matrix, y_train)

In [153]:
preds = rf.predict(X_val_matrix)
mean_squared_error(y_val, preds, squared=False)

173171.63473485326

In [154]:
for n in [10, 15, 20, 25, 30]:
    score = []
    for m in range(10, 200, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=n, random_state=118, n_jobs=-1)
        rf.fit(X_train_matrix, y_train)
        preds = rf.predict(X_val_matrix)
        score.append(round(mean_squared_error(y_val, preds, squared=False), 3))
    print("max_depth=", n, "rmse=", round(np.mean(score), 2))

max_depth= 10 rmse= 166616.13
max_depth= 15 rmse= 165933.62
max_depth= 20 rmse= 162918.62
max_depth= 25 rmse= 165295.68
max_depth= 30 rmse= 161251.11


In [156]:
import xgboost as xgb

In [157]:
# Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train_matrix, label=y_train)
dval = xgb.DMatrix(X_val_matrix, label=y_val)

In [158]:
# Create a watchlist
watchlist = [(dtrain, "train"), (dval, "dval")]

In [159]:
# Train a model with these parameters for 100 rounds:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

[0]	train-rmse:262602.38241	dval-rmse:192639.73040
[1]	train-rmse:216768.42386	dval-rmse:163967.21942
[2]	train-rmse:185465.09681	dval-rmse:152173.98157
[3]	train-rmse:161903.72084	dval-rmse:146478.57950
[4]	train-rmse:144945.66960	dval-rmse:145040.57078
[5]	train-rmse:133432.80493	dval-rmse:142631.40434
[6]	train-rmse:124955.20689	dval-rmse:142742.72435
[7]	train-rmse:117094.86784	dval-rmse:143139.60265
[8]	train-rmse:111271.96293	dval-rmse:143436.37667
[9]	train-rmse:107159.95618	dval-rmse:143436.27979
[10]	train-rmse:103730.12260	dval-rmse:143244.05038
[11]	train-rmse:100260.33671	dval-rmse:144305.99469
[12]	train-rmse:96004.90869	dval-rmse:146443.03492
[13]	train-rmse:94638.54342	dval-rmse:146368.02116
[14]	train-rmse:93607.54529	dval-rmse:146005.58594
[15]	train-rmse:92316.13008	dval-rmse:146073.87473
[16]	train-rmse:91151.53268	dval-rmse:146809.53505
[17]	train-rmse:88249.04831	dval-rmse:146836.03795
[18]	train-rmse:87547.80788	dval-rmse:146017.33405
[19]	train-rmse:86519.44839	d