# Model for Streamlit

#### Step 1. Import all libraries.

In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle


#### Step 2. Read the CSV.

In [2]:

# Load the raw listings into a DataFrame and preview the first five rows.
df = pd.read_csv("apartment_price_in_Kokshetau_kaggle.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,price,area,flat_toilets,balcony,current_floors,total_floors,ceiling,dorm,mortgage,...,type_of_house,repair_status,street,latitude,longitude,is_in_kokshetau,distance_to_nearest_school,distance_to_nearest_kindergarten,distance_to_nearest_pharmacy,distance_to_center
0,0,9500000.0,39.0,совмещенный,лоджия,4.0,5.0,2.7,нет,нет,...,кирпичный,частично,"Юбилейный 43, Кокшетау",53.269046,69.424945,True,0.335102,0.336834,0.468495,3.712312
1,1,39000000.0,76.1,раздельный,нет,4.0,10.0,3.0,нет,нет,...,кирпичный,частично,"Кенесары 41а, Кокшетау",53.262199,69.369769,True,0.738231,2.777238,2.062675,2.429681
2,2,29990000.0,75.0,раздельный,нет,9.0,10.0,3.0,нет,нет,...,кирпичный,полностью,"Сарыарка 2г, Кокшетау",53.320954,69.38596,True,0.393297,0.360013,1.196087,4.192638
3,3,10900000.0,37.0,совмещенный,балкон,1.0,5.0,2.7,нет,нет,...,кирпичный,частично,"Абылай хана, Кокшетау",53.28547,69.381409,True,0.560507,0.449888,0.083088,0.470146
4,4,13400000.0,50.8,совмещенный,нет,6.0,6.0,2.7,нет,нет,...,панельный,частично,"Абылай хана 20, Кокшетау",53.300842,69.387853,True,0.785759,0.409501,0.48886,2.07822


#### Step 3. Drop unused columns and apply basic preprocessing.

In [3]:

# Remove the columns that are not used as model features: 'Unnamed: 0'
# (presumably a saved row index — verify against the CSV) plus the address,
# coordinate, distance and house-description fields.
# The frame is rebound instead of mutated in place, and errors='ignore' skips
# columns that are already gone, so this cell can be re-run without a KeyError.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'is_in_kokshetau', 'street', 'latitude', 'longitude',
        'distance_to_center', 'repair_status', 'type_of_house',
        'distance_to_nearest_school', 'distance_to_nearest_kindergarten',
        'distance_to_nearest_pharmacy',
    ],
    errors='ignore',
)

# Keep only the text before the first comma in `area` and parse it as a float.
# Casting to str first keeps the `.str` accessor valid even when `area` is
# already numeric (for example on a second run of this cell).
df['area'] = df['area'].astype(str).str.split(',').str[0].astype(float)

# Exclude listings whose `year` is 2025.
# The index is intentionally NOT reset here: the cleaning cell further down
# removes outliers by their original index labels.
df = df[df['year'] != 2025]

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5122 entries, 0 to 5270
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           5122 non-null   float64
 1   area            5122 non-null   float64
 2   flat_toilets    5122 non-null   object 
 3   balcony         5122 non-null   object 
 4   current_floors  5122 non-null   float64
 5   total_floors    5122 non-null   float64
 6   ceiling         5122 non-null   float64
 7   dorm            5122 non-null   object 
 8   mortgage        5122 non-null   object 
 9   year            5122 non-null   int64  
dtypes: float64(5), int64(1), object(4)
memory usage: 440.2+ KB


#### Step 4. Top 10 rows by price, area, and ceiling to identify abnormal values.

In [4]:

# Ten highest-priced listings, shown to spot implausible prices.
top_10_price = df.sort_values('price', ascending=False).iloc[:10]
top_10_price

Unnamed: 0,price,area,flat_toilets,balcony,current_floors,total_floors,ceiling,dorm,mortgage,year
2518,185000000.0,62.6,раздельный,нет,1.0,5.0,2.7,нет,нет,1992
86,180800000.0,272.0,раздельный,нет,9.0,10.0,3.4,нет,нет,2020
1909,105000000.0,142.9,раздельный,нет,7.0,9.0,2.7,нет,нет,2023
18,101601245.0,141.5,раздельный,нет,9.0,10.0,3.4,нет,нет,2020
702,100000000.0,151.0,раздельный,нет,3.0,9.0,3.0,нет,нет,2023
211,100000000.0,141.0,раздельный,нет,2.0,9.0,2.7,нет,нет,2021
799,100000000.0,141.8,раздельный,нет,5.0,9.0,3.0,нет,нет,2020
648,100000000.0,156.0,2 с/у и более,нет,10.0,13.0,3.0,нет,нет,2024
638,100000000.0,141.0,2 с/у и более,нет,2.0,9.0,3.0,нет,нет,2021
4175,100000000.0,151.0,раздельный,нет,3.0,9.0,3.0,нет,нет,2023


In [5]:

# Ten largest listings by area, shown to spot implausible sizes.
top_10_area = df.sort_values('area', ascending=False).iloc[:10]
top_10_area

Unnamed: 0,price,area,flat_toilets,balcony,current_floors,total_floors,ceiling,dorm,mortgage,year
894,33000000.0,765.0,совмещенный,нет,9.0,9.0,2.7,нет,нет,2022
582,9500000.0,321.0,совмещенный,нет,1.0,5.0,2.7,нет,нет,1980
1069,90000000.0,300.0,раздельный,нет,1.0,3.0,3.6,нет,нет,2024
86,180800000.0,272.0,раздельный,нет,9.0,10.0,3.4,нет,нет,2020
1150,50000000.0,180.9,совмещенный,лоджия,7.0,10.0,2.7,нет,нет,2022
3444,48000000.0,175.0,раздельный,нет,4.0,4.0,2.7,нет,нет,2011
667,48000000.0,170.0,раздельный,нет,4.0,4.0,2.7,нет,нет,2011
3405,48000000.0,170.0,2 с/у и более,несколько балконов или лоджий,4.0,4.0,3.0,нет,нет,2011
809,60000000.0,167.3,2 с/у и более,нет,5.0,5.0,3.0,нет,нет,2015
4498,85000000.0,160.0,совмещенный,нет,6.0,11.0,2.8,нет,нет,2022


In [6]:

# Ten largest `ceiling` values, shown to spot implausible entries.
top_10_ceiling = df.sort_values('ceiling', ascending=False).iloc[:10]
top_10_ceiling

Unnamed: 0,price,area,flat_toilets,balcony,current_floors,total_floors,ceiling,dorm,mortgage,year
4870,18000000.0,65.0,раздельный,лоджия,5.0,5.0,100.0,нет,нет,1989
1309,34000000.0,92.0,2 с/у и более,нет,6.0,9.0,28.0,нет,нет,2018
3051,14700000.0,45.0,совмещенный,нет,4.0,5.0,27.0,нет,нет,2024
559,19500000.0,58.0,совмещенный,нет,6.0,9.0,27.0,нет,нет,2014
4624,18500000.0,39.0,совмещенный,нет,6.0,9.0,27.0,нет,нет,2018
3833,8900000.0,30.0,совмещенный,нет,1.0,2.0,25.0,нет,нет,1969
1828,22500000.0,61.0,раздельный,несколько балконов или лоджий,4.0,6.0,25.0,нет,нет,1992
1069,90000000.0,300.0,раздельный,нет,1.0,3.0,3.6,нет,нет,2024
18,101601245.0,141.5,раздельный,нет,9.0,10.0,3.4,нет,нет,2020
144,79198709.0,110.3,раздельный,нет,9.0,10.0,3.4,нет,нет,2020


#### Step 5. Data cleaning and preprocessing.

In [7]:

# Remove hand-picked rows by their original index labels, then renumber the
# index from zero.
# NOTE(review): label 852 does not appear in any of the top-10 tables above,
# while label 894 (area 765.0) does and is kept — confirm 852 is intended.
df = df.drop(
    index=[2518, 582, 4870, 1309, 3051, 559, 4624, 3833, 1828, 852]
).reset_index(drop=True)

# Binary flags: 0 when the value is exactly 'нет', 1 for anything else.
# NOTE(review): the previewed `flat_toilets` values ('совмещенный',
# 'раздельный', '2 с/у и более') all encode to 1 — confirm the column still
# carries signal after this step.
df['balcony'] = df['balcony'].ne('нет').astype('int64')
df['flat_toilets'] = df['flat_toilets'].ne('нет').astype('int64')

# Explicit two-value encodings; any value missing from a mapping becomes NaN,
# which the null count printed below would reveal.
df['mortgage'] = df['mortgage'].map({'нет': 0, 'В залоге': 1})
df['dorm'] = df['dorm'].map({'нет': 0, 'да': 1})

# Per-column count of missing values after cleaning.
print(df.isna().sum())

price             0
area              0
flat_toilets      0
balcony           0
current_floors    0
total_floors      0
ceiling           0
dorm              0
mortgage          0
year              0
dtype: int64


#### Step 6. Create new features from existing columns.

In [8]:

# Derived features:
#   age_of_building - 2025 minus the `year` column
#   floor_ratio     - `current_floors` divided by `total_floors`
df = df.assign(
    age_of_building=2025 - df["year"],
    floor_ratio=df["current_floors"] / df["total_floors"],
)

#### Step 7. Predict price with RandomForestRegressor.

In [9]:

# Features are every column except the target; the target is `price`.
X = df.drop(columns=['price'])
y = df['price']

# Hold out 20% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state also seeds the forest itself (bootstrap sampling and feature
# selection), so re-running the notebook reproduces the same model and metrics.
model = RandomForestRegressor(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out rows.
print(f'R^2 score: {r2_score(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')

R^2 score: 0.8180470766117902
MAE: 1842874.8089526834
MSE: 22107370644902.21


#### Step 8. Save the model to a .pkl file for use in Streamlit.

In [10]:

# Serialize the fitted regressor to rf_model.pkl in the working directory.
# NOTE(review): pickled scikit-learn estimators are generally expected to be
# loaded with the same library version that wrote them — confirm the app's
# environment matches.
with open("rf_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
