In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
df = pd.read_csv('../data/kolesa_almaty_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,brand,model,year,city,price,mileage,engine_volume_liters,body_style,color,transmission,drive_type,url,parsed_at
0,ГАЗ,ГАЗель,2011,Алматы,5400000,193650,2.9,Фургон,белый металлик,Механика,Задний привод,https://kolesa.kz/a/show/185584438,2025-04-07 09:11:19
1,Infiniti,QX56,2011,Алматы,14500000,218688,5.6,Внедорожник,черный,Автомат,Полный привод,https://kolesa.kz/a/show/185584296,2025-04-07 09:11:27
2,Chery,Tiggo 2,2023,Алматы,5600000,17134,1.5,Кроссовер,серый,Вариатор,Передний привод,https://kolesa.kz/a/show/184249228,2025-04-07 09:11:38
3,Toyota,Windom,2001,Алматы,5200000,249874,3.0,Седан,белый,Автомат,Передний привод,https://kolesa.kz/a/show/184510195,2025-04-07 09:11:47
4,BMW,735,2000,Алматы,3900000,239760,3.5,Седан,черный металлик,Автомат,Задний привод,https://kolesa.kz/a/show/185584229,2025-04-07 09:11:53


In [4]:
df_model = df.drop(columns=['url', 'parsed_at', 'city'])

In [5]:
y = df_model['price']
X = df_model.drop(columns=['price'])

In [6]:
numeric_features = ['year', 'mileage', 'engine_volume_liters']
categorical_features = ['brand', 'model', 'body_style', 'color', 'transmission', 'drive_type']

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

In [9]:
mae, r2

(2158412.206520505, 0.9078846201358622)