In [80]:
import os
import numpy as np
import pandas as pd
from glob import glob
import seaborn as sns
from shapely.geometry import Polygon
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from catboost import CatBoostRegressor, Pool
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [81]:
npy_files = glob('./GEN_DB/**/numpy/*.npy', recursive=True)
# print(npy_files)
data_np = [np.load(file, allow_pickle=True) for file in npy_files]
print(f"Loaded {len(data_np)} .npy files.")


Loaded 1106 .npy files.


In [82]:
rows = []

for arr in data_np:
    item = arr.item()
    
    row = {
        'plot_points': item['plot_points'],
        'offset_points': item['offset_points'],
        'polygon_area': item['metadata']['polygon_area'],
        'floors': item['metadata']['floors'],
        'setback': item['metadata']['setback'],
        'max_living_area': item['metadata']['max_living_area'],
        'max_footprint_area': item['metadata']['max_footprint_area'],
        'footprint_area': item['metadata']['footprint_area'],
        'num_buildings': item['metadata']['num_buildings']
    }

    if item['buildings']:
        building = item['buildings'][0]
        row.update({
            'building_points': building['points'],
            'building_center': building['center'],
            'building_angle': building['angle'],
            'building_type': building['type'],
            'building_area': building['area']
        })
    
    rows.append(row)

df = pd.DataFrame(rows)
df.sample()
df.columns

Index(['plot_points', 'offset_points', 'polygon_area', 'floors', 'setback',
       'max_living_area', 'max_footprint_area', 'footprint_area',
       'num_buildings', 'building_points', 'building_center', 'building_angle',
       'building_type', 'building_area'],
      dtype='object')

In [83]:
def calculate_polygon_area(points):
    polygon = Polygon(points)
    return polygon.area

df['offset_polygon_area'] = df['offset_points'].apply(calculate_polygon_area)

In [84]:
df['living_area']  = df['footprint_area'] * df['floors']

In [85]:
df = df.drop(columns=['plot_points', 'offset_points', 'building_area', 'offset_polygon_area',
'building_points', 'building_center', 'building_angle', 'num_buildings'])

In [86]:
# df.describe()

In [87]:
df.sample(5)

Unnamed: 0,polygon_area,floors,setback,max_living_area,max_footprint_area,footprint_area,building_type,living_area
51,3411.5,16,12.0,6925.345,618.334375,416.0,"(16, 26)",6656.0
531,4167.5,5,8.0,5542.775,1583.65,864.0,"(54, 16)",4320.0
571,3852.0,11,12.0,7049.16,915.475325,416.0,"(26, 16)",4576.0
141,3869.0,4,8.0,4565.42,1630.507143,772.0,L-18x18+28x16,3088.0
665,3349.0,2,8.0,7434.78,5310.557143,468.0,"(18, 26)",936.0


In [88]:
df.columns

Index(['polygon_area', 'floors', 'setback', 'max_living_area',
       'max_footprint_area', 'footprint_area', 'building_type', 'living_area'],
      dtype='object')

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106 entries, 0 to 1105
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   polygon_area        1106 non-null   float64
 1   floors              1106 non-null   int64  
 2   setback             1106 non-null   float64
 3   max_living_area     1106 non-null   float64
 4   max_footprint_area  1106 non-null   float64
 5   footprint_area      1106 non-null   float64
 6   building_type       1106 non-null   object 
 7   living_area         1106 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 69.2+ KB


In [90]:
unique_values = {col: df[col].unique() for col in df.columns}

print(unique_values)

{'polygon_area': array([3807. , 3247.5, 3411.5, 3955.5, 3669. , 3128. , 3419. , 3195.5,
       3027.5, 3869. , 3207. , 3202. , 3441. , 3132. , 3939.5, 3884.5,
       3194. , 3525. , 3494. , 3478. , 4302.5, 3709.5, 3723.5, 4041. ,
       3676. , 3258.5, 4725. , 3238. , 3302. , 3764. , 3249.5, 4167.5,
       3770.5, 3852. , 3733.5, 3782. , 3040.5, 3244.5, 3954. , 3349. ,
       3578. , 4460. , 3518.5, 3143.5, 3895. , 3161.5, 3340.5, 3332.5,
       3326.5, 3485. , 3071. , 3446. , 3761.5, 3114.5, 4342.5, 3179. ,
       3075. , 3727.5, 3734.5, 3437. , 3524.5, 3774.5, 3811. , 3091.5,
       3653.5]), 'floors': array([13, 17, 10,  5, 19, 21, 15,  3, 23,  7,  6, 22, 24, 25,  2,  8,  4,
       16, 12, 18,  9, 11, 14, 20]), 'setback': array([12.,  8.]), 'max_living_area': array([ 7309.44 ,  7842.42 ,  6776.46 ,  5063.31 ,  8032.77 ,  8185.05 ,
        7614.   ,  3807.   ,  8337.33 ,  5900.85 ,  5520.15 ,  8261.19 ,
        8413.47 ,  8451.54 ,  5325.9  ,  5780.55 ,  7209.45 ,  3832.05 ,
        

In [91]:
X = df.drop('living_area', axis=1)
y = df['living_area']

categorical_cols = ['building_type']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

X_processed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Model Coefficients: {model.coef_}")
print(f"Model Intercept: {model.intercept_}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}")

predictions = model.predict(X_processed)

df['predicted_living_area'] = predictions

Model Coefficients: [-2.59271493e-01  3.28096043e+02 -1.69842271e+01  3.93325397e-01
 -4.60330504e-01  5.34110785e+00 -3.22760991e+01  1.82765615e+02
 -6.88318626e+02  2.50304278e+02 -5.42953431e+01  3.34984555e+02
  1.94512301e+02  1.97752451e+02  4.03438771e+02 -5.71581310e+01
  3.29377050e+02 -9.09252263e+01 -2.33196311e+02 -1.81926400e+01
  1.19935465e+02 -1.22835858e+02 -2.42538897e+02 -1.88429150e+02
 -2.84904205e+02]
Model Intercept: -2061.0820322621576
Mean Absolute Error: 370.67
Mean Absolute Percentage Error: 0.16


In [92]:
df = df.drop(columns=['predicted_living_area'])

In [93]:
df.sample()

Unnamed: 0,polygon_area,floors,setback,max_living_area,max_footprint_area,footprint_area,building_type,living_area
97,3419.0,16,12.0,6940.57,619.69375,324.0,"(18, 18)",5184.0


In [94]:
# import os
# from catboost import CatBoostRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
# import pandas as pd

# Предполагаем, что df уже загружен
X = df.drop('living_area', axis=1)
y = df['living_area']

categorical_cols = ['building_type']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

cat_features_indices = [X.columns.get_loc(col) for col in categorical_cols]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostRegressor(
    cat_features=cat_features_indices,
    random_seed=42,
    verbose=0
)

model.fit(X_train, y_train)

# Создаем папку model, если она не существует
os.makedirs('./model', exist_ok=True)

# Сохраняем модель
model.save_model('./model/catboost_model.cbm')

# Дальнейший код для оценки и анализа модели
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}")

predictions = model.predict(X)

df['predicted_living_area'] = predictions

feature_importances = model.get_feature_importance()
feature_names = X.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f"{name}: {score:.2f}")

Mean Absolute Error: 42.75
Mean Absolute Percentage Error: 0.01
floors: 42.64
footprint_area: 30.50
max_footprint_area: 12.91
max_living_area: 6.52
setback: 5.73
polygon_area: 1.60
building_type: 0.09


In [95]:
df = df.drop('predicted_living_area', axis=1)

In [96]:
import os
import pickle
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Предполагаем, что df уже загружен
X = df.drop('living_area', axis=1)
y = df['living_area']

# Определяем категориальные и числовые признаки
categorical_cols = ['building_type']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Применяем One-Hot Encoding к категориальным признакам
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[categorical_cols])
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# Создаем DataFrame с закодированными признаками
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_cols, index=X.index)

# Объединяем с числовыми признаками
X_processed = pd.concat([X[numerical_cols], X_encoded_df], axis=1)

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Теперь у нас нет категориальных признаков, поэтому не указываем cat_features
model = CatBoostRegressor(
    random_seed=42,
    verbose=0
)

model.fit(X_train, y_train)

# Сохраняем модель и кодировщик
os.makedirs('./model', exist_ok=True)

with open('./model/catboost_model.pkl', 'wb') as f:
    pickle.dump(model, f)
    
with open('./model/onehot_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

# Оценка модели
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}")

# Предсказания для всего набора данных
X_full_encoded = encoder.transform(X[categorical_cols])
X_full_encoded_df = pd.DataFrame(X_full_encoded, columns=encoded_cols, index=X.index)
X_full_processed = pd.concat([X[numerical_cols], X_full_encoded_df], axis=1)

predictions = model.predict(X_full_processed)
df['predicted_living_area'] = predictions

# Важность признаков
feature_importances = model.get_feature_importance()
feature_names = X_processed.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f"{name}: {score:.2f}")

Mean Absolute Error: 43.98
Mean Absolute Percentage Error: 0.01
floors: 44.95
footprint_area: 19.42
max_footprint_area: 13.62
setback: 9.61
building_type_(18, 18): 5.73
max_living_area: 5.27
polygon_area: 0.94
building_type_(26, 34): 0.25
building_type_L-18x18+26x16: 0.06
building_type_(26, 18): 0.05
building_type_(44, 18): 0.03
building_type_L-26x16+18x18: 0.02
building_type_(16, 26): 0.01
building_type_L-28x16+18x18: 0.01
building_type_(18, 26): 0.01
building_type_L-26x18+18x18: 0.00
building_type_L-18x18+26x18: 0.00
building_type_L-26x16+16x28: 0.00
building_type_(26, 16): 0.00
building_type_L-26x16+18x26: 0.00
building_type_(16, 28): 0.00
building_type_(28, 16): 0.00
building_type_L-18x18+28x16: 0.00
building_type_L-26x18+16x26: 0.00
building_type_(54, 16): 0.00
