In [104]:
import pandas as pd
import numpy as np

In [105]:
# Load the housing dataset from the project's data directory.
DATA_PATH = "./data/houses.csv"
df = pd.read_csv(DATA_PATH)

In [106]:
# Drop every row that contains at least one missing value.
# Rebinding (instead of `inplace=True`) keeps the step chainable and avoids
# mutating a frame that other names might still reference.
df = df.dropna()
# Summarize the remaining columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  int64  
 3   total_rooms         20433 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  int64  
 6   households          20433 non-null  int64  
 7   median_income       20433 non-null  float64
 8   ocean_proximity     20433 non-null  object 
 9   median_house_value  20433 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.7+ MB


In [107]:
# Preview the first five rows after dropping missing values.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


Работа с признаками

In [108]:
def distance(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit.
        Defaults to 6371 (mean Earth radius in kilometres).

    Returns
    -------
    float
        Distance along the sphere surface, in the unit of ``radius``.
    """
    import math

    d_lat = math.radians(lat2 - lat1)
    d_lon = math.radians(lon2 - lon1)
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)

    sin_half_lat = math.sin(d_lat / 2)
    sin_half_lon = math.sin(d_lon / 2)
    a = sin_half_lat * sin_half_lat + sin_half_lon * sin_half_lon * math.cos(phi1) * math.cos(phi2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

In [109]:

# Reference city coordinates as (latitude, longitude) pairs in degrees;
# element 0 is passed to distance() as lat2 and element 1 as lon2.
los_angeles = (34.05224006522778, -118.24340295060638)
san_francisco = (37.768869788439, -122.41781584327025)

In [110]:
# Add one feature per reference city: distance from each row's coordinates
# to the city, computed row by row with distance().
df['los_angeles_dist'] = df.apply(
    lambda row: distance(row.latitude, row.longitude, *los_angeles),
    axis=1,
)
df['san_francisco_dist'] = df.apply(
    lambda row: distance(row.latitude, row.longitude, *san_francisco),
    axis=1,
)

In [111]:
# Check the two new distance columns on the first five rows.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value,los_angeles_dist,san_francisco_dist
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600,556.544948,20.611281
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500,554.295669,20.115667
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100,554.626612,18.038918
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300,555.210185,17.283751
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200,555.210185,17.283751


In [112]:
# One-hot encode the categorical `ocean_proximity` column as integer 0/1 indicators.
ocean = pd.get_dummies(df['ocean_proximity'], dtype=int)
ocean.head(3)

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0


In [113]:
# Append the one-hot indicator columns to the right of the existing features.
df = pd.concat([df, ocean], axis="columns")

In [114]:
# Remove the raw coordinate and category columns now that derived features
# (city distances, one-hot indicators) replace them.
# Rebinding instead of `inplace=True` follows pandas' recommended style.
df = df.drop(columns=["longitude", "latitude", "ocean_proximity"])

In [115]:
# Preview the final feature table (head() shows five rows by default).
df.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,los_angeles_dist,san_francisco_dist,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41,880,129.0,322,126,8.3252,452600,556.544948,20.611281,0,0,0,1,0
1,21,7099,1106.0,2401,1138,8.3014,358500,554.295669,20.115667,0,0,0,1,0
2,52,1467,190.0,496,177,7.2574,352100,554.626612,18.038918,0,0,0,1,0
3,52,1274,235.0,558,219,5.6431,341300,555.210185,17.283751,0,0,0,1,0
4,52,1627,280.0,565,259,3.8462,342200,555.210185,17.283751,0,0,0,1,0


In [116]:
# 80/20 split: sample the training rows with a fixed seed, then keep every
# remaining row (by index label) as the test set.
train_df = df.sample(frac=0.8, random_state=200)
test_df = df.drop(index=train_df.index)

Обучаем модельку

In [117]:
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

In [118]:
# Separate the regression target from the feature columns for both splits.
TARGET = "median_house_value"

X = train_df.drop(columns=[TARGET])
y = train_df[TARGET]

X_test = test_df.drop(columns=[TARGET])
y_test = test_df[TARGET]

In [119]:
# Baseline model: ordinary least squares fitted on the training split.
reg = LinearRegression()
reg.fit(X, y)

In [120]:
# Coefficient of determination (R^2) of the baseline on the training data.
reg.score(X=X, y=y)

0.6498506344865067

In [121]:
# Fitted linear coefficients, one per feature column of X (same order as X.columns).
reg.coef_

array([ 9.21301409e+02, -4.90475377e+00,  8.25945568e+01, -3.84905411e+01,
        6.37045476e+01,  3.86908359e+04, -1.15028677e+02, -1.17417577e+02,
       -3.46062373e+04, -9.79056567e+04,  1.76224795e+05, -3.54587793e+04,
       -8.25412171e+03])

Посчитаем метрики

In [122]:
from sklearn.metrics import mean_squared_error

In [123]:
# Baseline predictions on the held-out test split.
y_pred = reg.predict(X=X_test)

In [124]:
# Test-set RMSE of the baseline.
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6; taking the square root of the MSE works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

67556.50257371225

In [125]:
from catboost import CatBoostRegressor

In [143]:
# Gradient-boosted trees; hyperparameters are collected in one place.
catboost_params = {
    "iterations": 10000,   # number of boosting iterations
    "learning_rate": 0.1,  # learning rate
    "depth": 4,            # tree depth
    "random_state": 42,    # fixed random seed for reproducibility
}
model = CatBoostRegressor(**catboost_params)

In [144]:
# Train on the training split; log the learning metric every 100 iterations.
model.fit(X=X, y=y, verbose=100)

0:	learn: 109512.7233409	total: 1.47ms	remaining: 14.7s
100:	learn: 56109.8396828	total: 121ms	remaining: 11.9s
200:	learn: 51397.7924130	total: 242ms	remaining: 11.8s
300:	learn: 48851.8497163	total: 361ms	remaining: 11.6s
400:	learn: 47064.2538699	total: 467ms	remaining: 11.2s
500:	learn: 45653.8993745	total: 577ms	remaining: 10.9s
600:	learn: 44486.4203162	total: 677ms	remaining: 10.6s
700:	learn: 43514.7408030	total: 786ms	remaining: 10.4s
800:	learn: 42693.2093095	total: 937ms	remaining: 10.8s
900:	learn: 41909.4285385	total: 1.05s	remaining: 10.6s
1000:	learn: 41195.6998513	total: 1.15s	remaining: 10.3s
1100:	learn: 40537.3861430	total: 1.26s	remaining: 10.2s
1200:	learn: 39910.2965926	total: 1.36s	remaining: 10s
1300:	learn: 39315.3412863	total: 1.47s	remaining: 9.86s
1400:	learn: 38781.6043440	total: 1.58s	remaining: 9.69s
1500:	learn: 38232.3722227	total: 1.68s	remaining: 9.54s
1600:	learn: 37746.9099237	total: 1.79s	remaining: 9.38s
1700:	learn: 37286.0354612	total: 1.9s	rema

<catboost.core.CatBoostRegressor at 0x7f059bd86560>

In [145]:
# CatBoost predictions on the TRAINING features.
# NOTE: this rebinds `y_pred`, which previously held the linear model's test predictions.
y_pred = model.predict(data=X)

In [146]:
# Training RMSE of the CatBoost model: root of the mean squared error.
squared_errors = (y - y_pred) ** 2
mse = np.mean(squared_errors)  # mean squared error
mse ** 0.5

20769.21284009507

In [147]:
# CatBoost predictions on the held-out test features.
# NOTE: the name `y_pref` (sic) is what the test-RMSE cell reads, so it is kept as is.
y_pref = model.predict(data=X_test)

In [148]:
# Test RMSE of the CatBoost model: root of the mean squared error.
squared_errors = (y_test - y_pref) ** 2
mse = np.mean(squared_errors)  # mean squared error
mse ** 0.5

49318.95116212646

In [149]:
# Row counts of the test and training feature matrices.
X_test.shape[0], X.shape[0]

(4087, 16346)