In [93]:
from pandas import read_csv 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.feature_selection import SequentialFeatureSelector as sfs



from scripts.pipeline_preprocess import pipeline_model

In [94]:
# Load the cleaned Madrid housing CSV (the path is relative to the notebook's
# working directory) and preview its first row.
dataframe = read_csv(filepath_or_buffer='../data/clean/houses_madrid_cleaned.csv')
dataframe.head(n=1)

Unnamed: 0,sq_mt_built,n_rooms,n_bathrooms,floor,is_floor_under,rent_price,buy_price,buy_price_by_area,house_type_id,is_renewal_needed,is_new_development,is_exterior,energy_certificate,has_parking
0,64.0,2,1.0,3,False,471,85000,1328,HouseType 1: Pisos,False,False,True,D,False


# Features 
To improve model performance, we first need to analyse the features.  
We'll start by creating new features that could be relevant.

In [95]:
# n_rooms is treated here as NOT already including bathrooms, so both counts are summed.
# Uncomment the line below to inspect rows with exactly one room and one bathroom:
# print(dataframe[(dataframe["n_rooms"] == 1) & (dataframe["n_bathrooms"] == 1)])

# Derived features, added to `dataframe` as new columns.
dataframe['n_total_rooms'] = dataframe['n_rooms'].add(dataframe['n_bathrooms'])
# NOTE(review): the two ratios below are computed from buy_price, which a later
# cell uses as the prediction target — confirm this does not leak the target.
dataframe['coef_buy_price_by_era'] = dataframe['buy_price'].div(dataframe['buy_price_by_area'])
dataframe['buy_price_per_mt_built'] = dataframe['buy_price'].div(dataframe['sq_mt_built'])
dataframe.head(n=5)

Unnamed: 0,sq_mt_built,n_rooms,n_bathrooms,floor,is_floor_under,rent_price,buy_price,buy_price_by_area,house_type_id,is_renewal_needed,is_new_development,is_exterior,energy_certificate,has_parking,n_total_rooms,coef_buy_price_by_era,buy_price_per_mt_built
0,64.0,2,1.0,3,False,471,85000,1328,HouseType 1: Pisos,False,False,True,D,False,3.0,64.006024,1328.125
1,70.0,3,1.0,4,False,666,129900,1856,HouseType 1: Pisos,True,False,True,en trámite,False,4.0,69.989224,1855.714286
2,94.0,2,2.0,1,False,722,144247,1535,HouseType 1: Pisos,False,False,True,no indicado,False,4.0,93.971987,1534.542553
3,64.0,2,1.0,Bajo,True,583,109900,1717,HouseType 1: Pisos,False,False,True,en trámite,False,3.0,64.006989,1717.1875
4,108.0,2,2.0,4,False,1094,260000,2407,HouseType 1: Pisos,False,False,True,en trámite,True,4.0,108.01828,2407.407407


### Analyse correlation of numerical features
To identify the most interesting features, we can first look at their correlation with the target feature (`buy_price`).

In [96]:
# Keep only the columns stored as int64 or float64; any other dtype is left out.
NUMERICAL_COLUMNS = [
    name
    for name, dtype in dataframe.dtypes.items()
    if dtype in ('int64', 'float64')
]
# Pairwise correlation between the numerical columns.
correlation_matrix = dataframe.loc[:, NUMERICAL_COLUMNS].corr()
# Correlation of every numerical column with buy_price, strongest first.
correlation_matrix.loc[:, 'buy_price'].sort_values(ascending=False)

buy_price                 1.000000
coef_buy_price_by_era     0.814749
sq_mt_built               0.814737
n_bathrooms               0.731769
n_total_rooms             0.655744
buy_price_per_mt_built    0.633759
buy_price_by_area         0.633745
rent_price                0.615957
n_rooms                   0.481796
Name: buy_price, dtype: float64

In [97]:
# For each derived feature, print its correlation with every numerical column,
# strongest first, to spot columns that carry the same information.
for derived_feature in ('coef_buy_price_by_era', 'n_total_rooms', 'buy_price_per_mt_built'):
    print(correlation_matrix[derived_feature].sort_values(ascending=False))

coef_buy_price_by_era     1.000000
sq_mt_built               0.999996
n_total_rooms             0.816346
buy_price                 0.814749
n_bathrooms               0.789000
n_rooms                   0.689429
rent_price                0.510968
buy_price_per_mt_built    0.140836
buy_price_by_area         0.140822
Name: coef_buy_price_by_era, dtype: float64
n_total_rooms             1.000000
n_rooms                   0.925321
n_bathrooms               0.856546
sq_mt_built               0.816351
coef_buy_price_by_era     0.816346
buy_price                 0.655744
rent_price                0.447866
buy_price_by_area         0.061241
buy_price_per_mt_built    0.061235
Name: n_total_rooms, dtype: float64
buy_price_per_mt_built    1.000000
buy_price_by_area         0.999994
buy_price                 0.633759
rent_price                0.521169
n_bathrooms               0.249184
coef_buy_price_by_era     0.140836
sq_mt_built               0.140828
n_total_rooms             0.061235
n_rooms   

In [98]:
# Drop the columns judged redundant after the correlation analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the columns are gone no longer raises KeyError.
dataframe = dataframe.drop(
    columns=['n_rooms', 'n_total_rooms', 'sq_mt_built', 'buy_price_by_area'],
    errors='ignore',
)

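As we can see, most of the numerical features are fairly strongly correlated with the buy price, generally with a correlation above 0.6 (`n_rooms`, at about 0.48, is the exception).  
Also, the created features are almost perfectly correlated with existing ones (`coef_buy_price_by_era` with `sq_mt_built`, and `buy_price_per_mt_built` with `buy_price_by_area`), so we can drop the redundant columns.  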
*** 
### Forward feature selection
To optimise the set of selected features, we're going to look into feature selection methods.

In [99]:
# Separate the target (buy_price) from the features.
# The guard keeps the cell re-runnable: once buy_price has been split off,
# running this line a second time would otherwise raise KeyError.
if 'buy_price' in dataframe.columns:
    dataframe, buy_price = dataframe.drop(columns=['buy_price']), dataframe['buy_price']

# Hold out 30% of the rows for testing; random_state makes the split reproducible.
dataframe_train, dataframe_test, buy_price_train, buy_price_test = train_test_split(
    dataframe, buy_price, test_size=0.3, random_state=40
)

# Build the preprocessing pipeline and fit it on the training split only.
# NOTE(review): pipeline_model is a project helper whose steps are not visible
# here — confirm what it does in scripts.pipeline_preprocess.
pipeline = pipeline_model(dataframe_train)
dataframe_train = pipeline.fit_transform(dataframe_train)

# Sequential feature selection: keep the 2 features that give the best
# negative mean squared error for each candidate estimator.
# NOTE(review): the selector is fitted on the transformed matrix, so it reports
# positional names such as 'x2'; presumably these can be mapped back through the
# pipeline's output feature names if it exposes them — TODO confirm.
for model in [LinearRegression]:
    feature_selection = sfs(model(), n_features_to_select=2, scoring='neg_mean_squared_error')
    feature_selection.fit(dataframe_train, buy_price_train)
    print(model, feature_selection.get_feature_names_out())


<class 'sklearn.linear_model._base.LinearRegression'> ['x2' 'x3']


In [100]:
# Display the transformed training features (the result of pipeline.fit_transform above).
dataframe_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 74893 stored elements and shape (10699, 30)>