In [1]:
# IMPORT

import pandas as pd
import pylab as plt
import seaborn as sns
import numpy as np
import h2o

from sklearn.model_selection import train_test_split
from h2o.automl import H2OAutoML
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

import warnings
warnings.filterwarnings('ignore')

In [2]:
# LINEAR REGRESSION MODELS

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
lasso=Lasso()
ridge=Ridge()
elastic=ElasticNet()
le=LabelEncoder

from sklearn.svm import SVR
svr=SVR()

from sklearn.ensemble import RandomForestRegressor as RFR  
from sklearn.tree import ExtraTreeRegressor as ETR
rfr=RFR()
etr=ETR()

from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR
from catboost import CatBoostRegressor as CTR
from lightgbm import LGBMRegressor as LGBMR
gbr=GBR()
xgbr=XGBR()
ctr=CTR()
lgbmr=LGBMR()

In [3]:
# LOAD CSVs

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample = pd.read_csv('../data/sample.csv')

In [4]:
train1 = train.copy()
test1 = test.copy()
sample1 = sample.copy()

In [5]:
'''le = LabelEncoder()

for c in train1.columns:
    
    if c=='host_location' or c=='host_verifications':
        
        le.fit(train1[c].astype(str))
        
        train1[c]=le.transform(train1[c].astype(str))'''

"le = LabelEncoder()\n\nfor c in train1.columns:\n    \n    if c=='host_location' or c=='host_verifications':\n        \n        le.fit(train1[c].astype(str))\n        \n        train1[c]=le.transform(train1[c].astype(str))"

In [6]:
train1=pd.get_dummies(train1, columns=['property_type'], drop_first=True)
lista = [col for col in train1.columns if 'property_type' in col]
train1[lista] = train1[lista].astype(float)

train1=pd.get_dummies(train1, columns=['bathrooms_text'], drop_first=True)
lista = [col for col in train1.columns if 'bathrooms_text' in col]
train1[lista] = train1[lista].astype(float)

train1=pd.get_dummies(train1, columns=['neighbourhood_cleansed'], drop_first=True)
lista = [col for col in train1.columns if 'neighbourhood_cleansed' in col]
train1[lista] = train1[lista].astype(float)

train1=pd.get_dummies(train1, columns=['host_neighbourhood'], drop_first=True)
lista = [col for col in train1.columns if 'host_neighbourhood' in col]
train1[lista] = train1[lista].astype(float)

In [7]:
def trans(train1):    
    
    if 'price' in train1.columns:
    
        # CHANGE PRICE COLUMN POSITION

        aux = train1.price

        train1 = train1.drop('price',axis=1)

        train1.insert(1, 'price', aux)

        train1 = train1[train1.price<=6000]

    # TRANSFORM COLUMNS

    train1.host_acceptance_rate = train1.host_acceptance_rate.str.replace('%','').astype(float)/100

    train1.host_response_rate = train1.host_response_rate.str.replace('%','').astype(float)/100

    # DROP TRAIN COLUMNS

    col_importance = ['property_type','neighbourhood_cleansed','host_verifications','host_neighbourhood',
                      'availability_365','longitude','bedrooms','accommodates','host_acceptance_rate','host_location',
                      'minimum_maximum_nights','availability_90','review_scores_value','host_total_listings_count',
                      'availability_30','number_of_reviews','host_id','first_review','bathrooms_text','host_response_rate']

    col_classification = [c for c in train1.columns if np.dtype(train1[c]) is np.dtype(np.object)]

    col_noutil = ['listing_url','scrape_id','last_scraped','name','description','neighborhood_overview','picture_url',
                  'host_url']

    train1 = train1.dropna(axis=1, how='all')
    train1 = train1.drop(col_noutil, axis=1)
    train1 = train1.drop(columns=[c for c in train1 if c in col_classification and c!='price' and c!='host_acceptance_rate'
                                  and c!='host_response_rate'], axis=1)
    # No correlación con price
    train1 = train1.drop(columns='id', axis=1)
    # >0.9 Correlación con otras columnas
    train1 = train1.drop('minimum_nights', axis=1)
    train1 = train1.drop('minimum_minimum_nights', axis=1)
    train1 = train1.drop('minimum_nights_avg_ntm', axis=1)
    train1 = train1.drop('availability_90', axis=1)
    train1 = train1.drop('availability_60', axis=1)
    train1 = train1.drop('host_total_listings_count', axis=1)
    train1 = train1.drop('host_listings_count', axis=1)

    # FILLNA columnas numéricas con sus medias

    col_somenull = [c for c in train1.columns if (train1[c].isnull().sum()>0)]

    for c in col_somenull:

        # Probar a cambiar mean por mode o median
        train1[c].fillna(round(train1[c].median()), inplace=True)
        
    return train1

train1 = trans(train1)

In [8]:
le = LabelEncoder()

for c in test1.columns:
    
    if c=='host_location' or c=='host_verifications':
        
        le.fit(test1[c].astype(str))
        
        test1[c]=le.transform(test1[c].astype(str))

test1=pd.get_dummies(test1, columns=['property_type'], drop_first=True)
lista = [col for col in test1.columns if 'property_type' in col]
test1[lista] = test1[lista].astype(float)

test1=pd.get_dummies(test1, columns=['host_neighbourhood'], drop_first=True)
lista = [col for col in test1.columns if 'host_neighbourhood' in col]
test1[lista] = test1[lista].astype(float)

test1=pd.get_dummies(test1, columns=['bathrooms_text'], drop_first=True)
lista = [col for col in test1.columns if 'bathrooms_text' in col]
test1[lista] = test1[lista].astype(float)

test1=pd.get_dummies(test1, columns=['neighbourhood_cleansed'], drop_first=True)
lista = [col for col in test1.columns if 'neighbourhood_cleansed' in col]
test1[lista] = test1[lista].astype(float)

test1 = trans(test1)

In [9]:
X = train1.drop('price', axis=1)

y = train1.price

X.shape, y.shape

((4165, 174), (4165,))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, train_size=0.8,
                                                    random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3332, 174), (833, 174), (3332,), (833,))

In [11]:
# RMSE POR MODELO
# modelo que da error: lgbmr
models = [lasso, ridge, elastic, svr, rfr, etr, gbr, xgbr, ctr]

for m in models:
    
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    
    print(f'RMSE Model {m}: {mse(y_test, y_pred, squared=False)}')
    print(f'R2 Model {m}: {r2(y_test, y_pred)}')

RMSE Model Lasso(): 80.11320207996663
R2 Model Lasso(): 0.5139703914318152
RMSE Model Ridge(): 77.28230108844434
R2 Model Ridge(): 0.5477124478005073
RMSE Model ElasticNet(): 84.2147491349618
R2 Model ElasticNet(): 0.46293003661637044
RMSE Model SVR(): 118.86981399576307
R2 Model SVR(): -0.07003433158161476
RMSE Model RandomForestRegressor(): 76.30351670115387
R2 Model RandomForestRegressor(): 0.559096390302855
RMSE Model ExtraTreeRegressor(): 118.13915838661505
R2 Model ExtraTreeRegressor(): -0.05692042590165158
RMSE Model GradientBoostingRegressor(): 77.88387007384091
R2 Model GradientBoostingRegressor(): 0.5406437888689088
RMSE Model XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
        

128:	learn: 74.1049051	total: 4.06s	remaining: 27.4s
129:	learn: 73.9571651	total: 4.09s	remaining: 27.4s
130:	learn: 73.8666944	total: 4.1s	remaining: 27.2s
131:	learn: 73.7526871	total: 4.11s	remaining: 27s
132:	learn: 73.6782195	total: 4.12s	remaining: 26.9s
133:	learn: 73.6242863	total: 4.14s	remaining: 26.8s
134:	learn: 73.4980273	total: 4.16s	remaining: 26.7s
135:	learn: 73.4198908	total: 4.18s	remaining: 26.6s
136:	learn: 73.3810349	total: 4.19s	remaining: 26.4s
137:	learn: 73.2289280	total: 4.21s	remaining: 26.3s
138:	learn: 73.1761732	total: 4.23s	remaining: 26.2s
139:	learn: 73.1455058	total: 4.25s	remaining: 26.1s
140:	learn: 73.0271159	total: 4.26s	remaining: 26s
141:	learn: 72.8816177	total: 4.28s	remaining: 25.9s
142:	learn: 72.8128984	total: 4.29s	remaining: 25.7s
143:	learn: 72.6870854	total: 4.31s	remaining: 25.6s
144:	learn: 72.6140384	total: 4.32s	remaining: 25.5s
145:	learn: 72.5135693	total: 4.33s	remaining: 25.4s
146:	learn: 72.4290146	total: 4.35s	remaining: 25.2

284:	learn: 62.3133746	total: 9.94s	remaining: 24.9s
285:	learn: 62.2471420	total: 10s	remaining: 25s
286:	learn: 62.2352934	total: 10.1s	remaining: 25s
287:	learn: 62.1742454	total: 10.1s	remaining: 25s
288:	learn: 62.0976316	total: 10.2s	remaining: 25.1s
289:	learn: 62.0854574	total: 10.3s	remaining: 25.2s
290:	learn: 62.0080415	total: 10.4s	remaining: 25.3s
291:	learn: 61.9961535	total: 10.4s	remaining: 25.3s
292:	learn: 61.9870851	total: 10.6s	remaining: 25.6s
293:	learn: 61.8579652	total: 10.7s	remaining: 25.8s
294:	learn: 61.7920725	total: 10.8s	remaining: 25.7s
295:	learn: 61.7305508	total: 10.8s	remaining: 25.8s
296:	learn: 61.7236415	total: 10.9s	remaining: 25.8s
297:	learn: 61.6554043	total: 10.9s	remaining: 25.7s
298:	learn: 61.5806406	total: 11s	remaining: 25.8s
299:	learn: 61.5276184	total: 11s	remaining: 25.8s
300:	learn: 61.5151170	total: 11.1s	remaining: 25.7s
301:	learn: 61.4534277	total: 11.1s	remaining: 25.7s
302:	learn: 61.3849253	total: 11.2s	remaining: 25.7s
303:	

441:	learn: 55.1512506	total: 21.3s	remaining: 26.9s
442:	learn: 55.1260221	total: 21.4s	remaining: 27s
443:	learn: 55.1088554	total: 21.5s	remaining: 27s
444:	learn: 55.0580786	total: 21.6s	remaining: 26.9s
445:	learn: 54.9960614	total: 21.7s	remaining: 27s
446:	learn: 54.9575133	total: 21.7s	remaining: 26.9s
447:	learn: 54.9226891	total: 21.8s	remaining: 26.8s
448:	learn: 54.9195218	total: 21.8s	remaining: 26.8s
449:	learn: 54.8779360	total: 21.9s	remaining: 26.8s
450:	learn: 54.8195845	total: 21.9s	remaining: 26.7s
451:	learn: 54.7697380	total: 22s	remaining: 26.7s
452:	learn: 54.7305531	total: 22.1s	remaining: 26.7s
453:	learn: 54.6785153	total: 22.1s	remaining: 26.6s
454:	learn: 54.6065059	total: 22.2s	remaining: 26.6s
455:	learn: 54.5754511	total: 22.3s	remaining: 26.6s
456:	learn: 54.5549459	total: 22.3s	remaining: 26.6s
457:	learn: 54.4969567	total: 22.4s	remaining: 26.5s
458:	learn: 54.4721385	total: 22.5s	remaining: 26.5s
459:	learn: 54.4214919	total: 22.6s	remaining: 26.5s
4

598:	learn: 49.5757372	total: 29.4s	remaining: 19.7s
599:	learn: 49.5622618	total: 29.5s	remaining: 19.6s
600:	learn: 49.5554901	total: 29.5s	remaining: 19.6s
601:	learn: 49.5015616	total: 29.6s	remaining: 19.5s
602:	learn: 49.4474944	total: 29.6s	remaining: 19.5s
603:	learn: 49.4003346	total: 29.6s	remaining: 19.4s
604:	learn: 49.3886849	total: 29.6s	remaining: 19.3s
605:	learn: 49.3543172	total: 29.7s	remaining: 19.3s
606:	learn: 49.3146612	total: 29.7s	remaining: 19.2s
607:	learn: 49.2790232	total: 29.7s	remaining: 19.2s
608:	learn: 49.2181020	total: 29.7s	remaining: 19.1s
609:	learn: 49.1754487	total: 29.8s	remaining: 19s
610:	learn: 49.1696410	total: 29.8s	remaining: 19s
611:	learn: 49.1323010	total: 29.8s	remaining: 18.9s
612:	learn: 49.0994455	total: 29.9s	remaining: 18.9s
613:	learn: 49.0562434	total: 29.9s	remaining: 18.8s
614:	learn: 49.0317891	total: 32.9s	remaining: 20.6s
615:	learn: 49.0179002	total: 32.9s	remaining: 20.5s
616:	learn: 48.9472870	total: 33s	remaining: 20.5s

754:	learn: 44.7598383	total: 39.2s	remaining: 12.7s
755:	learn: 44.7291700	total: 39.3s	remaining: 12.7s
756:	learn: 44.7123479	total: 39.3s	remaining: 12.6s
757:	learn: 44.6846407	total: 39.3s	remaining: 12.6s
758:	learn: 44.6579061	total: 39.4s	remaining: 12.5s
759:	learn: 44.6423897	total: 39.5s	remaining: 12.5s
760:	learn: 44.6341722	total: 39.5s	remaining: 12.4s
761:	learn: 44.6224385	total: 39.5s	remaining: 12.3s
762:	learn: 44.5954314	total: 39.6s	remaining: 12.3s
763:	learn: 44.5844568	total: 39.6s	remaining: 12.2s
764:	learn: 44.5609520	total: 39.7s	remaining: 12.2s
765:	learn: 44.5201775	total: 39.7s	remaining: 12.1s
766:	learn: 44.5056145	total: 39.8s	remaining: 12.1s
767:	learn: 44.4745730	total: 39.8s	remaining: 12s
768:	learn: 44.4642042	total: 39.9s	remaining: 12s
769:	learn: 44.4342880	total: 39.9s	remaining: 11.9s
770:	learn: 44.4125326	total: 40s	remaining: 11.9s
771:	learn: 44.3894805	total: 40s	remaining: 11.8s
772:	learn: 44.3533057	total: 40s	remaining: 11.8s
773

912:	learn: 40.9606818	total: 46.3s	remaining: 4.42s
913:	learn: 40.9208615	total: 46.4s	remaining: 4.37s
914:	learn: 40.8738366	total: 46.4s	remaining: 4.32s
915:	learn: 40.8724577	total: 46.5s	remaining: 4.26s
916:	learn: 40.8680082	total: 46.6s	remaining: 4.22s
917:	learn: 40.8433123	total: 46.7s	remaining: 4.17s
918:	learn: 40.8345213	total: 46.8s	remaining: 4.12s
919:	learn: 40.7915572	total: 46.8s	remaining: 4.07s
920:	learn: 40.7535257	total: 46.9s	remaining: 4.02s
921:	learn: 40.7324986	total: 47s	remaining: 3.97s
922:	learn: 40.7003183	total: 47s	remaining: 3.92s
923:	learn: 40.6946709	total: 47.1s	remaining: 3.87s
924:	learn: 40.6745200	total: 47.2s	remaining: 3.82s
925:	learn: 40.6384693	total: 47.3s	remaining: 3.78s
926:	learn: 40.6346384	total: 47.3s	remaining: 3.72s
927:	learn: 40.6195203	total: 47.3s	remaining: 3.67s
928:	learn: 40.6083090	total: 47.4s	remaining: 3.62s
929:	learn: 40.5972910	total: 47.4s	remaining: 3.57s
930:	learn: 40.5867815	total: 47.5s	remaining: 3.5

In [None]:
'''models = [lasso, ridge, elastic, svr, rfr, etr, gbr, xgbr, lgbmr, ctr]

for m in models:
    
    m.fit(X, y)
    
    y_pred = m.predict(X)
    
    print(f'RMSE Model {m}: {mse(y, y_pred, squared=False)}')'''

In [None]:
'''X = train1.drop('price', axis=1)

y = train1.price

lasso.fit(X, y)

y_pred = lasso.predict(test1)

sample1.price = y_pred

sample1.to_csv('../data/predictions.csv', index=False)

sample1'''

In [None]:
# H2O

h2o.init()

# parsear datos para h20

h2train=h2o.H2OFrame(train1)

h2test=h2o.H2OFrame(test1)

X=[c for c in h2train.columns if c!='price']

y='price'

# inicia auto-machine-learning

automl=H2OAutoML(max_models=50,
                 seed=42,
                 max_runtime_secs=300,
                 sort_metric='RMSE')

In [None]:
# entrena

automl.train(x=X,
             y=y,
             training_frame=h2train)

In [None]:
# prediciones del lider

y_pred = automl.leader.predict(h2test)

y_pred = y_pred.as_data_frame()

sample1.price = y_pred

sample1.to_csv('../data/predictions.csv', index=False)

sample1

In [None]:
# buscando colinealidad

plt.figure(figsize=(200, 200))

sns.set(style='white')

mask=np.triu(np.ones_like(train1.corr(), dtype=bool))

cmap=sns.diverging_palette(0, 10, as_cmap=True)

sns.heatmap(train1.corr(),
            mask=mask,
            cmap=cmap,
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={'shrink': 0.5},
            annot=True);

In [None]:
train1.corr()

In [None]:
train.corr().price