In [1]:
# IMPORT

import pandas as pd
import pylab as plt
import seaborn as sns
import numpy as np
import h2o

from sklearn.model_selection import train_test_split
from h2o.automl import H2OAutoML
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2

import warnings
warnings.filterwarnings('ignore')

In [2]:
# LINEAR REGRESSION MODELS

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
lasso=Lasso()
ridge=Ridge()
elastic=ElasticNet()
le=LabelEncoder

from sklearn.svm import SVR
svr=SVR()

from sklearn.ensemble import RandomForestRegressor as RFR  
from sklearn.tree import ExtraTreeRegressor as ETR
rfr=RFR()
etr=ETR()

from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR
from catboost import CatBoostRegressor as CTR
from lightgbm import LGBMRegressor as LGBMR
gbr=GBR()
xgbr=XGBR()
ctr=CTR()
lgbmr=LGBMR()

In [3]:
# LOAD CSVs

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample = pd.read_csv('../data/sample.csv')

In [4]:
train1 = train.copy()
test1 = test.copy()
sample1 = sample.copy()

In [5]:
le = LabelEncoder()

for c in train1.columns:
    
    if c=='host_location' or c=='host_verifications':
        
        le.fit(train1[c].astype(str))
        
        train1[c]=le.transform(train1[c].astype(str))

In [6]:
train1=pd.get_dummies(train1, columns=['property_type'], drop_first=True)
lista = [col for col in train1.columns if 'property_type' in col]
train1[lista] = train1[lista].astype(float)

train1=pd.get_dummies(train1, columns=['bathrooms_text'], drop_first=True)
lista = [col for col in train1.columns if 'bathrooms_text' in col]
train1[lista] = train1[lista].astype(float)

train1=pd.get_dummies(train1, columns=['neighbourhood_cleansed'], drop_first=True)
lista = [col for col in train1.columns if 'neighbourhood_cleansed' in col]
train1[lista] = train1[lista].astype(float)

train1=pd.get_dummies(train1, columns=['host_neighbourhood'], drop_first=True)
lista = [col for col in train1.columns if 'host_neighbourhood' in col]
train1[lista] = train1[lista].astype(float)

'''train1=pd.get_dummies(train1, columns=['amenities'], drop_first=True)
lista = [col for col in train1.columns if 'amenities' in col]
train1[lista] = train1[lista].astype(float)'''

"train1=pd.get_dummies(train1, columns=['amenities'], drop_first=True)\nlista = [col for col in train1.columns if 'amenities' in col]\ntrain1[lista] = train1[lista].astype(float)"

In [7]:
def trans(train1):    
    
    if 'price' in train1.columns:
    
        # CHANGE PRICE COLUMN POSITION

        aux = train1.price

        train1 = train1.drop('price',axis=1)

        train1.insert(1, 'price', aux)

        train1 = train1[train1.price<=6000]

    # TRANSFORM COLUMNS

    train1.host_acceptance_rate = train1.host_acceptance_rate.str.replace('%','').astype(float)/100

    train1.host_response_rate = train1.host_response_rate.str.replace('%','').astype(float)/100

    # DROP TRAIN COLUMNS

    col_importance = ['property_type','neighbourhood_cleansed','host_verifications','host_neighbourhood',
                      'availability_365','longitude','bedrooms','accommodates','host_acceptance_rate','host_location',
                      'minimum_maximum_nights','availability_90','review_scores_value','host_total_listings_count',
                      'availability_30','number_of_reviews','host_id','first_review','bathrooms_text','host_response_rate']

    col_classification = [c for c in train1.columns if np.dtype(train1[c]) is np.dtype(np.object)]

    col_noutil = ['listing_url','scrape_id','last_scraped','name','description','neighborhood_overview','picture_url',
                  'host_url']

    train1 = train1.dropna(axis=1, how='all')
    train1 = train1.drop(col_noutil, axis=1)
    train1 = train1.drop(columns=[c for c in train1 if c in col_classification and c!='price' and c!='host_acceptance_rate'
                                  and c!='host_response_rate'], axis=1)
    # No correlación con price
    train1 = train1.drop(columns='id', axis=1)
    # >0.9 Correlación con otras columnas
    train1 = train1.drop('minimum_nights', axis=1)
    train1 = train1.drop('minimum_minimum_nights', axis=1)
    train1 = train1.drop('minimum_nights_avg_ntm', axis=1)
    train1 = train1.drop('availability_90', axis=1)
    train1 = train1.drop('availability_60', axis=1)
    train1 = train1.drop('host_total_listings_count', axis=1)
    train1 = train1.drop('host_listings_count', axis=1)

    # FILLNA columnas numéricas con sus medias

    col_somenull = [c for c in train1.columns if (train1[c].isnull().sum()>0)]

    for c in col_somenull:

        # Probar a cambiar mean por mode o median
        train1[c].fillna(round(train1[c].median()), inplace=True)
        
    return train1

train1 = trans(train1)

In [8]:
le = LabelEncoder()

for c in test1.columns:
    
    if c=='host_location' or c=='host_verifications':
        
        le.fit(test1[c].astype(str))
        
        test1[c]=le.transform(test1[c].astype(str))

test1=pd.get_dummies(test1, columns=['property_type'], drop_first=True)
lista = [col for col in test1.columns if 'property_type' in col]
test1[lista] = test1[lista].astype(float)

test1=pd.get_dummies(test1, columns=['host_neighbourhood'], drop_first=True)
lista = [col for col in test1.columns if 'host_neighbourhood' in col]
test1[lista] = test1[lista].astype(float)

test1=pd.get_dummies(test1, columns=['bathrooms_text'], drop_first=True)
lista = [col for col in test1.columns if 'bathrooms_text' in col]
test1[lista] = test1[lista].astype(float)

test1=pd.get_dummies(test1, columns=['neighbourhood_cleansed'], drop_first=True)
lista = [col for col in test1.columns if 'neighbourhood_cleansed' in col]
test1[lista] = test1[lista].astype(float)

test1 = trans(test1)

In [9]:
'''# TRANSFORM COLUMNS

test1.host_acceptance_rate = test1.host_acceptance_rate.str.replace('%','').astype(float)/100

test1.host_response_rate = test1.host_response_rate.str.replace('%','').astype(float)/100

# DROP TRAIN COLUMNS

col_importance = ['price','property_type','neighbourhood_cleansed','host_verifications','host_neighbourhood',
                  'availability_365','longitude','bedrooms','accommodates','host_acceptance_rate','host_location',
                  'minimum_maximum_nights','availability_90','review_scores_value','host_total_listings_count',
                  'availability_30','number_of_reviews','host_id','first_review','bathrooms_text','host_response_rate']

col_classification = [c for c in test1.columns if np.dtype(test1[c]) is np.dtype(np.object)]

col_noutil = ['listing_url','scrape_id','last_scraped','name','description','neighborhood_overview','picture_url',
              'host_url']

test1 = test1.dropna(axis=1, how='all')
test1 = test1.drop(col_noutil, axis=1)
test1 = test1.drop(columns=[c for c in test1 if c in col_classification and c!='price' and c!='host_acceptance_rate'
                              and c!='host_response_rate'], axis=1)
# Sin correlación con price
test1 = test1.drop(columns='id', axis=1)
# Mucha correlación con otras columnas
test1 = test1.drop('minimum_nights', axis=1)
test1 = test1.drop('minimum_minimum_nights', axis=1)
test1 = test1.drop('minimum_nights_avg_ntm', axis=1)
test1 = test1.drop('availability_90', axis=1)
test1 = test1.drop('availability_60', axis=1)

# FILLNA columnas numéricas con sus medias

col_somenull = [c for c in test1.columns if test1[c].isnull().sum()>0]

for c in col_somenull:
    
    # Probar a cambiar mean por mode o median
    test1[c].fillna(round(test1[c].median()), inplace=True)'''

"# TRANSFORM COLUMNS\n\ntest1.host_acceptance_rate = test1.host_acceptance_rate.str.replace('%','').astype(float)/100\n\ntest1.host_response_rate = test1.host_response_rate.str.replace('%','').astype(float)/100\n\n# DROP TRAIN COLUMNS\n\ncol_importance = ['price','property_type','neighbourhood_cleansed','host_verifications','host_neighbourhood',\n                  'availability_365','longitude','bedrooms','accommodates','host_acceptance_rate','host_location',\n                  'minimum_maximum_nights','availability_90','review_scores_value','host_total_listings_count',\n                  'availability_30','number_of_reviews','host_id','first_review','bathrooms_text','host_response_rate']\n\ncol_classification = [c for c in test1.columns if np.dtype(test1[c]) is np.dtype(np.object)]\n\ncol_noutil = ['listing_url','scrape_id','last_scraped','name','description','neighborhood_overview','picture_url',\n              'host_url']\n\ntest1 = test1.dropna(axis=1, how='all')\ntest1 = test1.dro

In [10]:
X = train1.drop('price', axis=1)

y = train1.price

X.shape, y.shape

((4165, 176), (4165,))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, train_size=0.8,
                                                    random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3332, 176), (833, 176), (3332,), (833,))

In [12]:
# RMSE POR MODELO
# modelo que da error: lgbmr
models = [lasso, ridge, elastic, svr, rfr, etr, gbr, xgbr, ctr]

for m in models:
    
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    
    print(f'RMSE Model {m}: {mse(y_test, y_pred, squared=False)}')
    print(f'R2 Model {m}: {r2(y_test, y_pred)}')

RMSE Model Lasso(): 80.12018312076914
R2 Model Lasso(): 0.5138856827877281
RMSE Model Ridge(): 77.30762001562303
R2 Model Ridge(): 0.5474160458747346
RMSE Model ElasticNet(): 84.20921773585764
R2 Model ElasticNet(): 0.4630005860339582
RMSE Model SVR(): 118.86979896430756
R2 Model SVR(): -0.07003406096333276
RMSE Model RandomForestRegressor(): 75.72997589600489
R2 Model RandomForestRegressor(): 0.5656996463829149
RMSE Model ExtraTreeRegressor(): 106.94163176724963
R2 Model ExtraTreeRegressor(): 0.13393964482384524
RMSE Model GradientBoostingRegressor(): 78.09025284719479
R2 Model GradientBoostingRegressor(): 0.5382060872703339
RMSE Model XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
        

128:	learn: 73.7902844	total: 2.11s	remaining: 14.2s
129:	learn: 73.7045677	total: 2.12s	remaining: 14.2s
130:	learn: 73.5901748	total: 2.13s	remaining: 14.1s
131:	learn: 73.5183301	total: 2.14s	remaining: 14.1s
132:	learn: 73.3953440	total: 2.15s	remaining: 14s
133:	learn: 73.3382306	total: 2.17s	remaining: 14s
134:	learn: 73.2746326	total: 2.18s	remaining: 14s
135:	learn: 73.1301339	total: 2.19s	remaining: 13.9s
136:	learn: 73.0340461	total: 2.2s	remaining: 13.9s
137:	learn: 72.9162097	total: 2.21s	remaining: 13.8s
138:	learn: 72.8991482	total: 2.23s	remaining: 13.8s
139:	learn: 72.8299557	total: 2.24s	remaining: 13.8s
140:	learn: 72.7529800	total: 2.26s	remaining: 13.8s
141:	learn: 72.6525293	total: 2.28s	remaining: 13.8s
142:	learn: 72.4746771	total: 2.31s	remaining: 13.8s
143:	learn: 72.4075578	total: 2.34s	remaining: 13.9s
144:	learn: 72.3712279	total: 2.36s	remaining: 13.9s
145:	learn: 72.2689508	total: 2.38s	remaining: 13.9s
146:	learn: 72.2364264	total: 2.42s	remaining: 14s
14

285:	learn: 61.9920926	total: 4.07s	remaining: 10.2s
286:	learn: 61.9326465	total: 4.08s	remaining: 10.1s
287:	learn: 61.9261512	total: 4.09s	remaining: 10.1s
288:	learn: 61.8367694	total: 4.1s	remaining: 10.1s
289:	learn: 61.7883017	total: 4.11s	remaining: 10.1s
290:	learn: 61.7816689	total: 4.11s	remaining: 10s
291:	learn: 61.7755258	total: 4.12s	remaining: 10s
292:	learn: 61.6989807	total: 4.13s	remaining: 9.97s
293:	learn: 61.6928019	total: 4.14s	remaining: 9.94s
294:	learn: 61.6869681	total: 4.15s	remaining: 9.92s
295:	learn: 61.5950885	total: 4.16s	remaining: 9.89s
296:	learn: 61.5185634	total: 4.17s	remaining: 9.87s
297:	learn: 61.4715331	total: 4.18s	remaining: 9.84s
298:	learn: 61.3764440	total: 4.19s	remaining: 9.82s
299:	learn: 61.3213202	total: 4.2s	remaining: 9.79s
300:	learn: 61.2674817	total: 4.21s	remaining: 9.77s
301:	learn: 61.2213510	total: 4.21s	remaining: 9.74s
302:	learn: 61.1415891	total: 4.22s	remaining: 9.72s
303:	learn: 61.0820560	total: 4.24s	remaining: 9.7s


445:	learn: 54.1748748	total: 5.77s	remaining: 7.16s
446:	learn: 54.1370372	total: 5.78s	remaining: 7.15s
447:	learn: 54.0901427	total: 5.79s	remaining: 7.13s
448:	learn: 54.0870711	total: 5.79s	remaining: 7.11s
449:	learn: 54.0531462	total: 5.8s	remaining: 7.09s
450:	learn: 54.0187842	total: 5.81s	remaining: 7.08s
451:	learn: 53.9785229	total: 5.82s	remaining: 7.06s
452:	learn: 53.9404263	total: 5.83s	remaining: 7.04s
453:	learn: 53.9374322	total: 5.84s	remaining: 7.03s
454:	learn: 53.9159434	total: 5.85s	remaining: 7.01s
455:	learn: 53.8882569	total: 5.86s	remaining: 6.99s
456:	learn: 53.8703987	total: 5.87s	remaining: 6.97s
457:	learn: 53.8255555	total: 5.88s	remaining: 6.96s
458:	learn: 53.7823234	total: 5.89s	remaining: 6.94s
459:	learn: 53.7131333	total: 5.9s	remaining: 6.92s
460:	learn: 53.6520488	total: 5.91s	remaining: 6.91s
461:	learn: 53.5954921	total: 5.92s	remaining: 6.9s
462:	learn: 53.5007990	total: 5.93s	remaining: 6.88s
463:	learn: 53.4623082	total: 5.94s	remaining: 6.

609:	learn: 48.1186753	total: 7.73s	remaining: 4.94s
610:	learn: 48.1122558	total: 7.74s	remaining: 4.93s
611:	learn: 48.0840715	total: 7.75s	remaining: 4.92s
612:	learn: 48.0477793	total: 7.77s	remaining: 4.9s
613:	learn: 47.9753764	total: 7.78s	remaining: 4.89s
614:	learn: 47.9516095	total: 7.79s	remaining: 4.87s
615:	learn: 47.9272223	total: 7.79s	remaining: 4.86s
616:	learn: 47.9043792	total: 7.8s	remaining: 4.84s
617:	learn: 47.8884809	total: 7.81s	remaining: 4.83s
618:	learn: 47.8863947	total: 7.82s	remaining: 4.81s
619:	learn: 47.8236519	total: 7.83s	remaining: 4.8s
620:	learn: 47.8197547	total: 7.84s	remaining: 4.79s
621:	learn: 47.7639472	total: 7.85s	remaining: 4.77s
622:	learn: 47.7510845	total: 7.86s	remaining: 4.76s
623:	learn: 47.7241330	total: 7.87s	remaining: 4.74s
624:	learn: 47.6841106	total: 7.88s	remaining: 4.73s
625:	learn: 47.6495825	total: 7.89s	remaining: 4.71s
626:	learn: 47.6218623	total: 7.9s	remaining: 4.7s
627:	learn: 47.5807485	total: 7.91s	remaining: 4.68

769:	learn: 43.6486333	total: 9.68s	remaining: 2.89s
770:	learn: 43.6097718	total: 9.7s	remaining: 2.88s
771:	learn: 43.5683212	total: 9.72s	remaining: 2.87s
772:	learn: 43.5267493	total: 9.73s	remaining: 2.86s
773:	learn: 43.5051593	total: 9.74s	remaining: 2.85s
774:	learn: 43.4856299	total: 9.76s	remaining: 2.83s
775:	learn: 43.4512651	total: 9.77s	remaining: 2.82s
776:	learn: 43.4319727	total: 9.77s	remaining: 2.81s
777:	learn: 43.4054808	total: 9.78s	remaining: 2.79s
778:	learn: 43.3942103	total: 9.79s	remaining: 2.78s
779:	learn: 43.3708727	total: 9.8s	remaining: 2.76s
780:	learn: 43.3615061	total: 9.81s	remaining: 2.75s
781:	learn: 43.3285833	total: 9.82s	remaining: 2.74s
782:	learn: 43.3158759	total: 9.83s	remaining: 2.72s
783:	learn: 43.2790008	total: 9.84s	remaining: 2.71s
784:	learn: 43.2516416	total: 9.85s	remaining: 2.7s
785:	learn: 43.2233283	total: 9.86s	remaining: 2.68s
786:	learn: 43.2147739	total: 9.86s	remaining: 2.67s
787:	learn: 43.2013401	total: 9.88s	remaining: 2.

933:	learn: 39.8314754	total: 11.6s	remaining: 822ms
934:	learn: 39.8228879	total: 11.7s	remaining: 810ms
935:	learn: 39.7783914	total: 11.7s	remaining: 797ms
936:	learn: 39.7727264	total: 11.7s	remaining: 785ms
937:	learn: 39.7405661	total: 11.7s	remaining: 772ms
938:	learn: 39.7033145	total: 11.7s	remaining: 760ms
939:	learn: 39.6657273	total: 11.7s	remaining: 747ms
940:	learn: 39.6599178	total: 11.7s	remaining: 735ms
941:	learn: 39.6388403	total: 11.7s	remaining: 723ms
942:	learn: 39.6228140	total: 11.8s	remaining: 711ms
943:	learn: 39.6076032	total: 11.8s	remaining: 699ms
944:	learn: 39.5795681	total: 11.8s	remaining: 686ms
945:	learn: 39.5616679	total: 11.8s	remaining: 674ms
946:	learn: 39.5473100	total: 11.8s	remaining: 662ms
947:	learn: 39.5230007	total: 11.8s	remaining: 649ms
948:	learn: 39.5164560	total: 11.8s	remaining: 636ms
949:	learn: 39.5021629	total: 11.9s	remaining: 624ms
950:	learn: 39.4933899	total: 11.9s	remaining: 611ms
951:	learn: 39.4590583	total: 11.9s	remaining:

In [None]:
'''models = [lasso, ridge, elastic, svr, rfr, etr, gbr, xgbr, lgbmr, ctr]

for m in models:
    
    m.fit(X, y)
    
    y_pred = m.predict(X)
    
    print(f'RMSE Model {m}: {mse(y, y_pred, squared=False)}')'''

In [None]:
'''X = train1.drop('price', axis=1)

y = train1.price

lasso.fit(X, y)

y_pred = lasso.predict(test1)

sample1.price = y_pred

sample1.to_csv('../data/predictions.csv', index=False)

sample1'''

In [None]:
# H2O

h2o.init()

# parsear datos para h20

h2train=h2o.H2OFrame(train1)

h2test=h2o.H2OFrame(test1)

X=[c for c in h2train.columns if c!='price']

y='price'

# inicia auto-machine-learning

automl=H2OAutoML(max_models=50,
                 seed=42,
                 max_runtime_secs=300,
                 sort_metric='RMSE')

In [None]:
# entrena

automl.train(x=X,
             y=y,
             training_frame=h2train)

In [None]:
# prediciones del lider

y_pred = automl.leader.predict(h2test)

y_pred = y_pred.as_data_frame()

sample1.price = y_pred

sample1.to_csv('../data/predictions.csv', index=False)

sample1

In [None]:
# buscando colinealidad

plt.figure(figsize=(200, 200))

sns.set(style='white')

mask=np.triu(np.ones_like(train1.corr(), dtype=bool))

cmap=sns.diverging_palette(0, 10, as_cmap=True)

sns.heatmap(train1.corr(),
            mask=mask,
            cmap=cmap,
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={'shrink': 0.5},
            annot=True);

In [None]:
train1.corr()

In [None]:
train.corr().price