In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
#import modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

In [3]:
# Load the training data and derive the feature matrix X and the target y.
df_original = pd.read_csv(
    "train.csv",
    dtype={'tipodepropiedad': 'category',
           'ciudad': 'category',
           'provincia': 'category'},
    parse_dates=[16],
)
df_original['fecha'] = pd.to_datetime(df_original['fecha'])

# Target: the 'precio' column as a plain NumPy array.
y = df_original['precio'].values

# Features: every column except the ones listed here (the target included).
df = df_original.drop(columns=['fecha', 'id', 'titulo', 'descripcion',
                               'direccion', 'lat', 'lng', 'precio']).copy()

# Calendar features taken from the 'fecha' column.
df['mes'] = df_original['fecha'].dt.month
df['anio'] = df_original['fecha'].dt.year

# Missing values in these numeric columns are replaced with 0.
columnas = ['metroscubiertos', 'metrostotales', 'habitaciones', 'banos', 'idzona', 'garages', 'antiguedad']
df[columnas] = df[columnas].fillna(value=0)

# Row-wise total of the five flag columns below.
df['cant_extras'] = (df['gimnasio'] + df['usosmultiples'] + df['piscina']
                     + df['escuelascercanas'] + df['centroscomercialescercanos'])

# Replace each categorical column with its integer category codes.
for columna_categorica in ('tipodepropiedad', 'ciudad', 'provincia'):
    df[columna_categorica] = df[columna_categorica].cat.codes

X = df.copy().values

In [4]:
# Split into train and test: 25% of the rows are held out, with a fixed seed
# so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [5]:
# Candidate regressors to compare. `modelos` and `modelos_nombre` are paired
# position by position (the loops below zip them together).
#
# A fixed random_state is passed to the scikit-learn estimators whose fitting
# involves randomness, so that the reported scores are repeatable between runs.
# NOTE(review): XGBRegressor is left at its defaults because the name of its
# seed keyword has changed across xgboost releases — confirm for the installed version.
SEMILLA = 42
modelos=[
    LinearRegression(),
    KNeighborsRegressor(),
    Ridge(),
    Lasso(),
    MLPRegressor(alpha=20, random_state=SEMILLA),
    DecisionTreeRegressor(random_state=SEMILLA),
    ExtraTreeRegressor(random_state=SEMILLA),
    XGBRegressor(),
    RandomForestRegressor(random_state=SEMILLA),
    AdaBoostRegressor(random_state=SEMILLA),
    GradientBoostingRegressor(random_state=SEMILLA),
    BaggingRegressor(random_state=SEMILLA),
]
modelos_nombre=[
    'LinearRegression', 'KNNRegressor', 'Ridge', 'Lasso', 'MLPRegressor',
    'DecisionTree', 'ExtraTree', 'XGBoost', 'RandomForest', 'AdaBoost',
    'GradientBoost', 'Bagging',
]
# Kept for compatibility; no cell visible in this notebook appends to it.
score_=[]

In [6]:
# Run every model: fit on the training split with the untransformed target,
# then print the value returned by .score() on the held-out split.
for nombre, modelo in zip(modelos_nombre, modelos):
    print('empieza entrenar：' + nombre)
    modelo.fit(X_train, y_train)
    score = modelo.score(X_test, y_test)
    print(nombre + ' score:' + str(score))

empieza entrenar：LinearRegression
LinearRegression score:0.361233938409
empieza entrenar：KNNRegressor
KNNRegressor score:0.692980959585
empieza entrenar：Ridge
Ridge score:0.36123393096
empieza entrenar：Lasso
Lasso score:0.361233914801
empieza entrenar：MLPRegressor
MLPRegressor score:0.369119641676
empieza entrenar：DecisionTree
DecisionTree score:0.556978677594
empieza entrenar：ExtraTree
ExtraTree score:0.499048875175
empieza entrenar：XGBoost
XGBoost score:0.681062788231
empieza entrenar：RandomForest
RandomForest score:0.76297083085
empieza entrenar：AdaBoost
AdaBoost score:0.00649644415622
empieza entrenar：GradientBoost
GradientBoost score:0.679205209062
empieza entrenar：Bagging
Bagging score:0.76228520206


In [7]:
# Apply the natural logarithm to the target and split again. The split uses the
# same test_size and random_state as the earlier split of (X, y).
y_log = np.log(y)
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Same comparison as before, but every model is fitted and scored against the
# log-transformed target.
for nombre, modelo in zip(modelos_nombre, modelos):
    print('empieza entrenar：' + nombre)
    modelo.fit(X_train, y_train_log)
    score = modelo.score(X_test, y_test_log)
    print(nombre + ' score:' + str(score))

empieza entrenar：LinearRegression
LinearRegression score:0.468568179475
empieza entrenar：KNNRegressor
KNNRegressor score:0.768533585155
empieza entrenar：Ridge
Ridge score:0.468568149133
empieza entrenar：Lasso
Lasso score:0.378445924746
empieza entrenar：MLPRegressor
MLPRegressor score:-85.2502986737
empieza entrenar：DecisionTree
DecisionTree score:0.67558160356
empieza entrenar：ExtraTree
ExtraTree score:0.639069948482
empieza entrenar：XGBoost
XGBoost score:0.770385120383
empieza entrenar：RandomForest
RandomForest score:0.821901543689
empieza entrenar：AdaBoost
AdaBoost score:0.575903452311
empieza entrenar：GradientBoost
GradientBoost score:0.769624885582
empieza entrenar：Bagging
Bagging score:0.824114238237


In [9]:
# Standardize the data.
#
# The split is done FIRST and the scalers are fitted on the training portion
# only; fitting them on the full data would let the held-out rows influence the
# mean/variance used for scaling (data leakage).
#
# `y` itself is not reassigned here, so cells that use `y` (e.g. the log
# transform) behave the same whether they run before or after this one.
# np.asarray(...).ravel() also tolerates a `y` that is already a column vector.
y_plano = np.asarray(y).ravel()
X_train_crudo, X_test_crudo, y_train_crudo, y_test_crudo = train_test_split(
    X, y_plano, test_size=0.25, random_state=42)

escalador_X = StandardScaler().fit(X_train_crudo)
# StandardScaler expects 2-D input, hence the reshape to a single column.
escalador_y = StandardScaler().fit(y_train_crudo.reshape(-1, 1))

X_train_estandar = escalador_X.transform(X_train_crudo)
X_test_estandar = escalador_X.transform(X_test_crudo)
y_train_estandar = escalador_y.transform(y_train_crudo.reshape(-1, 1)).ravel()
y_test_estandar = escalador_y.transform(y_test_crudo.reshape(-1, 1)).ravel()

# Full arrays scaled with the train-fitted scalers; kept so that these names
# remain defined for any other cell that refers to them.
X_estandar = escalador_X.transform(X)
y_estandar = escalador_y.transform(y_plano.reshape(-1, 1)).ravel()


In [10]:
# Same comparison again, this time on the standardized features and target.
for nombre, modelo in zip(modelos_nombre, modelos):
    print('empieza entrenar：' + nombre)
    modelo.fit(X_train_estandar, y_train_estandar)
    score = modelo.score(X_test_estandar, y_test_estandar)
    print(nombre + ' score:' + str(score))

empieza entrenar：LinearRegression
LinearRegression score:0.361236244913
empieza entrenar：KNNRegressor
KNNRegressor score:0.601734313572
empieza entrenar：Ridge
Ridge score:0.361233919499
empieza entrenar：Lasso
Lasso score:-1.79817439336e-05
empieza entrenar：MLPRegressor
MLPRegressor score:0.411969096194
empieza entrenar：DecisionTree
DecisionTree score:0.557735641828
empieza entrenar：ExtraTree
ExtraTree score:0.506469540993
empieza entrenar：XGBoost
XGBoost score:0.682303051185
empieza entrenar：RandomForest
RandomForest score:0.759024363128
empieza entrenar：AdaBoost
AdaBoost score:0.117793049712
empieza entrenar：GradientBoost
GradientBoost score:0.682424385537
empieza entrenar：Bagging
Bagging score:0.761603856492


In [11]:
# Load the previously saved comparison table of model scores.
# The file's first column has an empty header (pandas would otherwise surface it
# as a data column named "Unnamed: 0"), so it is read back as the row index.
df_modelos_resultados = pd.read_csv("modelos_resultados.csv", index_col=0)
df_modelos_resultados

Unnamed: 0,modelo,sinmodificar,logaritmo,estandar,mejor_performance
0,LinearRegression,0.361234,0.468568,0.361236,logaritmo
1,KNNRegressor,0.692981,0.768534,0.601734,logaritmo
2,Ridge,0.361234,0.468568,0.361234,logaritmo
3,Lasso,0.361234,0.378446,-1.8e-05,logaritmo
4,MLPRegressor,0.387305,-281.162411,0.41135,estandar
5,DecisionTree,0.560428,0.673522,0.557428,logaritmo
6,ExtraTree,0.487954,0.635222,0.479507,logaritmo
7,XGBoost,0.681063,0.770385,0.682303,logaritmo
8,RandomForest,0.759886,0.82335,0.759021,logaritmo
9,AdaBoost,0.189561,0.557912,-0.053853,logaritmo


In [12]:
# Random forest fitted on the untransformed target, to inspect how much each
# feature column contributes. random_state is fixed so the importances (and the
# fitted trees) are the same on every run.
random_forest = RandomForestRegressor(n_estimators=200, max_depth=1000, n_jobs=-1, random_state=42)
random_forest.fit(X_train, y_train)
random_forest.feature_importances_

array([ 0.09742557,  0.05566406,  0.09251227,  0.03226349,  0.03130875,
        0.01392139,  0.07786624,  0.38906046,  0.07106923,  0.06258694,
        0.00305602,  0.0014424 ,  0.0028512 ,  0.00273734,  0.00298921,
        0.02509188,  0.03078257,  0.00737097])

In [13]:
# Random forest fitted on the log-transformed target. This rebinds
# `random_forest`, so later cells that call random_forest.predict() get
# predictions on the log scale. random_state is fixed so the fitted model —
# and anything predicted from it — is reproducible.
random_forest = RandomForestRegressor(n_estimators=200, max_depth=1000, n_jobs=-1, random_state=42)
random_forest.fit(X_train, y_train_log)
random_forest.feature_importances_

array([ 0.08095614,  0.03582951,  0.0889979 ,  0.02364571,  0.03014272,
        0.01302167,  0.12138901,  0.45074236,  0.05810849,  0.04610976,
        0.0017444 ,  0.00066181,  0.00201997,  0.00201129,  0.00211153,
        0.01845722,  0.01914316,  0.00490735])

In [14]:
# Predict on the held-out rows and exponentiate the predictions
# (the forest in the preceding cell was fitted on y_train_log).
y_pred = np.exp(random_forest.predict(X_test))
y_pred

array([ 3119472.42636328,  2424894.65408515,  6741394.39985208, ...,
        1448785.17076876,  2120633.90576078,  1307610.89393898])

In [15]:
# Load the test file and build its feature matrix with the same steps used for
# the training data.
df_test = pd.read_csv("test.csv", dtype={
    'tipodepropiedad':'category', 'ciudad':'category',\
    'provincia':'category'}, parse_dates=[16])

# Parse the TEST file's own dates. Assigning df_original['fecha'] here would
# copy training dates into the test rows by index alignment.
df_test['fecha'] = pd.to_datetime(df_test['fecha'])
Id = df_test['id']

# NOTE: this rebinds `df`, which previously held the training features.
df = df_test.drop(columns=['fecha', 'id', 'titulo', 'descripcion', 'direccion', 'lat', 'lng']).copy()

df['mes'] = df_test['fecha'].dt.month
df['anio'] = df_test['fecha'].dt.year
columnas = ['metroscubiertos', 'metrostotales', 'habitaciones', 'banos', 'idzona', 'garages', 'antiguedad']
df[columnas] = df[columnas].fillna(value=0)

df['cant_extras'] = sum([df['gimnasio'],df['usosmultiples'],df['piscina'],
                        df['escuelascercanas'],df['centroscomercialescercanos']])

# Encode the categorical columns with the category list of the TRAINING data,
# so a given label maps to the same integer code the model was fitted on.
# Labels that do not occur in the training categories get code -1, the same
# code .cat.codes uses for missing values.
for columna_categorica in ['tipodepropiedad', 'ciudad', 'provincia']:
    categorias_train = df_original[columna_categorica].cat.categories
    df[columna_categorica] = df[columna_categorica].cat.set_categories(categorias_train).cat.codes

id_test = df.copy().values

In [16]:
# Predict on the test features, exponentiate the predictions and write the
# Id/Valor table to CSV without the row index.
valor_pred = random_forest.predict(id_test)
df_resultado = pd.DataFrame(
    {'Id': Id, 'Valor': np.exp(valor_pred)},
    columns=['Id', 'Valor'],
)
df_resultado.to_csv('resultado_version1.csv', index=False)