In [317]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.impute import SimpleImputer

In [318]:
#import modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

# Procesamiento de los datos

In [319]:
#filtrando los datos
df = pd.read_csv('test.csv', dtype={
    'tipodepropiedad':'category', 'ciudad':'category',\
    'provincia':'category'}, parse_dates=[16])

### Manejo de valores nulos

In [320]:
df.isnull().sum()

id                                0
titulo                         1378
descripcion                     401
tipodepropiedad                   7
direccion                     13191
ciudad                           83
provincia                        42
antiguedad                    10714
habitaciones                   5628
garages                        9323
banos                          6554
metroscubiertos                4299
metrostotales                 12655
idzona                         7179
lat                           30695
lng                           30695
fecha                             0
gimnasio                          0
usosmultiples                     0
piscina                           0
escuelascercanas                  0
centroscomercialescercanos        0
dtype: int64

A los datos del dataframe que tienen una de las columnas (de 'metrostotales' y 'metroscubiertos') pero no la otra, copiamos el dato de una a la otra según sea necesario.

In [321]:
## funcion para setear los superficies
def set_metros(row):
    total = row.metrostotales
    covered = row.metroscubiertos
    if np.isnan(total):
        row.metrostotales = covered
        return row
    if np.isnan(covered):
        row.metroscubiertos = total
        return row
    return row

In [322]:
df[['metrostotales', 'metroscubiertos']] = df[['metrostotales', 'metroscubiertos']].apply(set_metros, axis = 1)

In [323]:
df = df.loc[~df['tipodepropiedad'].isnull()] #Elimino los datos que no tengan tipo de propiedad (46)
df['garages'] = df['garages'].fillna(value = 0)

In [324]:
num_cols_con_nulls = ['antiguedad', 'habitaciones', 'banos', 'idzona']
cat_cols_con_nulls = ['ciudad', 'provincia']

In [325]:
# define numerical imputer
num_imputer = SimpleImputer(strategy='mean')

# imputing on numerical data
df[num_cols_con_nulls] = num_imputer.fit_transform(df[num_cols_con_nulls])

In [326]:
from sklearn.base import TransformerMixin

class SeriesImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        If the Series is of dtype Object, then impute with the most frequent object.
        If the Series is not of dtype Object, then impute with the mean.  

        """
    def fit(self, X, y=None):
        self.fill = X.value_counts().index[0]
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [327]:
# define categorical imputer
cat_imputer = SeriesImputer()

# imputing on categorical data
df['ciudad'] = cat_imputer.fit_transform(df['ciudad'])
df['provincia'] = cat_imputer.fit_transform(df['provincia'])

In [328]:
df.isnull().sum()

id                                0
titulo                         1378
descripcion                     401
tipodepropiedad                   0
direccion                     13184
ciudad                            0
provincia                         0
antiguedad                        0
habitaciones                      0
garages                           0
banos                             0
metroscubiertos                   0
metrostotales                     0
idzona                            0
lat                           30689
lng                           30689
fecha                             0
gimnasio                          0
usosmultiples                     0
piscina                           0
escuelascercanas                  0
centroscomercialescercanos        0
dtype: int64

In [330]:
df['fecha'] = pd.to_datetime(df['fecha'])

y = df['precio'].values

df['dia'] = df['fecha'].dt.day
df['mes'] = df['fecha'].dt.month
df['anio'] = df['fecha'].dt.year

df = df.drop(columns=['fecha', 'id', 'titulo', 'descripcion', 'direccion', 'lat', 'lng', 'precio']).copy()

X = df

df['cant_extras'] = sum([df['gimnasio'],df['usosmultiples'],df['piscina'],
                        df['escuelascercanas'],df['centroscomercialescercanos']])

df.head()

Unnamed: 0,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,dia,mes,anio,cant_extras
0,Casa,Miguel Hidalgo,Distrito Federal,29.0,3.0,0.0,4.0,300.0,300.0,2510839.0,0.0,0.0,0.0,0.0,0.0,20,7,2013,0.0
1,Apartamento,Mérida,Yucatán,8.151266,1.0,1.0,1.0,67.0,67.0,113851.0,0.0,0.0,0.0,0.0,0.0,24,10,2015,0.0
2,Apartamento,Coyoacán,Distrito Federal,0.0,2.0,1.0,2.0,87.0,100.0,23620.0,0.0,0.0,0.0,0.0,1.0,30,5,2015,1.0
3,Apartamento,Acapulco de Juárez,Guerrero,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,0.0,0.0,0.0,0.0,0.0,2,4,2015,0.0
4,Casa,Tultitlán,Edo. de México,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,0.0,0.0,0.0,1.0,1.0,15,8,2013,2.0


## Encoding variables categóricas

### Tipo de propiedad

In [331]:
df['tipodepropiedad'].unique()

[Casa, Apartamento, Casa en condominio, Terreno, Bodega comercial, ..., Otros, Duplex, Terreno industrial, Huerta, Lote]
Length: 22
Categories (22, object): [Casa, Apartamento, Casa en condominio, Terreno, ..., Duplex, Terreno industrial, Huerta, Lote]

In [332]:
df.shape

(59993, 19)

In [333]:
#One Hot Encoding
df = pd.get_dummies(df, prefix = ['Tipo'], columns = ['tipodepropiedad'])

In [334]:
df.shape

(59993, 40)

In [335]:
df.head()

Unnamed: 0,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,gimnasio,...,Tipo_Oficina comercial,Tipo_Otros,Tipo_Quinta Vacacional,Tipo_Rancho,Tipo_Terreno,Tipo_Terreno comercial,Tipo_Terreno industrial,Tipo_Villa,Tipo_Huerta,Tipo_Lote
0,Miguel Hidalgo,Distrito Federal,29.0,3.0,0.0,4.0,300.0,300.0,2510839.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,Mérida,Yucatán,8.151266,1.0,1.0,1.0,67.0,67.0,113851.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,Coyoacán,Distrito Federal,0.0,2.0,1.0,2.0,87.0,100.0,23620.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Acapulco de Juárez,Guerrero,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,Tultitlán,Edo. de México,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Provincia

Efectuare un ordinal encoding, teniendo en cuenta los precios promedio por provincia obtenidos a partir del TP1

In [336]:
precio_promedio_por_provincia = pd.read_csv('provincias_por_precio')
precio_promedio_por_provincia = precio_promedio_por_provincia.sort_values(by = 'precio_por_m2_mean')



def cambiar_valores(x):
    i = 0
    if x.name == 'precio_por_m2_mean':
        i+=1
        return i
    return x

precio_promedio_por_provincia = precio_promedio_por_provincia.apply(cambiar_valores)
precio_promedio_por_provincia['precio_por_m2_mean'] = precio_promedio_por_provincia['precio_por_m2_mean'].cumsum()
precio_promedio_por_provincia

Unnamed: 0,provincia,precio_por_m2_mean
31,Baja California,1
30,Durango,2
29,Sonora,3
28,Baja California Sur,4
27,Coahuila,5
26,Tlaxcala,6
25,Chihuahua,7
24,Aguascalientes,8
23,Colima,9
22,Hidalgo,10


In [337]:
orden_provincias = precio_promedio_por_provincia.set_index('provincia').to_dict()
orden_provincias = orden_provincias['precio_por_m2_mean']
orden_provincias

{'Baja California': 1,
 'Durango': 2,
 'Sonora': 3,
 'Baja California Sur': 4,
 'Coahuila': 5,
 'Tlaxcala': 6,
 'Chihuahua': 7,
 'Aguascalientes': 8,
 'Colima': 9,
 'Hidalgo': 10,
 'Tamaulipas': 11,
 'Campeche': 12,
 'Yucatan': 13,
 'Sinaloa': 14,
 'Michoacan': 15,
 'Zacatecas': 16,
 'Nayarit': 17,
 'Oaxaca': 18,
 'San Luis Potosi': 19,
 'Guanajuato': 20,
 'Queretaro': 21,
 'Veracruz': 22,
 'Morelos': 23,
 'Chiapas': 24,
 'Puebla': 25,
 'Quintana Roo': 26,
 'Tabasco': 27,
 'Jalisco': 28,
 'Nuevo Leon': 29,
 'Mexico': 30,
 'Guerrero': 31,
 'Distrito Federal': 32}

In [338]:
df['provincia'] = df['provincia'].replace({'Edo. de México' : 'Mexico',
                                               'Querétaro' : 'Queretaro',
                                               'Nuevo León' : 'Nuevo Leon',
                                               'San luis Potosí' : 'San Luis Potosi',
                                               'Yucatán' : 'Yucatan',
                                               'Baja California Norte' : 'Baja California',
                                               'Michoacán' : 'Michoacan'})

In [339]:
df['provincia_ordinal'] = df.provincia.map(orden_provincias)
df.head()

Unnamed: 0,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,gimnasio,...,Tipo_Otros,Tipo_Quinta Vacacional,Tipo_Rancho,Tipo_Terreno,Tipo_Terreno comercial,Tipo_Terreno industrial,Tipo_Villa,Tipo_Huerta,Tipo_Lote,provincia_ordinal
0,Miguel Hidalgo,Distrito Federal,29.0,3.0,0.0,4.0,300.0,300.0,2510839.0,0.0,...,0,0,0,0,0,0,0,0,0,32
1,Mérida,Yucatan,8.151266,1.0,1.0,1.0,67.0,67.0,113851.0,0.0,...,0,0,0,0,0,0,0,0,0,13
2,Coyoacán,Distrito Federal,0.0,2.0,1.0,2.0,87.0,100.0,23620.0,0.0,...,0,0,0,0,0,0,0,0,0,32
3,Acapulco de Juárez,Guerrero,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,0.0,...,0,0,0,0,0,0,0,0,0,31
4,Tultitlán,Mexico,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,0.0,...,0,0,0,0,0,0,0,0,0,30


In [340]:
#df['ciudad'] = df['ciudad'].cat.codes
#df['provincia'] = df['provincia'].cat.codes
df = df.drop(columns = ['provincia', 'ciudad'])

In [341]:
df.head()

Unnamed: 0,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,gimnasio,usosmultiples,piscina,...,Tipo_Otros,Tipo_Quinta Vacacional,Tipo_Rancho,Tipo_Terreno,Tipo_Terreno comercial,Tipo_Terreno industrial,Tipo_Villa,Tipo_Huerta,Tipo_Lote,provincia_ordinal
0,29.0,3.0,0.0,4.0,300.0,300.0,2510839.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,32
1,8.151266,1.0,1.0,1.0,67.0,67.0,113851.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,13
2,0.0,2.0,1.0,2.0,87.0,100.0,23620.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,32
3,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,31
4,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,30


## Ejecucion de modelos

In [342]:
X = df.copy().values

In [343]:
def ejecutar_modelos(X_train, y_train, X_test, y_test, modelos, nombres):
    resultado = []
    for nombre,modelo in zip(modelos_nombre, modelos):
        print('------------------------------------------------')
        print('Comienza a entrenar: '+ nombre)
        print()
        modelo = modelo   
        modelo.fit(X_train,y_train)
        score = modelo.score(X_test,y_test)
        resultado.append((str(nombre), str(score)))
        print('**FINALIZADO** ' + nombre + ' Score: ' + str(score))
        print()
    return resultado

In [295]:
#splitear train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [296]:
modelos=[LinearRegression(),KNeighborsRegressor(),Ridge(),Lasso(),MLPRegressor(alpha=20),DecisionTreeRegressor(),ExtraTreeRegressor(),XGBRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor(),BaggingRegressor()]
modelos_nombre=['LinearRegression','KNNRegressor','Ridge','Lasso','MLPRegressor','DecisionTree','ExtraTree','XGBoost','RandomForest','AdaBoost','GradientBoost','Bagging']
score_=[]

In [297]:
resultado_normal = ejecutar_modelos(X_train, y_train, X_test, y_test, modelos, modelos_nombre)

------------------------------------------------
Comienza a entrenar: LinearRegression

**FINALIZADO** LinearRegression Score: 0.5711356789197017

------------------------------------------------
Comienza a entrenar: KNNRegressor

**FINALIZADO** KNNRegressor Score: 0.646462318191011

------------------------------------------------
Comienza a entrenar: Ridge

**FINALIZADO** Ridge Score: 0.5711133552579843

------------------------------------------------
Comienza a entrenar: Lasso

**FINALIZADO** Lasso Score: 0.5711332241317963

------------------------------------------------
Comienza a entrenar: MLPRegressor

**FINALIZADO** MLPRegressor Score: 0.4431136850435383

------------------------------------------------
Comienza a entrenar: DecisionTree

**FINALIZADO** DecisionTree Score: 0.5355703540484082

------------------------------------------------
Comienza a entrenar: ExtraTree

**FINALIZADO** ExtraTree Score: 0.45951778116109876

------------------------------------------------
Comi

In [298]:
#aplico logaritmo
y_log = np.log(y)
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.25, random_state=42)

In [299]:
resultado_log = ejecutar_modelos(X_train, y_train_log, X_test, y_test_log, modelos, modelos_nombre)

------------------------------------------------
Comienza a entrenar: LinearRegression

**FINALIZADO** LinearRegression Score: 0.6787958224324625

------------------------------------------------
Comienza a entrenar: KNNRegressor

**FINALIZADO** KNNRegressor Score: 0.724528146881767

------------------------------------------------
Comienza a entrenar: Ridge

**FINALIZADO** Ridge Score: 0.6787756423550482

------------------------------------------------
Comienza a entrenar: Lasso

**FINALIZADO** Lasso Score: 0.5169830400226825

------------------------------------------------
Comienza a entrenar: MLPRegressor

**FINALIZADO** MLPRegressor Score: -282.780534670208

------------------------------------------------
Comienza a entrenar: DecisionTree

**FINALIZADO** DecisionTree Score: 0.6575923676081432

------------------------------------------------
Comienza a entrenar: ExtraTree

**FINALIZADO** ExtraTree Score: 0.6219563448950498

------------------------------------------------
Comien

In [None]:
RandomForestRegressor().fit_transform()

In [300]:
#estanderizar datos
X_estandar=StandardScaler().fit_transform(X)
y=np.array(y).reshape(-1,1)
y_estandar=StandardScaler().fit_transform(y)
y_estandar=y_estandar.ravel()
X_train_estandar, X_test_estandar, y_train_estandar, y_test_estandar = train_test_split(X_estandar, y_estandar, test_size=0.25, random_state=42)


In [301]:
resultado_estandar = ejecutar_modelos(X_train_estandar, y_train_estandar, X_test_estandar, y_test_estandar, modelos, modelos_nombre)

------------------------------------------------
Comienza a entrenar: LinearRegression

**FINALIZADO** LinearRegression Score: 0.5711356175554008

------------------------------------------------
Comienza a entrenar: KNNRegressor

**FINALIZADO** KNNRegressor Score: 0.6110149219877827

------------------------------------------------
Comienza a entrenar: Ridge

**FINALIZADO** Ridge Score: 0.5711356448241414

------------------------------------------------
Comienza a entrenar: Lasso

**FINALIZADO** Lasso Score: -1.718533198169503e-05

------------------------------------------------
Comienza a entrenar: MLPRegressor

**FINALIZADO** MLPRegressor Score: 0.5961823133171069

------------------------------------------------
Comienza a entrenar: DecisionTree

**FINALIZADO** DecisionTree Score: 0.5302896801968984

------------------------------------------------
Comienza a entrenar: ExtraTree

**FINALIZADO** ExtraTree Score: 0.4634860820714931

------------------------------------------------


In [302]:
#Crea un archivo csv de la forma nombre_del_algoritmo, score

#Columnas: string con nombres de columnas separados por comas
#Resultados: lista de resultados, siendo cada resultado una lista conformada por tuplas (Nombre_algoritmo, Score)
def crear_csv_resultados(columnas, resultados, nombre_archivo):
    with open(nombre_archivo, 'w') as f:
        f.write(columnas + '\n')
        lineas_a_escribir = []
        for i in range(len(resultados[0])):
            linea = resultados[0][i][0] + ','
            for resultado in resultados:
                linea = linea + resultado[i][1] + ','
            linea = linea[:len(linea)-1]
            lineas_a_escribir.append(linea)
        for linea in lineas_a_escribir:
            f.write(linea + '\n')

In [308]:
NOMBRE_CSV = 'one_hot+ordinal_provincias' # CAMBIARLO ASI NO SE PISAN!!!!
crear_csv_resultados('nombre,normal,log,estandar',[resultado_normal, resultado_log, resultado_estandar],NOMBRE_CSV)

In [304]:
pd.read_csv('one_hot_ciudades')

Unnamed: 0,nombre,normal,log
0,LinearRegression,0.555144,0.648737
1,KNNRegressor,0.651336,0.726062
2,Ridge,0.555125,0.64872
3,Lasso,0.555142,0.453054
4,MLPRegressor,0.433307,-15.786975
5,DecisionTree,0.556668,0.673395
6,ExtraTree,0.487819,0.621363
7,XGBoost,0.688102,0.772202
8,RandomForest,0.762187,0.820405
9,AdaBoost,0.265294,0.608815


In [305]:
pd.read_csv('num_imputter_mean')

Unnamed: 0,nombre,normal,log,estandar
0,LinearRegression,0.511245,0.59588,0.511245
1,KNNRegressor,0.672423,0.750899,0.573645
2,Ridge,0.511245,0.59588,0.511245
3,Lasso,0.511245,0.453054,-1.7e-05
4,MLPRegressor,0.45629,-58285.253482,0.506419
5,DecisionTree,0.54894,0.670336,0.545172
6,ExtraTree,0.492676,0.632293,0.475199
7,XGBoost,0.686549,0.777634,0.687392
8,RandomForest,0.757515,0.822943,0.75854
9,AdaBoost,0.375647,0.585711,0.340939


In [309]:
pd.read_csv(NOMBRE_CSV)

Unnamed: 0,nombre,normal,log,estandar
0,LinearRegression,0.571136,0.678796,0.571136
1,KNNRegressor,0.646462,0.724528,0.611015
2,Ridge,0.571113,0.678776,0.571136
3,Lasso,0.571133,0.516983,-1.7e-05
4,MLPRegressor,0.443114,-282.780535,0.596182
5,DecisionTree,0.53557,0.657592,0.53029
6,ExtraTree,0.459518,0.621956,0.463486
7,XGBoost,0.689195,0.77661,0.689946
8,RandomForest,0.749007,0.814572,0.745852
9,AdaBoost,0.200397,0.620372,0.124905
