# **1. Importación de *modules***

In [189]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# **2. Importación del dataset *properatti.csv***

In [190]:
data_raw = pd.read_csv("../data/properatti.csv", sep = ",", low_memory=False) 
#data.head(3)
data_raw.shape

(121220, 26)

# **3. Tratamiento de los datos**

## **3.1. Selección del subdataset AMBA**

In [191]:
amba = ['Capital Federal', 'Bs.As. G.B.A. Zona Sur', 'Bs.As. G.B.A. Zona Norte', 'Bs.As. G.B.A. Zona Oeste']
data_amba = data_raw[data_raw["state_name"].isin(amba)]
data_amba.shape

(81150, 26)

## **3.2. Limpieza e imputaciones**

### 3.2.1. Dropeo de columnas no informativas

In [192]:
cols2keep = ['property_type', 'state_name', 'place_name','place_with_parent_names','price_aprox_usd', 'surface_total_in_m2','surface_covered_in_m2','price_usd_per_m2', 'rooms', 'description', 'title', 'properati_url']
data_col_clean = data_amba.loc[:, cols2keep]
data_col_clean.columns
data_col_clean.shape

Index(['property_type', 'state_name', 'place_name', 'place_with_parent_names',
       'price_aprox_usd', 'surface_total_in_m2', 'surface_covered_in_m2',
       'price_usd_per_m2', 'rooms', 'description', 'title', 'properati_url'],
      dtype='object')

(81150, 12)

### 3.2.2. Chequeo de valores nulos (no modifica datasets)

In [193]:
cant_nulos_por_campo = data_col_clean.apply(lambda x: x.isnull().sum(), axis = 0)
percent_nulos_por_campo = data_col_clean.apply(lambda x: (100 * x.isnull().sum() / data_col_clean.shape[0]).round(2), axis = 0)
pd.DataFrame({'null': cant_nulos_por_campo, '%': percent_nulos_por_campo,'type': data_col_clean.dtypes})

Unnamed: 0,null,%,type
property_type,0,0.0,object
state_name,0,0.0,object
place_name,23,0.03,object
place_with_parent_names,0,0.0,object
price_aprox_usd,8656,10.67,float64
surface_total_in_m2,22792,28.09,float64
surface_covered_in_m2,8505,10.48,float64
price_usd_per_m2,29515,36.37,float64
rooms,51211,63.11,float64
description,1,0.0,object


### 3.2.3. Imputación de la columna *rooms*


In [194]:
# Comprobamos el tipo y los valores (no modifica datasets)
print(f'''summary rooms

type: {data_col_clean.rooms.dtype}
N total: {data_col_clean.shape[0]}
N null: {data_col_clean.rooms.isnull().sum()}
% null: {round((data_col_clean.rooms.isnull().sum() / data_col_clean.shape[0] * 100), 2)}%
unique: {np.sort(data_col_clean['rooms'].unique())}''')

summary rooms

type: float64
N total: 81150
N null: 51211
% null: 63.11%
unique: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 22. 25. 32. nan]


In [195]:
# Uniformamos la capitalización de las variables que se van a usar para imputar rooms
data_col_clean['title'] = data_col_clean.title.str.upper()
data_col_clean['description'] = data_col_clean.description.str.upper()

In [196]:
room_mapper = {"UNO": "1", "UN": "1", "DOS": "2", "TRES": "3", "CUATRO": "4", "CINCO": "5", "SEIS": "6", "SIETE": "7", "OCHO": "8",
             "NUEVE": "9", "DIEZ": "10", "MONO AMBIENTE": "1 AMBIENTE", "MONOAMBIENTE": "1 AMBIENTE", "MONOAMB" : "1 AMBIENTE", "AMBIENTE DIVISIBLE": "1 AMBIENTE",
             "DORMITORIOS": "AMBIENTE", "DORMITORIO": "AMBIENTE", "HABITACIONES": "AMBIENTE", "HABITACION": "AMBIENTE"}

#for key in room_mapper.keys():
#    data_col_clean[['title', 'description']].replace(key, room_mapper[key], inplace = True, regex = False)

for key in room_mapper.keys():
    data_col_clean.description = data_col_clean.description.str.replace(key, room_mapper[key], regex = False)
    data_col_clean.title = data_col_clean.title.str.replace(key, room_mapper[key], regex = False)

In [197]:
data_input = data_col_clean

array_regex = ["(\d+)AMB", "(\d+) AMB", "(\d+)DORM", "(\d+) DORM", "(\d+)HABITACIO", "(\d+) HABITACIO"]

for regex in array_regex:
    controlRooms = data_input[(data_input.rooms.isnull())]
    controlRooms.rooms = controlRooms.title.str.extract(regex).astype(float)
    data_input.update(controlRooms)
    controlRooms = data_input[(data_input.rooms.isnull())]
    controlRooms.rooms = controlRooms.description.str.extract(regex).astype(float)
    data_input.update(controlRooms)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [198]:
# Comprobamos el tipo y los valores luego de la imputación (no modifica datasets)
print(f'''summary rooms

type: {data_input['rooms'].dtype}
N total: {data_input.shape[0]}
N null: {data_input['rooms'].isnull().sum()}
% null: {round((data_input['rooms'].isnull().sum() / data_col_clean.shape[0] * 100), 2)}%
unique: {np.sort(data_input['rooms'].unique())}''')

summary rooms

type: float64
N total: 81150
N null: 10685
% null: 13.17%
unique: [0.0000e+00 1.0000e+00 2.0000e+00 3.0000e+00 4.0000e+00 5.0000e+00
 6.0000e+00 7.0000e+00 8.0000e+00 9.0000e+00 1.0000e+01 1.1000e+01
 1.2000e+01 1.3000e+01 1.4000e+01 1.5000e+01 1.6000e+01 1.7000e+01
 1.8000e+01 2.0000e+01 2.1000e+01 2.2000e+01 2.3000e+01 2.4000e+01
 2.5000e+01 2.7000e+01 2.9000e+01 3.0000e+01 3.2000e+01 3.3000e+01
 3.5000e+01 3.6000e+01 3.7000e+01 4.0000e+01 4.2000e+01 4.5000e+01
 4.8000e+01 5.0000e+01 5.1000e+01 5.2000e+01 5.4000e+01 6.0000e+01
 6.2000e+01 7.0000e+01 7.2000e+01 8.0000e+01 8.2000e+01 8.3000e+01
 8.5000e+01 8.7000e+01 9.0000e+01 9.1000e+01 1.0000e+02 1.0300e+02
 1.2500e+02 2.1000e+02 2.7200e+02 4.0300e+02 6.0200e+02 6.6200e+02
 7.7200e+02 8.3100e+02 9.0200e+02 2.0000e+03 4.0000e+03 6.0030e+03
 2.0173e+04 2.0174e+04 6.5003e+04 1.1200e+05        nan]


### 3.2.4. Dropeo de duplicados

In [199]:
data_input.drop_duplicates(keep = "first", inplace = True)

### 3.2.5. Limpieza de *outliers* en *price_usd_per_m2*

In [200]:
#búsqueda y reemplazo de outliers (de más de 2 std, 95%) por NaN en las columnas numéricas, en un solo paso
data_out = data_input
df_sub = data_out.loc[:, 'price_usd_per_m2']
lim = np.abs((df_sub - df_sub.mean()) / df_sub.std(ddof=0)) < 2
data_out.loc[:, 'price_usd_per_m2'] = df_sub.where(lim, np.nan)
data_out.shape

(81150, 12)

### 3.2.6. Dropeo de *NaNs*

In [201]:
data_na = data_out
data_na.dropna(axis = 0, how = 'any', subset = ['property_type', 'state_name', 'place_name', 'price_aprox_usd', 'surface_total_in_m2','surface_covered_in_m2', 'rooms', 'price_usd_per_m2'], inplace = True)
data_na.shape

(42717, 12)

In [202]:
# Último chequeo de que no hay nans en las columnas de trabajo (feature y targets)

cant_nulos_por_campo = data_na.apply(lambda x: x.isnull().sum(), axis = 0)
percent_nulos_por_campo = data_na.apply(lambda x: (100 * x.isnull().sum() / data_na.shape[0]).round(2), axis = 0)
pd.DataFrame({'null': cant_nulos_por_campo, '%': percent_nulos_por_campo,'type': data_na.dtypes})

Unnamed: 0,null,%,type
property_type,0,0.0,object
state_name,0,0.0,object
place_name,0,0.0,object
place_with_parent_names,0,0.0,object
price_aprox_usd,0,0.0,float64
surface_total_in_m2,0,0.0,float64
surface_covered_in_m2,0,0.0,float64
price_usd_per_m2,0,0.0,float64
rooms,0,0.0,float64
description,0,0.0,object


## **3.3. Creacion de columnas *dummies***

### 3.3.1. *Amenities*

In [203]:
#fracciono la columna properti_url para sacar la nube de palabras mas repetidas
patron_url = re.compile(pattern = "_", flags = re.IGNORECASE)
lista_url = data_na["properati_url"].apply(lambda x : patron_url.split(x))
serie_palabras = pd.Series(np.hstack(lista_url))
#serie_palabras.value_counts().head(20).plot(kind="bar")
serie_palabras.value_counts().head(50)

venta                       42662
departamento                27323
garage                      21795
lavadero                    19749
balcon                      17206
parrilla                    16518
piscina                     14135
luminoso                    14032
suite                       13137
casa                        12777
toilette                    11959
placard                     10993
terraza                     10784
vestidor                     8882
jardin                       7775
patio                        6598
dependencias                 6451
sum                          5961
aire-acondicionado           5700
baulera                      5494
amenities                    5423
gimnasio                     4763
estrenar                     4394
lujoso                       4275
vista                        3850
subte-linea-d                3639
quincho                      3599
hidromasaje                  3170
subte-linea-b                2986
subte-linea-a 

In [204]:
# a partir de la nube de palabras selecciono las que son buenos adicionales
adicionales = ["garage", "balcon", "parrilla", "piscina", "terraza", "patio", "jardin", "quincho", "sum", "amenities", "baulera", "gimnasio", "subte-linea-d", "subte-linea-b", "subte-linea-a", "subte-linea-h", "subte-linea-e"]

In [205]:
#elimino el primer elemento de lista_url para no tener el elemento con el http: etc
for sublist in lista_url:
  del sublist[0]

lista_url

0         [venta, ph, mataderos, lavadero, patio, inmobi...
2         [venta, departamentos, mataderos, lavadero, pl...
6         [venta, ph, munro, lavadero, patio, garage, al...
7         [venta, departamentos, belgrano, lavadero, pis...
8         [venta, departamentos, belgrano, lavadero, pis...
                                ...                        
121154    [venta, casa, la-plata, suite, parrilla, pisci...
121158    [venta, departamento, recoleta, pueyrredon-av-...
121215    [venta, departamento, belgrano, balcon, suite,...
121216    [venta, casa, beccar, suite, hidromasaje, jard...
121217    [venta, departamento, villa-urquiza, holmberg,...
Name: properati_url, Length: 42717, dtype: object

In [206]:
#creo una función que compare la lista de palabras con la lista de listas
#y me da como resultado una lista de listas de palabras true/false segun coincida o no 
def buscador_palabras(quebuscar, dondebuscar):
  listadeextras = []
  for listas in dondebuscar:
    extras = []
    for palabra in quebuscar:
      if palabra in listas:
        extras.append(True)
      else:
        extras.append(False)
    listadeextras.append(extras)
  #print(listadeextras)
  return listadeextras     

In [207]:
#aplico la funcion a mi lista "adicionales" y "lista_url"
#chequeo que tenga la misma longitud de data
resultado = buscador_palabras(adicionales, lista_url)
len(resultado)

42717

In [208]:
#convierto resultado en dataframe, y renombro las columnas por la lista de palabras adicionales
df = pd.DataFrame(resultado)
df.columns = ["garage", "balcon", "parrilla", "piscina", "terraza", "patio", "jardin", "quincho", "s.u.m.", "amenities", "baulera", "gimnasio","subte-linea-d", "subte-linea-b", "subte-linea-a", "subte-linea-h", "subte-linea-e"]
df = df.add_prefix('am_')
df

Unnamed: 0,am_garage,am_balcon,am_parrilla,am_piscina,am_terraza,am_patio,am_jardin,am_quincho,am_s.u.m.,am_amenities,am_baulera,am_gimnasio,am_subte-linea-d,am_subte-linea-b,am_subte-linea-a,am_subte-linea-h,am_subte-linea-e
0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42712,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False
42713,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False
42714,True,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False
42715,True,False,True,True,False,False,True,True,False,False,False,False,False,False,False,False,False


In [209]:
#uno el dataframe original con el nuevo generado de true/false
#data = pd.merge(data,df,left_index=True, right_index=True)
data_dum_1 = data_na.join(df)
data_dum_1.columns
#antes me puso los dos indices como resultado del merge, y tuve que sacar la primera columna
#data.drop(columns=data.columns[0], axis=1,inplace=True)
data_dum_1.shape

Index(['property_type', 'state_name', 'place_name', 'place_with_parent_names',
       'price_aprox_usd', 'surface_total_in_m2', 'surface_covered_in_m2',
       'price_usd_per_m2', 'rooms', 'description', 'title', 'properati_url',
       'am_garage', 'am_balcon', 'am_parrilla', 'am_piscina', 'am_terraza',
       'am_patio', 'am_jardin', 'am_quincho', 'am_s.u.m.', 'am_amenities',
       'am_baulera', 'am_gimnasio', 'am_subte-linea-d', 'am_subte-linea-b',
       'am_subte-linea-a', 'am_subte-linea-h', 'am_subte-linea-e'],
      dtype='object')

(42717, 29)

### 3.3.2. *State name*

**creé las dummies sin drop_first para poder hacer el join**

In [210]:
df_sn = pd.get_dummies(data_dum_1["state_name"], prefix = 'sn', drop_first = True)

In [211]:
data_dum_2 = data_dum_1.join(df_sn)
data_dum_2.columns
data_dum_2.shape

Index(['property_type', 'state_name', 'place_name', 'place_with_parent_names',
       'price_aprox_usd', 'surface_total_in_m2', 'surface_covered_in_m2',
       'price_usd_per_m2', 'rooms', 'description', 'title', 'properati_url',
       'am_garage', 'am_balcon', 'am_parrilla', 'am_piscina', 'am_terraza',
       'am_patio', 'am_jardin', 'am_quincho', 'am_s.u.m.', 'am_amenities',
       'am_baulera', 'am_gimnasio', 'am_subte-linea-d', 'am_subte-linea-b',
       'am_subte-linea-a', 'am_subte-linea-h', 'am_subte-linea-e',
       'sn_Bs.As. G.B.A. Zona Oeste', 'sn_Bs.As. G.B.A. Zona Sur',
       'sn_Capital Federal'],
      dtype='object')

(42717, 32)

### 3.3.3. *Place name*

**creé las dummies sin drop_first para poder hacer el join**

In [212]:
df_pn = pd.get_dummies(data_dum_2["place_name"], prefix = "pn", drop_first = True)

In [213]:
data_dum_3 = data_dum_2.join(df_pn)
data_dum_3.columns
data_dum_3.shape

Index(['property_type', 'state_name', 'place_name', 'place_with_parent_names',
       'price_aprox_usd', 'surface_total_in_m2', 'surface_covered_in_m2',
       'price_usd_per_m2', 'rooms', 'description',
       ...
       'pn_Villa de Mayo', 'pn_Villa del Parque',
       'pn_Village Golf & Tennis Country Club', 'pn_Virasoro Village',
       'pn_Virrey del Pino', 'pn_Virreyes', 'pn_Wilde', 'pn_William Morris',
       'pn_Zelaya', 'pn_coordenadas 34.255511'],
      dtype='object', length=482)

(42717, 482)

### 3.3.4. Dropeo de columnas innecesarias y una de cada *dummy*

In [217]:
data = data_dum_3.drop(columns = ['state_name', 'place_name', 'place_with_parent_names', 'description', 'title', 'properati_url'])
data.shape
list(data.columns)
data.head(5)

(42717, 476)

['property_type',
 'price_aprox_usd',
 'surface_total_in_m2',
 'surface_covered_in_m2',
 'price_usd_per_m2',
 'rooms',
 'am_garage',
 'am_balcon',
 'am_parrilla',
 'am_piscina',
 'am_terraza',
 'am_patio',
 'am_jardin',
 'am_quincho',
 'am_s.u.m.',
 'am_amenities',
 'am_baulera',
 'am_gimnasio',
 'am_subte-linea-d',
 'am_subte-linea-b',
 'am_subte-linea-a',
 'am_subte-linea-h',
 'am_subte-linea-e',
 'sn_Bs.As. G.B.A. Zona Oeste',
 'sn_Bs.As. G.B.A. Zona Sur',
 'sn_Capital Federal',
 'pn_ los alamos',
 'pn_9 de Abril',
 'pn_Abasto',
 'pn_Abril Club de Campo',
 'pn_Acacias Blancas',
 'pn_Acassuso',
 'pn_Adrogué',
 'pn_Aeropuerto Internacional Ezeiza',
 'pn_Agronomía',
 'pn_Albanueva Barrio Cerrado',
 'pn_Aldo Bonzi',
 'pn_Alejandro Korn',
 'pn_Almagro',
 'pn_Almirante Brown',
 'pn_Altamira',
 'pn_Altos de Hudson II',
 'pn_Altos de Manzanares 1 y 2',
 'pn_Altos de Matheu',
 'pn_Altos del Golf',
 'pn_Altos del Pilar',
 'pn_Armenia Country Club',
 'pn_Avellaneda',
 'pn_Bahía del Sol',
 'pn_

Unnamed: 0,property_type,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,price_usd_per_m2,rooms,am_garage,am_balcon,am_parrilla,am_piscina,...,pn_Villa de Mayo,pn_Villa del Parque,pn_Village Golf & Tennis Country Club,pn_Virasoro Village,pn_Virrey del Pino,pn_Virreyes,pn_Wilde,pn_William Morris,pn_Zelaya,pn_coordenadas 34.255511
0,PH,62000.0,55.0,40.0,1127.272727,2.0,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2,apartment,72000.0,55.0,55.0,1309.090909,2.0,True,False,False,False,...,0,0,0,0,0,0,0,0,0,0
6,PH,130000.0,106.0,78.0,1226.415094,2.0,False,False,False,True,...,0,0,0,0,0,0,0,0,0,0
7,apartment,138000.0,45.0,40.0,3066.666667,1.0,False,False,False,True,...,0,0,0,0,0,0,0,0,0,0
8,apartment,195000.0,65.0,60.0,3000.0,2.0,False,False,True,False,...,0,0,0,0,0,0,0,0,0,0


# **4. Exportación del dataset resultante a un nuevo *.csv***

In [215]:
# para exportar resultados
data.to_csv(r'../data/properatti_tp2.csv', index = False, header=True)

AttributeError: 'NoneType' object has no attribute 'to_csv'

In [None]:
# para usar el nuevo dataset desde el archivo
# data = pd.read_csv("../data/properatti_tp2.csv", sep = ",", low_memory=False) 
# data.head(3)
# data.shape