## Campos involucrados

- titulo
- descripcion

## Ideas

- wordcloud
- normalizacion
- stemming
- palabras positivas (respecto al precio)
- palabras negativas (respecto al precio)

## Hipótesis

- ciertas palabras indican mayor precio (luminoso, jardín, hermoso, vista...)
- a más palabras, mayor precio

## Resultados
- la correlacion entre longitud de descripcion y precio es bastante baja (0.1)

In [1]:
#importo las funciones para levantar los dataframes
%run "../../utils/dataset_parsing.ipynb"
#importo las funciones para graficar
%run "../../utils/graphs.ipynb"
df = levantar_datos("../../"+DATASET_RELATIVE_PATH)
df.columns
pd.set_option("display.max_colwidth", -1)

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'mes', 'ano', 'dia'],
      dtype='object')

In [2]:
import nltk  
from nltk.corpus import stopwords  
from string import punctuation  

In [3]:
spanish_stopwords = set(stopwords.words('spanish'))
non_words = set(punctuation)
non_words.update({'¿', '¡'})
non_words.update(map(str,range(10)))

In [179]:
import re
from unidecode import unidecode

def is_meaningful(word: str) -> bool:
    """
        Recibe una palabra, remueve puntuaciones y verifica que lo que queda no esté en el set de stopwords
    """
    return len(word) > 2 and not word in spanish_stopwords

def remove_html(field: str) -> str:
    """
        Recibe un texto y devuelve una copia sin los tags html
    """
    return re.compile(r'<[^>]+>').sub('', field) if field else field

def normalize(field: str) -> str:
    """
        Recibe un texto y devuelve una copia sin acentos, ñ ni puntuaciones.
    """
    return ''.join([" " if c in non_words else unidecode(c) for c in field]).strip() if field else ""

def limpiar_campo(field: str) -> str:
    """
        Recibe un campo string que podría tener muchas palabras.
        Devuelve un string que contiene sólo las palabras significativas.
    """
    if not isinstance(field,str): return ""
    without_html = remove_html(field)
    normalized = normalize(without_html)
    meaningful = " ".join(set(filter(is_meaningful, normalized.split())))
    return meaningful

In [182]:
df["descripcion_limpia"] = df["descripcion"].map(limpiar_campo)
df["len_descripcion"] = df["descripcion_limpia"].map(lambda x: len(x.split()))

In [183]:
df["titulo_limpio"] = df["titulo"].map(limpiar_campo)
df["len_titulo"] = df["titulo_limpio"].map(lambda x: len(x.split()))

In [184]:
# df["titulo_limpio"].sample(10)
df["descripcion_limpia"].sample(10)

131504    hidroneumatico split completo recamaras cocina closet sala servicio espacio dos cuentan comparten agua cuarto republica casa medio integral techada lts ademas servicios cuenta gas antecomedor lavanderia alta planta baja excelente patio colonia luz mini cisterna comedor bano amplios                                                                                                                                                                                                                                                                                                  
129360    motor completo recamaras bano terreno cocina closet sala construccion servicio jardin bardas lavado tendido alacena desayunadora garage cuarto porton integral barra techada techado acabados terraza iluminacion amplia recamara modernos granito perimetrales preparacion acondicionado lavabo gavetas piscina alta planta baja doble autos area principal vestidor filtro plantas lujo comedor herreria aire vista  

In [185]:
df[["len_descripcion","precio"]].corr()

Unnamed: 0,len_descripcion,precio
len_descripcion,1.0,0.098862
precio,0.098862,1.0


In [148]:
from collections import Counter

def get_word_counter(series):
    """
        Faltaría analizar stemming
    """
    counter = Counter()
    for title in series.values:
        counter.update(set(title.split()))
    return counter

In [186]:
titulo_palabras = get_word_counter(df["titulo_limpio"])
descripcion_palabras = get_word_counter(df["descripcion_limpia"])

In [151]:
print(len(titulo_palabras),len(descripcion_palabras))

18578 72121


In [155]:
# titulo_palabras.most_common(10)

In [96]:
# descripcion_palabras.most_common(10)

In [156]:
palabras_positivas = {"conservacion","tenis","balcon","panoramica","exclusivos","golf","canchas","remodelada","acondicionado","lujo","jacuzzi","diseno","exclusiva","magnifica","exclusivo","country","precioso","estilo","seguridad","verdes","juegos","servicio","excelente","terraza","jardin","hermosa","vista","bonita","renta", "granito"}
palabras_negativas = {"oportunidad","remato","oferta","remodelar"}

In [188]:
df["palabras_positivas_descripcion"] = df["descripcion_limpia"].map(lambda x: len([y for y in x.split() if y in palabras_positivas]))
df[["palabras_positivas_descripcion","precio"]].corr()

Unnamed: 0,palabras_positivas_descripcion,precio
palabras_positivas_descripcion,1.0,0.338492
precio,0.338492,1.0


In [189]:
df["palabras_negativas_descripcion"] = df["descripcion_limpia"].map(lambda x: len([y for y in x.split() if y in palabras_negativas]))
df[["palabras_negativas_descripcion","precio"]].corr()

Unnamed: 0,palabras_negativas_descripcion,precio
palabras_negativas_descripcion,1.0,-0.026244
precio,-0.026244,1.0


In [196]:
df.palabras_positivas_descripcion.value_counts()

1     64744
0     59049
2     46790
3     30454
4     18382
5     10290
6     5407 
7     2635 
8     1241 
9     571  
10    221  
11    98   
12    58   
14    31   
13    20   
15    8    
16    1    
Name: palabras_positivas_descripcion, dtype: int64

In [199]:
# df.loc[df.palabras_positivas_descripcion > 14]["descripcion"]

In [203]:
df.loc[df.palabras_negativas_descripcion > 2]["descripcion"]

6791      <p>residencia para remodelar. excelente ubicación frente a parque se incluyen planos de propuesta de remodelacion. oferta única en el mercado. casa habitación de tres recamaras con baño cada una. amplios espacios y terrazas. alberca de 6x3, cuarto de estudios. jardín, cochera para tres autos. oportunidad para inversionistas o desarrolladores.</p>                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
108031    <table class=adviewcontent cellpadding=0>\r\n<tbody>\r\n<tr>\r\n<td valign=top>remato casa en claveria, para remodelar, excelente oportunidad actualmente la construccion cuenta co

In [214]:
df_corr_positivas = df[["descripcion_limpia","precio"]]
for palabra in palabras_positivas:
    df_corr_positivas[palabra] = df_corr_positivas["descripcion_limpia"].map(lambda x: int(palabra in x))
df_corr_positivas.corr()["precio"].sort_values(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


precio           1.000000
terraza          0.223338
jardin           0.208592
vista            0.194534
jacuzzi          0.157660
lujo             0.155200
juegos           0.144366
servicio         0.132612
granito          0.102580
estilo           0.099340
hermosa          0.097818
exclusivo        0.091339
panoramica       0.087759
precioso         0.086908
balcon           0.081591
tenis            0.073820
excelente        0.065813
golf             0.064149
seguridad        0.060509
exclusiva        0.055698
diseno           0.052607
country          0.051681
remodelada       0.051611
exclusivos       0.040721
magnifica        0.039993
conservacion     0.028719
renta            0.026840
acondicionado    0.022942
canchas          0.021226
verdes          -0.022241
bonita          -0.040270
Name: precio, dtype: float64

In [215]:
df_corr_negativas = df[["descripcion_limpia","precio"]]
for palabra in palabras_negativas:
    df_corr_negativas[palabra] = df_corr_negativas["descripcion_limpia"].map(lambda x: int(palabra in x))
df_corr_negativas.corr()["precio"].sort_values(ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


oportunidad   -0.042807
oferta        -0.031834
remato        -0.017778
remodelar      0.049695
precio         1.000000
Name: precio, dtype: float64

In [216]:
test = df[["descripcion_limpia","precio","metrostotales"]]
for palabra in palabras_positivas:
    test[palabra] = test["descripcion_limpia"].map(lambda x: int(palabra in x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


precio           1.000000
metrostotales    0.514411
terraza          0.223338
jardin           0.208592
vista            0.194534
jacuzzi          0.157660
lujo             0.155200
juegos           0.144366
servicio         0.132612
granito          0.102580
estilo           0.099340
hermosa          0.097818
exclusivo        0.091339
panoramica       0.087759
precioso         0.086908
balcon           0.081591
tenis            0.073820
excelente        0.065813
golf             0.064149
seguridad        0.060509
exclusiva        0.055698
diseno           0.052607
country          0.051681
remodelada       0.051611
exclusivos       0.040721
magnifica        0.039993
conservacion     0.028719
renta            0.026840
acondicionado    0.022942
canchas          0.021226
verdes          -0.022241
bonita          -0.040270
Name: precio, dtype: float64

metrostotales    1.000000
precio           0.514411
jardin           0.234522
terraza          0.184700
vista            0.124387
granito          0.109006
jacuzzi          0.105397
hermosa          0.102505
servicio         0.087094
lujo             0.077831
estilo           0.074213
juegos           0.065640
panoramica       0.057353
tenis            0.055924
golf             0.055919
acondicionado    0.052472
exclusivo        0.044286
exclusiva        0.042953
country          0.037417
diseno           0.030468
magnifica        0.027192
excelente        0.025536
canchas          0.015788
exclusivos       0.012004
remodelada       0.010258
precioso         0.006164
balcon           0.002815
renta            0.002466
seguridad        0.001984
conservacion    -0.005672
verdes          -0.031609
bonita          -0.036421
Name: metrostotales, dtype: float64

In [239]:
top = list(set(test.corr()["metrostotales"].sort_values(ascending=False).head(8).index).union(set(test.corr()["precio"].sort_values(ascending=False).head(8).index)))

In [244]:
test_corr = test[top].corr()
test_corr["dif"] = test_corr["precio"] - test_corr["metrostotales"]
test_corr["dif"] = abs(test_corr["dif"])

In [246]:
test_corr["dif"].sort_values(ascending=False)
#estas se me ocurre que serian las palabras que mayor diferencia podrian hacer

precio           0.485589
metrostotales    0.485589
juegos           0.078725
lujo             0.077370
vista            0.070146
jacuzzi          0.052263
terraza          0.038639
jardin           0.025931
granito          0.006427
hermosa          0.004686
Name: dif, dtype: float64

In [247]:
test_corr

Unnamed: 0,metrostotales,hermosa,jacuzzi,juegos,granito,terraza,jardin,precio,lujo,vista,dif
metrostotales,1.0,0.102505,0.105397,0.06564,0.109006,0.1847,0.234522,0.514411,0.077831,0.124387,0.485589
hermosa,0.102505,1.0,0.047128,0.054155,0.080282,0.096229,0.122408,0.097818,0.067182,0.156207,0.004686
jacuzzi,0.105397,0.047128,1.0,0.109528,0.040159,0.120713,0.102968,0.15766,0.066057,0.094428,0.052263
juegos,0.06564,0.054155,0.109528,1.0,0.055081,0.11185,0.146598,0.144366,0.046619,0.101708,0.078725
granito,0.109006,0.080282,0.040159,0.055081,1.0,0.163145,0.148632,0.10258,0.164585,0.094475,0.006427
terraza,0.1847,0.096229,0.120713,0.11185,0.163145,1.0,0.207999,0.223338,0.1131,0.191207,0.038639
jardin,0.234522,0.122408,0.102968,0.146598,0.148632,0.207999,1.0,0.208592,0.07113,0.124513,0.025931
precio,0.514411,0.097818,0.15766,0.144366,0.10258,0.223338,0.208592,1.0,0.1552,0.194534,0.485589
lujo,0.077831,0.067182,0.066057,0.046619,0.164585,0.1131,0.07113,0.1552,1.0,0.107811,0.07737
vista,0.124387,0.156207,0.094428,0.101708,0.094475,0.191207,0.124513,0.194534,0.107811,1.0,0.070146
