### El objetivo de esta notebook es obtener los datos (principalmente ubicaciones y calidad) de los hoteles o similares 

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the raw tourist-accommodation dataset and preview three random rows.
alojamientos_turisticos = pd.read_csv(filepath_or_buffer='../Data/alojamientos-turisticos.csv')
alojamientos_turisticos.sample(n=3)

Unnamed: 0,long,lat,num_registro,periodo,categoria,establecimiento,telefono,mail,calle,calle_nro,barrio,comuna,codigo_postal,codigo_postal_argentino
355,-58.383854,-34.603917,347,2019,Hosp. B,BAHIA 2,(54 11) 4382 1780,,CORRIENTES,1212,San Nicolas,Comuna 1,1043.0,C1043AAZ
358,-58.424608,-34.582644,337,2019,Hosp. B,BRISAS,4774-0485,hotelbrisasbaires@gmail.com,CHARCAS,4470,Palermo,Comuna 14,1425.0,C1425BNN
439,-58.389562,-34.588573,57 A/11,2019,REG DE PREST,MIO BUENOS AIRES,(54 11) 5295 8500,reservas@miobuenosaires.com,QUINTANA,465,Recoleta,Comuna 2,1129.0,C1129ABA


In [5]:
# Summarise column dtypes and non-null counts to spot missing data.
alojamientos_turisticos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   long                     500 non-null    float64
 1   lat                      500 non-null    float64
 2   num_registro             501 non-null    object 
 3   periodo                  501 non-null    int64  
 4   categoria                501 non-null    object 
 5   establecimiento          501 non-null    object 
 6   telefono                 501 non-null    object 
 7   mail                     373 non-null    object 
 8   calle                    501 non-null    object 
 9   calle_nro                501 non-null    int64  
 10  barrio                   492 non-null    object 
 11  comuna                   492 non-null    object 
 12  codigo_postal            491 non-null    float64
 13  codigo_postal_argentino  489 non-null    object 
dtypes: float64(3), int64(2), o

In [6]:
# Show how often each value occurs, column by column, to get a feel for the data.
for nombre_columna, valores in alojamientos_turisticos.items():
    print('Columna:', nombre_columna)
    print(valores.value_counts(), '\n\n')

Columna: long
-58.411678    3
-58.383854    2
-58.388996    2
-58.385757    2
-58.384393    2
             ..
-58.378188    1
-58.397793    1
-58.392003    1
-58.440001    1
-58.441088    1
Name: long, Length: 494, dtype: int64 


Columna: lat
-34.610956    3
-34.603917    2
-34.589997    2
-34.589394    2
-34.597223    2
             ..
-34.594176    1
-34.570201    1
-34.608715    1
-34.587134    1
-34.612066    1
Name: lat, Length: 491, dtype: int64 


Columna: num_registro
209    3
233    2
229    2
119    2
208    2
      ..
96     1
310    1
34     1
120    1
161    1
Name: num_registro, Length: 479, dtype: int64 


Columna: periodo
2019    501
Name: periodo, dtype: int64 


Columna: categoria
Hosp. A         99
Hosp. B         79
REG DE PREST    76
4*              66
2*              46
3*              46
1*              31
5*              19
APART           17
APART 2*        16
APART 3*         4
APART 1*         2
Name: categoria, dtype: int64 


Columna: establecimiento
RICH 

In [7]:
# Discard the columns we consider irrelevant (period, contact details, postal codes).
columns_at_todrop = [
    'periodo',
    'telefono',
    'mail',
    'codigo_postal',
    'codigo_postal_argentino',
]
alojamientos_turisticos_1 = alojamientos_turisticos.drop(labels=columns_at_todrop, axis='columns')

In [8]:
# Preview five random rows of the trimmed dataframe.
alojamientos_turisticos_1.sample(n=5)

Unnamed: 0,long,lat,num_registro,categoria,establecimiento,calle,calle_nro,barrio,comuna
277,-58.473749,-34.634911,214,Hosp. A,COSTA AZUL,ALBERDI,3327,Floresta,Comuna 10
255,-58.423232,-34.597154,284,Hosp. A,ARIES,GASCON,1147,Palermo,Comuna 14
131,-58.382202,-34.604724,23 A/10,4*,CAESAR PARK SILVER OBELISCO,CERRITO,328,San Nicolas,Comuna 1
100,-58.384381,-34.594209,9,3*,IMPALA,LIBERTAD,1215,Retiro,Comuna 1
209,-58.455561,-34.56354,5020,APART,APART HOTEL CABILDO SUITES (ex CABILDO SUITES ...,CABILDO,1950,Belgrano,Comuna 13


*¿Cómo podemos combinar este df con el principal?*

In [9]:
# Create the column meant to be shared with the main dataframe: one (lat, long) tuple per row.
# NOTE(review): info() on the raw data reports 500 non-null lat/long values out of 501 rows,
# so at least one tuple built here contains NaN — confirm how that row should be handled.
alojamientos_turisticos_1['lat-lon'] = list(
    zip(alojamientos_turisticos_1['lat'], alojamientos_turisticos_1['long'])
)
alojamientos_turisticos_1.sample(n=3)

Unnamed: 0,long,lat,num_registro,categoria,establecimiento,calle,calle_nro,barrio,comuna,lat-lon
10,-58.395648,-34.610601,70 A/12,1*,DEL CONGRESO,H. YRIGOYEN,2064,Balvanera,Comuna 3,"(-34.610601, -58.395648)"
67,-58.424967,-34.578432,15,2*,PALERMO HOTEL,GODOY CRUZ,2709,Palermo,Comuna 14,"(-34.578432, -58.424967)"
17,-58.407147,-34.612066,137,1*,HISPANO ARGENTINO,CATAMARCA,167,Balvanera,Comuna 3,"(-34.612066, -58.407146999999995)"


In [10]:
# Keep only the most important columns: the coordinate key and the category.
alojamientos_turisticos_2 = alojamientos_turisticos_1.loc[
    :, ['lat-lon', 'categoria']
]
alojamientos_turisticos_2.sample(n=3)

Unnamed: 0,lat-lon,categoria
49,"(-34.605734000000005, -58.383567000000006)",2*
342,"(-34.609266, -58.379213)",Hosp. A
32,"(-34.609428, -58.414803000000006)",2*


In [11]:
# Sanity-check the reduced dataframe for anything odd.
alojamientos_turisticos_2.info()

# info() cannot see missing coordinates: a tuple such as (nan, nan) is itself a
# non-null object, so 'lat-lon' is reported as fully populated even though the raw
# lat/long columns had missing values. Count those rows explicitly.
mask_coordenadas_faltantes = alojamientos_turisticos_2['lat-lon'].map(
    lambda coordenadas: any(pd.isna(valor) for valor in coordenadas)
)
print('Cantidad de observaciones con coordenadas faltantes:', mask_coordenadas_faltantes.sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lat-lon    501 non-null    object
 1   categoria  501 non-null    object
dtypes: object(2)
memory usage: 8.0+ KB


In [12]:
# Any duplicated values? We particularly care that no coordinates are repeated.
coordenadas = alojamientos_turisticos_2['lat-lon']
coordenadas.duplicated().value_counts()

False    497
True       4
Name: lat-lon, dtype: int64

In [13]:
# Look at the rows flagged as repeats of an earlier coordinate pair.
# duplicated() already returns a boolean Series, so it is used as the mask directly
# (comparing it with `== True` was redundant).
mask_alojamientos_duplicated = alojamientos_turisticos_2['lat-lon'].duplicated()
alojamientos_turisticos_2.loc[mask_alojamientos_duplicated]

Unnamed: 0,lat-lon,categoria
168,"(-34.604028, -58.385757)",4*
276,"(-34.610956, -58.411678)",Hosp. A
346,"(-34.610956, -58.411678)",Hosp. B
355,"(-34.603916999999996, -58.38385400000001)",Hosp. B


In [14]:
# Select every row that shares its coordinates with another row (first occurrences
# included). keep=False marks all members of each duplicated group, which avoids
# hard-coding float coordinate literals and comparing them for exact equality.
mask_alojamientos_duplicated_c = alojamientos_turisticos_2['lat-lon'].duplicated(keep=False)
alojamientos_turisticos_2.loc[mask_alojamientos_duplicated_c]

Unnamed: 0,lat-lon,categoria
96,"(-34.604028, -58.385757)",3*
168,"(-34.604028, -58.385757)",4*
252,"(-34.610956, -58.411678)",Hosp. A
276,"(-34.610956, -58.411678)",Hosp. A
346,"(-34.610956, -58.411678)",Hosp. B
354,"(-34.603916999999996, -58.38385400000001)",Hosp. B
355,"(-34.603916999999996, -58.38385400000001)",Hosp. B


In [15]:
# Inspect the same rows in the original dataframe, which still has every column.
# The rows are selected through the duplicated-coordinates mask (aligned on the shared
# index labels) instead of a hard-coded list of positions passed to iloc.
alojamientos_turisticos.loc[mask_alojamientos_duplicated_c]

Unnamed: 0,long,lat,num_registro,periodo,categoria,establecimiento,telefono,mail,calle,calle_nro,barrio,comuna,codigo_postal,codigo_postal_argentino
96,-58.385757,-34.604028,49 A/11,2019,3*,HOTEL IBIS BUENOS AIRES OBELISCO,(54 11) 4370-9300,h6502-dm1@accor.com,CORRIENTES,1344,San Nicolas,Comuna 1,1043.0,C1043ABN
168,-58.385757,-34.604028,48 A/11,2019,4*,NOVOTEL,(54 11) 4370-9300 / 9500,h6503-re@accor.com,CORRIENTES,1334,San Nicolas,Comuna 1,1043.0,C1043ABN
252,-58.411678,-34.610956,240,2019,Hosp. A,ALSINA,(54 11) 4931 9393,,24 DE NOVIEMBRE,24,Balvanera,Comuna 3,1170.0,C1170AAB
276,-58.411678,-34.610956,823,2019,Hosp. A,COLONIAL,(54 11) 4931 8800/3104,info@hotelcolonialbaires.com.ar,24 DE NOVIEMBRE,24,Balvanera,Comuna 3,1170.0,C1170AAB
346,-58.411678,-34.610956,336,2019,Hosp. B,2018-11-24 00:00:00,4931-4661,,24 DE NOVIEMBRE,24,Balvanera,Comuna 3,1170.0,C1170AAB
354,-58.383854,-34.603917,346,2019,Hosp. B,BAHIA 1,4382-1780,anelluz@hotmail.com arhotelbahia@gmail.com,CORRIENTES,1212,San Nicolas,Comuna 1,1043.0,C1043AAZ
355,-58.383854,-34.603917,347,2019,Hosp. B,BAHIA 2,(54 11) 4382 1780,,CORRIENTES,1212,San Nicolas,Comuna 1,1043.0,C1043AAZ


In [16]:
# We do not know why these observations are duplicated.
# We looked up online which establishment is actually at each address and remove the
# rows that do not appear there.
indices_a_eliminar = [96, 252, 346, 355]
alojamientos_turisticos_3 = alojamientos_turisticos_2.drop(index=indices_a_eliminar)

# The labels above were chosen by hand; fail loudly if they no longer leave exactly one
# row per coordinate pair (e.g. because the source file changed).
assert not alojamientos_turisticos_3['lat-lon'].duplicated().any(), \
    'Quedan coordenadas duplicadas en alojamientos_turisticos_3'

print('Cantidad de observaciones antes de eliminar:', alojamientos_turisticos_2.shape[0])
print('Cantidad de observaciones después de eliminar:', alojamientos_turisticos_3.shape[0])

Cantidad de observaciones antes de eliminar: 501
Cantidad de observaciones después de eliminar: 497


# Dataframe listo para pasar: "alojamientos_turisticos_3"

In [17]:
# Display the final dataframe (pandas shows a truncated view of long frames).
alojamientos_turisticos_3

Unnamed: 0,lat-lon,categoria
0,"(-34.572421999999996, -58.430627)",1*
1,"(-34.596496, -58.394791000000005)",1*
2,"(-34.609331, -58.385505)",1*
3,"(-34.615128000000006, -58.376078)",1*
4,"(-34.617901, -58.374441000000004)",1*
...,...,...
496,"(-34.616498, -58.375932999999996)",REG DE PREST
497,"(-34.610481, -58.36198100000001)",REG DE PREST
498,"(-34.587358, -58.429186)",REG DE PREST
499,"(-34.61592, -58.381653)",REG DE PREST


In [18]:
# Final structure check of the dataframe to be handed over.
alojamientos_turisticos_3.info()
# NOTE(review): the original "obj" remark presumably refers to both columns being object dtype — confirm.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 0 to 500
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lat-lon    497 non-null    object
 1   categoria  497 non-null    object
dtypes: object(2)
memory usage: 11.6+ KB
