In [1]:
# Importamos las libreria para leer la data
import polars as pl

In [2]:
# Cargamos el archivo de metadata para los negocios
business = pl.read_csv('../../data/yelp/business.csv')

In [242]:
# Memoria ocupada
business.estimated_size(unit='mb')

106.26533603668213

In [3]:
# Numero de filas y columnas
business.shape

(150346, 14)

In [4]:
business.head()

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
str,str,str,str,str,str,f64,f64,f64,i64,i64,str,str,str
"""Pns2l4eNsfO8kk…","""Abby Rappoport…","""1616 Chapala S…","""Santa Barbara""",,"""93101""",34.426679,-119.711197,5.0,7,0,"""{'ByAppointmen…","""Doctors, Tradi…",
"""mpf3x-BjTdTEA3…","""The UPS Store""","""87 Grasso Plaz…","""Affton""",,"""63123""",38.551126,-90.335695,3.0,15,1,"""{'BusinessAcce…","""Shipping Cente…","""{'Monday': '0:…"
"""tUFrWirKiKi_TA…","""Target""","""5255 E Broadwa…","""Tucson""",,"""85711""",32.223236,-110.880452,3.5,22,0,"""{'BikeParking'…","""Department Sto…","""{'Monday': '8:…"
"""MTSW4McQd7CbVt…","""St Honore Past…","""935 Race St""","""Philadelphia""","""CA""","""19107""",39.955505,-75.155564,4.0,80,1,"""{'RestaurantsD…","""Restaurants, F…","""{'Monday': '7:…"
"""mWMc6_wTdE0EUB…","""Perkiomen Vall…","""101 Walnut St""","""Green Lane""","""MO""","""18054""",40.338183,-75.471659,4.5,13,1,"""{'BusinessAcce…","""Brewpubs, Brew…","""{'Wednesday': …"


**BUSINESS_ID**

In [26]:
# Eliminamos duplicados basandonos en business_id
business = business.filter(pl.col('business_id').is_unique())

In [29]:
# Nulos
business['business_id'].is_null().sum()

0

**NAME**

In [30]:
# Nulos en name
business['name'].is_null().sum()

0

In [32]:
# Convertimos a lowercase los nombres
business = business.with_columns(pl.col('name').apply(lambda value: value.lower()))

In [216]:
# Nombres unicos de empresas.
business['name'].unique().shape[0]

113670

**ADDRESS**

In [61]:
import re

In [198]:
def only_char_whitespace(text):
    try:
        words = re.sub('[^a-zA-Z\s]*', '', text.lower()).split()
        words = [word for word in words if len(word) > 2]
        if words:
            return ' '.join(words)
        else:
            return None
    except:
        return None

In [189]:
# Numero de empresas sin su direccion.
business['address'].is_null().sum()

5127

In [188]:
# Ejemplo de la funcion only_chat_whitespace
text = business['address'][7]
print('TEXT:', text)
print('NEW TEXT:', only_char_whitespace(text))

TEXT: 400 Pasadena Ave S
NEW TEXT: pasadena ave


In [199]:
# Creo address_v2 que consiste en remove el numero de la direccion de cada negocio ya que es un valor unico.
business = business.with_columns(pl.col('address').alias('address_v2').apply(only_char_whitespace))

In [200]:
# Numero de valores nulos para la nueva columna
business['address_v2'].is_null().sum()

14205

In [192]:
# Cantidad de valores unicos en la vieja address
business['address'].unique().shape[0]

122844

In [201]:
# Cantidad de valores unicos en la nueva address
business['address_v2'].unique().shape[0]

18488

**CITY**

In [99]:
# Numero de empresas sin su valor de city.
business['city'].is_null().sum()

0

In [103]:
# Numero de ciudades
business['city'].unique().shape[0]

1416

In [104]:
# Convertimos a lowercase
business = business.with_columns(pl.col('city').apply(lambda value: value.lower()))

**STATE**

In [110]:
# Que ciudades son a las que les falta el estado
business.filter(pl.col('state').is_null()).select('city')

city
str
"""santa barbara"""
"""affton"""
"""tucson"""


In [127]:
# Creamos un dataframe para poder remplazar los valores faltantes
df = business.to_pandas()

In [134]:
# Remplazamos los valores nulos
df.loc[0, 'state'] = 'CA'
df.loc[1, 'state'] = 'MO'
df.loc[2, 'state'] = 'AZ'

In [137]:
# Pasamos de pandas a polars
business = pl.from_dataframe(df)

In [138]:
# Convertimos a lowercase
business = business.with_columns(pl.col('state').apply(lambda value: value.lower()))

**POSTAL_CODE**

In [140]:
# Cantidad de nulos
business['postal_code'].null_count()

73

In [141]:
# Cantidad de valores unicos
business['postal_code'].unique().shape[0]

3362

**LATITUDE**

In [205]:
# Latitude debe ser mayor a 0
(business['latitude'] < 0).sum()

0

In [207]:
# Cantidad de nulos
business['latitude'].null_count()

0

In [208]:
# Cantidad de unicos
business['latitude'].unique().shape[0]

135593

**LONGITUDE**

In [206]:
# Longitude debe ser menor que 0
(business['longitude'] > 0).sum()

0

In [210]:
# Cantidad de nulos
business['longitude'].null_count()

0

In [211]:
# Cantidad de unicos
business['longitude'].unique().shape[0]

131918

In [219]:
# Consideramos duplicado aquel business en el mismo lugar y con el mismo nombre.
# Asumimos que pueden haber locales en un mismo edificio por lo que su posicion geografica es la misma.
# Pero no asumimos que mas de un mismo local este en el mismo lugar.
business = business.unique(subset=['name', 'latitude', 'longitude'])

**STARS**

In [16]:
# Valores minimos y maximos para stars. Deben estar entre 1 y 5.
business['stars'].min(), business['stars'].max()

(1.0, 5.0)

In [220]:
business['stars'].null_count()

0

In [222]:
# Observamos que stars no es entero.
business['stars'].value_counts()

stars,counts
f64,u32
5.0,16306
2.5,14310
3.0,18449
2.0,9523
3.5,26511
4.5,27176
1.0,1986
1.5,4928
4.0,31120


**REVIEW_COUNT**

In [224]:
# Valores faltantes
business['review_count'].null_count()

0

In [226]:
# Valores minimos y maximos. Naturalmente, el valor debe ser mayor o igual a 0.
business['review_count'].min(), business['review_count'].max()

(5, 7568)

**IS_OPEN**

In [229]:
# Valores unicos y distribucion
business['is_open'].value_counts()

is_open,counts
i64,u32
0,30635
1,119674


In [230]:
# Valores nulos
business['is_open'].null_count()

0

**ATTRIBUTES**

In [232]:
# Cantidad de valores nulos en attributes
business['attributes'].null_count()

13739

In [240]:
# Ejemplo de attibutes
eval(business['attributes'][1])

{'GoodForKids': 'True',
 'BusinessAcceptsCreditCards': 'True',
 'WheelchairAccessible': 'True',
 'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': True, 'valet': False}"}

Esta columna la vamos a mantener asi, en formato string, puesto que si no nuestro dataframe sera muy grande y con muchos valores nulos.

**CATEGORIES**

In [255]:
# Ejemplo de categories
business['categories'][0]

'Restaurants, Burgers'

In [257]:
# Convertimos a lowercase
business = business.with_columns(pl.col('categories').str.to_lowercase())

In [260]:
# Numero de categorias unicas
business['categories'].unique().shape[0]

83144

In [259]:
# Numero de valroes faltantes
business['categories'].null_count()

103

Mas adelante le aplicaremos un codigo a la columna categories para reducirla a un numero adecuado acorde a los objetivos de la experiencia del usuario, tal que enves de 83144 categorias pasemos a < 20.

Por otro lado, los valores faltantes tambien lo vamos a llenar con esta nueva distribucion que crearemos.

**HOURS**

In [262]:
# Numero de valores faltantes
business['hours'].null_count()

23215

In [285]:
# Ejemplo de hours
eval(business['hours'][1])

{'Monday': '10:0-15:0',
 'Tuesday': '9:30-14:0',
 'Wednesday': '10:0-18:0',
 'Thursday': '10:0-18:0',
 'Friday': '9:30-17:30',
 'Saturday': '9:30-17:30'}

Por la misma razon a attributes, esta columna la dejamos en formato string tal que podamos extraer sus valores con eval cuando es un diccionario.

**NUEVA COLUMNA ID**

In [289]:
# Creamos una nueva columna llamada company_index para mejorar la identificacion de un negocio.
business = business.with_row_count(offset=1, name='company_index')

In [290]:
business.head(5)

company_index,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,address_v2
u32,str,str,str,str,str,str,f64,f64,f64,i64,i64,str,str,str,str
1,"""2xVsWBNFwZOxIO…","""cheeseburger i…","""116 N Pottstow…","""exton""","""pa""","""19341""",40.029962,-75.630607,2.5,20,0,"""{'NoiseLevel':…","""restaurants, b…",,"""pottstown pike…"
2,"""LcAozWCMLGjwRb…","""edwardsville c…","""722 Holyoake R…","""edwardsville""","""pa""","""62025""",38.804395,-89.949733,4.5,12,1,"""{'GoodForKids'…","""museums, kids …","""{'Monday': '10…","""holyoake"""
3,"""xM6LoUcnpDpMBz…","""fairfield inn …","""719 E Baltimor…","""kennett square…","""ab""","""19348""",39.856248,-75.69461,3.0,37,1,"""{'BusinessAcce…","""hotels, hotels…",,"""baltimore pike…"
4,"""eYxGFkxo6m3SYG…","""big boyz toyz …","""4158 E Grant R…","""tucson""","""pa""","""85712""",32.250324,-110.903655,4.5,8,1,,"""towing, hotels…","""{'Monday': '8:…","""grant"""
5,"""cvP_vh_bOLbCY3…","""fishers freedo…","""1 Park Dr""","""fishers""","""ab""","""46038""",39.958734,-86.016966,4.5,7,0,"""{'GoodForKids'…","""festivals, art…",,"""park"""


In [291]:
# Almacenamos en formato parquet para reducir espacio.
business.write_parquet('../../data/yelp/production/business.parquet')