In [1]:
import pandas as pd
import numpy as np

In [2]:
hosts = pd.read_csv("../data/hosts.csv", sep=',')
reviews = pd.read_csv("../data/reviews.csv", sep=',')

In [3]:
rooms_path = "../data/rooms.csv"
rooms = pd.read_csv(
    rooms_path,
    sep=",",
    quotechar='"',
    encoding="utf-8",
    on_bad_lines="skip"
)

In [4]:
def resume_df(df):
    print(f"Shape: {df.shape}")
    print(f"NULOS")
    display(df.isnull().sum())
    print(f"DUPLICATAS")
    duplicated_values = df[df.duplicated()]
    print(f"Total de registros duplicados: {len(duplicated_values)}")

In [5]:
def analyse_categorical_variables(df):
    display(df.info())
    
    df.columns = df.columns.str.lower()
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        df[col] = df[col].str.strip().str.lower()
    for col in cat_cols:
        unique_categories = df[col].unique()
        print(f"\n{col} == {len(unique_categories)}")
        if len(unique_categories) < 20:
            print(unique_categories)

In [6]:
def analyse_numerical_variables(df):
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    for col in num_cols:
        invalid_values = df[~df[col].apply(lambda x: np.isreal(x)) & df[col].notnull()]
        if not invalid_values.empty:
            print(f"Inconsistências encontradas em {col}:")
            display(invalid_values)
    outliers_dict = {}
    for col in num_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        limits = (q1 - 1.5*iqr, q3 + 1.5*iqr)
        outliers = df[(df[col] < limits[0]) | (df[col] > limits[1])]
        print(f"{col}: {len(outliers)} outliers identificados.")
        outliers_dict[col] = len(outliers)

### Hosts

In [7]:
resume_df(hosts)

Shape: (37484, 2)
NULOS


host_id       0
host_name    18
dtype: int64

DUPLICATAS
Total de registros duplicados: 0


In [8]:
hosts = hosts.dropna()

In [9]:
analyse_categorical_variables(hosts)

<class 'pandas.core.frame.DataFrame'>
Index: 37466 entries, 0 to 37483
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   host_id    37466 non-null  object
 1   host_name  37466 non-null  object
dtypes: object(2)
memory usage: 878.1+ KB


None


host_id == 37461

host_name == 11399


In [10]:
# Corrigindo id
mask_hosts = pd.to_numeric(hosts['host_id'], errors='coerce').isna()
hosts = hosts[~mask_hosts].copy()
hosts['host_id'] = pd.to_numeric(hosts['host_id'])

### Reviews

In [11]:
resume_df(reviews)

Shape: (48875, 8)
NULOS


id                                    0
host_id                               0
price                                 0
number_of_reviews                    20
last_review                       10039
reviews_per_month                 10019
calculated_host_listings_count        0
availability_365                    156
dtype: int64

DUPLICATAS
Total de registros duplicados: 0


In [12]:
# Remover linhas com valores nulos em 'price' e 'availability_365'
reviews = reviews.dropna(subset=['price', 'availability_365'])

# Substituir valores nulos em 'last_review' e 'reviews_per_month' por -1
reviews['last_review'] = reviews['last_review'].fillna(-1)
reviews['reviews_per_month'] = reviews['reviews_per_month'].fillna(-1)

In [13]:
analyse_categorical_variables(reviews)

<class 'pandas.core.frame.DataFrame'>
Index: 48719 entries, 0 to 48874
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48719 non-null  object 
 1   host_id                         48719 non-null  object 
 2   price                           48719 non-null  int64  
 3   number_of_reviews               48719 non-null  object 
 4   last_review                     48719 non-null  object 
 5   reviews_per_month               48719 non-null  float64
 6   calculated_host_listings_count  48719 non-null  int64  
 7   availability_365                48719 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 3.3+ MB


None


id == 48719

host_id == 37347

number_of_reviews == 394

last_review == 1765


In [14]:
# number_of_reviews: object -> int
reviews['number_of_reviews'] = reviews['number_of_reviews'].astype(int)

# last_review: object -> datatime
reviews['last_review'] = pd.to_datetime(reviews['last_review'], errors='coerce')

In [15]:
# Corrigindo id
mask_reviews = pd.to_numeric(reviews['id'], errors='coerce').isna()
reviews = reviews[~mask_reviews].copy()
reviews['id'] = pd.to_numeric(reviews['id'])

# host_id: object -> int
reviews['host_id'] = reviews['host_id'].astype(int)

In [16]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48710 entries, 0 to 48874
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              48710 non-null  int64         
 1   host_id                         48710 non-null  int64         
 2   price                           48710 non-null  int64         
 3   number_of_reviews               48710 non-null  int64         
 4   last_review                     38694 non-null  datetime64[ns]
 5   reviews_per_month               48710 non-null  float64       
 6   calculated_host_listings_count  48710 non-null  int64         
 7   availability_365                48710 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(5)
memory usage: 3.3 MB


In [17]:
analyse_numerical_variables(reviews)

id: 0 outliers identificados.
host_id: 1514 outliers identificados.
price: 2962 outliers identificados.
number_of_reviews: 6098 outliers identificados.
reviews_per_month: 3288 outliers identificados.
calculated_host_listings_count: 7055 outliers identificados.
availability_365: 0 outliers identificados.


### Rooms

In [18]:
resume_df(rooms)

Shape: (48875, 8)
NULOS


id                     0
name                   0
neighbourhood          0
neighbourhood_group    0
latitude               0
longitude              0
room_type              0
minimum_nights         0
dtype: int64

DUPLICATAS
Total de registros duplicados: 0


In [19]:
analyse_categorical_variables(rooms)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48875 entries, 0 to 48874
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   48875 non-null  object 
 1   name                 48875 non-null  object 
 2   neighbourhood        48875 non-null  object 
 3   neighbourhood_group  48875 non-null  object 
 4   latitude             48875 non-null  float64
 5   longitude            48875 non-null  object 
 6   room_type            48875 non-null  object 
 7   minimum_nights       48875 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 3.0+ MB


None


id == 48862

name == 47437

neighbourhood == 377

neighbourhood_group == 68

longitude == 14686

room_type == 77


In [20]:
# Converter colunas para os tipos corretos
rooms['id'] = pd.to_numeric(rooms['id'], errors='coerce').astype('Int64')
rooms['latitude'] = pd.to_numeric(rooms['latitude'], errors='coerce')
rooms['longitude'] = pd.to_numeric(rooms['longitude'], errors='coerce')
rooms['minimum_nights'] = pd.to_numeric(rooms['minimum_nights'], errors='coerce').astype('Int64')

# Remover espaços extras em colunas de texto
for col in ['name', 'neighbourhood', 'neighbourhood_group', 'room_type']:
    rooms[col] = rooms[col].astype(str).str.strip()


### Export

In [21]:
hosts.to_csv("../data/clean_data/hosts_clean.csv", sep=",", index=False, encoding="utf-8")
reviews.to_csv("../data/clean_data/reviews_clean.csv", sep=",", index=False, encoding="utf-8")
rooms.to_csv("../data/clean_data/rooms_clean.csv", sep=",", index=False, encoding="utf-8")