In [24]:
import pandas as pd
import glob
import os
import numpy as np
import re

In [None]:
def merge_csv_files(directory_path, output_filename):
    """Merge every ``olx_properties_*.csv`` file in a directory into one CSV.

    Each matching file is read with ``pd.read_csv``, tagged with a
    ``source_file`` column holding its base name, and the frames are
    concatenated (index reset) in sorted filename order. Files that cannot be
    read are reported and skipped rather than aborting the merge.

    Args:
        directory_path: Directory searched (non-recursively) for input files.
        output_filename: Path of the merged CSV to write (without the index).

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Create a pattern to match all CSV files with the olx_properties prefix
    pattern = os.path.join(directory_path, "olx_properties_*.csv")

    # glob returns matches in arbitrary order; sort them so the row order of
    # the merged file is the same on every run and every machine.
    csv_files = sorted(glob.glob(pattern))

    # Check if any files were found
    if not csv_files:
        print("No CSV files found matching the pattern!")
        return

    # Read every file, keeping only the ones that load successfully
    dfs = []
    for csv_file in csv_files:
        file_name = os.path.basename(csv_file)
        try:
            df = pd.read_csv(csv_file)
            # Record which file each row came from
            df['source_file'] = file_name
            dfs.append(df)
            print(f"Successfully read: {file_name}")
        except Exception as e:
            # Best effort: report the unreadable file and carry on
            print(f"Error reading {csv_file}: {str(e)}")

    if not dfs:
        print("No data frames were created. Check if files are readable.")
        return

    # Combine all dataframes
    combined_df = pd.concat(dfs, ignore_index=True)

    # Save the combined dataframe
    try:
        combined_df.to_csv(output_filename, index=False)
    except Exception as e:
        print(f"Error saving merged file: {str(e)}")
    else:
        # Report the number of files actually merged (len(dfs)), not the
        # number discovered, since unreadable files were skipped above.
        print(f"\nSuccessfully merged {len(dfs)} files into {output_filename}")
        print(f"Total rows in merged file: {len(combined_df)}")

In [None]:
if __name__ == "__main__":
    # Input folder holding the scraped olx_properties_*.csv files, and the
    # name of the merged CSV (relative to the current working directory).
    directory_path, output_file = "./scraped_properties", "merged_properties.csv"

    merge_csv_files(directory_path=directory_path, output_filename=output_file)

Successfully read: olx_properties_ac_20241102_104112.csv
Successfully read: olx_properties_al_20241102_105025.csv
Successfully read: olx_properties_am_20241102_110146.csv
Successfully read: olx_properties_ap_20241102_110627.csv
Successfully read: olx_properties_ba_20241102_111952.csv
Successfully read: olx_properties_ce_20241102_113708.csv
Successfully read: olx_properties_df_20241102_115813.csv
Successfully read: olx_properties_es_20241102_120831.csv
Successfully read: olx_properties_go_20241102_122109.csv
Successfully read: olx_properties_ma_20241102_123616.csv
Successfully read: olx_properties_mg_20241102_125944.csv
Successfully read: olx_properties_ms_20241102_131736.csv
Successfully read: olx_properties_mt_20241102_132652.csv
Successfully read: olx_properties_pa_20241102_134044.csv
Successfully read: olx_properties_pb_20241102_135824.csv
Successfully read: olx_properties_pe_20241102_141754.csv
Successfully read: olx_properties_pi_20241102_143457.csv
Successfully read: olx_properti

In [6]:
# Load the merged dataset. The merge cell writes 'merged_properties.csv'
# relative to the working directory, so read it back the same way instead of
# through an absolute, user-specific path that only exists on one machine.
# NOTE(review): assumes the notebook is run from the project directory.
df = pd.read_csv('merged_properties.csv')

In [12]:
def extract_cep(text):
    """Extract a Brazilian postal code (CEP) from a free-text location.

    Recognises a standalone run of exactly eight digits, written either plain
    (``22790861``) or with the usual punctuation (``22790-861``,
    ``22.790-861``). Other numbers in the text (for example a street number)
    do not prevent the CEP from being found.

    Args:
        text: Location text, or a missing value (NaN/None).

    Returns:
        The first CEP found as an 8-character digit string, or ``np.nan`` when
        ``text`` is missing or contains no CEP.
    """
    if pd.isna(text):
        return np.nan
    # The lookarounds require the match to be a whole number on its own, so a
    # slice of a longer digit run is never mistaken for a CEP.
    match = re.search(r'(?<!\d)(\d{2})\.?(\d{3})-?(\d{3})(?!\d)', str(text))
    # Join the captured groups to drop any '.'/'-' punctuation.
    return ''.join(match.groups()) if match else np.nan

In [None]:
def extract_state(text):
    """Find a Brazilian state abbreviation (UF) in a free-text location.

    The text is split into tokens on whitespace and common separators, and
    each token is compared case-insensitively against the 27 state codes. The
    search runs from the end of the text, so the trailing state wins over a
    two-letter word earlier in the string that merely looks like a state code
    (e.g. "Al Santos, São Paulo, SP" yields "SP").

    Args:
        text: Location text, or a missing value (NaN/None).

    Returns:
        The upper-case two-letter state code, or ``np.nan`` when ``text`` is
        missing or contains no state code.
    """
    if pd.isna(text):
        return np.nan

    # List of valid state codes
    valid_states = {'AC', 'AL', 'AP', 'AM', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA',
                    'MT', 'MS', 'MG', 'PA', 'PB', 'PR', 'PE', 'PI', 'RJ', 'RN',
                    'RS', 'RO', 'RR', 'SC', 'SP', 'SE', 'TO'}

    # Split on whitespace and common separators (comma, semicolon, slash,
    # pipe, parentheses, hyphen) so forms like "Natal/RN" are tokenised too.
    tokens = re.split(r'[\s,;/|()\-]+', str(text))

    # Scan right to left and return the last token that is a state code.
    for token in reversed(tokens):
        code = token.upper()
        if code in valid_states:
            return code

    return np.nan

In [15]:
def parse_location(location_str):
    """Split a raw location string into neighborhood, city, state and CEP.

    The state and CEP are pulled out with ``extract_state`` and
    ``extract_cep``. The CEP is then removed and the remaining text is split
    on commas:

    * three or more parts -> first is the neighborhood, second is the city;
    * two parts -> "City, State" when the second part holds a state code,
      otherwise "Neighborhood, City";
    * one part -> the city, unless that part is just a state code.

    Args:
        location_str: Raw location text, or a missing value (NaN/None).

    Returns:
        A ``pd.Series`` with keys ``neighborhood``, ``city``, ``state`` and
        ``cep``; fields that could not be determined are ``np.nan``.
    """
    if pd.isna(location_str):
        return pd.Series({'neighborhood': np.nan,
                          'city': np.nan,
                          'state': np.nan,
                          'cep': np.nan})

    # Initialize variables
    neighborhood = np.nan
    city = np.nan
    state = extract_state(location_str)
    cep = extract_cep(location_str)

    # The helpers signal "not found" with NaN, never None, and NaN is truthy.
    # Test with pd.notna/pd.isna so a missing CEP does not cause the literal
    # text "nan" to be stripped out of names such as "Ananindeua".
    if pd.notna(cep):
        # Remove the first CEP, including an optionally punctuated form like
        # "57036-850", so it does not count as a location part below.
        location_str = re.sub(r'(?<!\d)\d{2}\.?\d{3}-?\d{3}(?!\d)', '',
                              location_str, count=1)

    # Split by comma, discarding empty fragments
    parts = [p.strip() for p in location_str.split(',') if p.strip()]

    if len(parts) >= 3:
        # Case: Neighborhood, City, State
        neighborhood = parts[0]
        city = parts[1]
    elif len(parts) == 2:
        # Case: City, State or Neighborhood, City
        if pd.notna(extract_state(parts[1])):
            city = parts[0]
        else:
            neighborhood = parts[0]
            city = parts[1]
    elif len(parts) == 1:
        # Case: only one part; it is the city unless it is just a state code
        if pd.isna(extract_state(parts[0])):
            city = parts[0]

    return pd.Series({
        'neighborhood': neighborhood,
        'city': city,
        'state': state,
        'cep': cep
    })

In [16]:
# Parse every raw location string. parse_location returns a pd.Series, so
# apply() expands the results into one column per parsed field.
parsed_locations = df.loc[:, 'location'].apply(parse_location)


In [19]:
# Swap the raw 'location' column for the parsed fields, side by side.
# NOTE(review): the sample output below lists both `state` and `state.1`,
# which suggests df already carries a `state` column, so df_clean ends up with
# two columns labelled `state` — confirm whether one should be renamed/dropped.
df_clean = pd.concat(
    [df.drop(columns='location'), parsed_locations],
    axis='columns',
)


In [20]:
# Spot-check a random handful of rows. The fixed seed makes the displayed
# sample identical on every Restart & Run All.
df_clean.sample(15, random_state=42)

Unnamed: 0,price,area_util,quartos,banheiros,vagas,url,scraped_date,state,source_file,neighborhood,city,state.1,cep
27717,R$ 310.000,60m²,2,1,1,https://mt.olx.com.br/regiao-de-cuiaba/imoveis...,2024-11-02 13:24:53,mt,olx_properties_mt_20241102_132652.csv,,Lucas do Rio Verde,MT,78455000
52926,R$ 580.000,70m²,2,3,1,https://sc.olx.com.br/norte-de-santa-catarina/...,2024-11-02 16:21:58,sc,olx_properties_sc_20241102_163120.csv,,,SC,88220000
47582,R$ 300.000,95m²,2,2,2,https://rr.olx.com.br/roraima/imoveis/vendo-ca...,2024-11-02 15:41:27,rr,olx_properties_rr_20241102_155819.csv,,Boa Vista,RR,69312369
41161,R$ 2.600.000,500m²,5 Ou Mais,5 Ou Mais,5 Ou Mais,https://rj.olx.com.br/rio-de-janeiro-e-regiao/...,2024-11-02 14:51:02,rj,olx_properties_rj_20241102_145935.csv,Recreio dos Bandeirantes,Rio de Janeiro,RJ,22790861
5590,R$ 3.900.000,296m²,3,5 Ou Mais,3,https://am.olx.com.br/regiao-de-manaus/imoveis...,2024-11-02 11:01:01,am,olx_properties_am_20241102_110146.csv,Ponta Negra,Manaus,AM,69037000
3279,R$ 580.000,56m²,,1,1,https://al.olx.com.br/alagoas/imoveis/apartame...,2024-11-02 10:50:23,al,olx_properties_al_20241102_105025.csv,Jatiúca,Maceió,AL,57036850
16893,R$ 397.000,114m²,3,2,2,https://go.olx.com.br/grande-goiania-e-anapoli...,2024-11-02 12:13:14,go,olx_properties_go_20241102_122109.csv,Parque Itatiaia,Aparecida de Goiânia,GO,74968730
43223,R$ 692.746,79m²,3,2,2,https://rn.olx.com.br/rio-grande-do-norte/imov...,2024-11-02 15:03:59,rn,olx_properties_rn_20241102_152235.csv,Ponta Negra,Natal,RN,59092500
30301,R$ 1.800.000,268m²,4,4,2,https://pa.olx.com.br/regiao-de-belem/imoveis/...,2024-11-02 13:38:24,pa,olx_properties_pa_20241102_134044.csv,Coqueiro,Belém,PA,66823060
58425,R$ 230.000,42m²,2,1,1,https://to.olx.com.br/tocantins/imoveis/aparta...,2024-11-02 16:59:18,to,olx_properties_to_20241102_170833.csv,Plano Diretor Norte,Palmas,TO,77006399


In [31]:
# Persist the cleaned table (without the index) in the working directory.
# NOTE(review): `cep` holds digit strings; a reader that does not pass
# dtype={'cep': str} will presumably parse them as numbers and lose any
# leading zeros — confirm how downstream code loads this file.
df_clean.to_csv(path_or_buf='cleaned_location_data.csv', index=False)