In [1]:
import pandas as pd
import codecs
import numpy as np

In [2]:
def read_lines_with_len(filename, delimiter="|", encoding="iso-8859-1"):
    """Read a delimited text file and report how many fields each line has.

    The file is split on "\n" and every line is split on ``delimiter``; no
    quoting or escaping is interpreted. A trailing newline in the file
    therefore yields a final empty line with a single (empty) field.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    delimiter : str, default "|"
        Field separator used to split each line.
    encoding : str, default "iso-8859-1"
        Text encoding used to decode the file.

    Returns
    -------
    pandas.DataFrame
        One row per line with two columns: ``len`` (number of fields) and
        ``data`` (the list of field strings).
    """
    # Context manager guarantees the handle is closed even if decoding fails.
    with codecs.open(filename, encoding=encoding) as fp:
        content = fp.read()

    rows = [line.split(delimiter) for line in content.split("\n")]
    lines_with_len = [(len(row), row) for row in rows]

    return pd.DataFrame(lines_with_len, columns=["len", "data"])

In [3]:
def fill_13_columns(lines_df):
    """Build the listings table from the lines that have exactly 13 fields.

    The 13-field layout carries an extra ``used`` field, which is dropped so
    the result has the same 12 columns as the other ``fill_*_columns``
    helpers.

    The header line of the source file also has 13 fields, so it passes the
    length filter; rows whose fields are just the column names are removed
    so the header is not stored as if it were a listing.

    Parameters
    ----------
    lines_df : pandas.DataFrame
        Frame with a ``len`` column (field count) and a ``data`` column
        (list of field strings), as produced by ``read_lines_with_len``.

    Returns
    -------
    pandas.DataFrame or None
        The 12-column table, or ``None`` when there is no 13-field data row.
    """
    column_names = [
        'surface',
        'rooms',
        'baths',
        'garages',
        'price',
        'location',
        'description',
        'coordinates',
        'used',
        'url',
        'additional_info',
        'nature',
        'pictures'
    ]

    rows = lines_df.loc[lines_df["len"] == 13, "data"].tolist()
    # Skip the header line; strip so a stray "\r" or padding does not hide it.
    rows = [
        row for row in rows
        if [str(field).strip() for field in row] != column_names
    ]
    if not rows:
        return None

    lines_13 = pd.DataFrame(rows)
    lines_13.columns = column_names
    return lines_13.drop(columns=['used'])

def fill_12_columns(lines_df):
    """Build the listings table from the lines that have exactly 12 fields.

    Parameters
    ----------
    lines_df : pandas.DataFrame
        Frame with a ``len`` column (field count) and a ``data`` column
        (list of field strings).

    Returns
    -------
    pandas.DataFrame or None
        One row per 12-field line with the 12 named columns, or ``None``
        when no line has 12 fields.
    """
    has_twelve = lines_df["len"] == 12
    if not has_twelve.any():
        return None

    records = lines_df.loc[has_twelve, "data"].tolist()
    table = pd.DataFrame(records)
    table.columns = (
        "surface rooms baths garages price location "
        "description coordinates url additional_info nature pictures"
    ).split()
    return table

def fill_11_columns(lines_df):
    """Build the listings table from the lines that have exactly 11 fields.

    The 11-field layout has no ``pictures`` field, so that column is added
    and filled with ``None`` to give the same 12 columns as the other
    ``fill_*_columns`` helpers.

    Parameters
    ----------
    lines_df : pandas.DataFrame
        Frame with a ``len`` column (field count) and a ``data`` column
        (list of field strings).

    Returns
    -------
    pandas.DataFrame or None
        One row per 11-field line, or ``None`` when no line has 11 fields.
    """
    has_eleven = lines_df["len"] == 11
    if not has_eleven.any():
        return None

    records = lines_df.loc[has_eleven, "data"].tolist()
    table = pd.DataFrame(records)
    table.columns = (
        "surface rooms baths garages price location "
        "description coordinates url additional_info nature"
    ).split()
    # Pad the missing trailing column so the schema matches the other layouts.
    table.loc[:, "pictures"] = None
    return table


def fill_missing_columns(lines_df, basename):
    """Combine the 11-, 12- and 13-field lines into one table and save it.

    The combined table is written to ``basename + "/posts.parquet"``. Lines
    with any other field count are left out of the saved table but are still
    counted in the returned statistics.

    Parameters
    ----------
    lines_df : pandas.DataFrame
        Frame with a ``len`` column (field count) and a ``data`` column
        (list of field strings).
    basename : str
        Directory in which ``posts.parquet`` is written.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``columns`` (field count) and ``count`` (number of
        lines with that field count), in ``value_counts`` order.
    """
    candidates = (
        fill_11_columns(lines_df),
        fill_12_columns(lines_df),
        fill_13_columns(lines_df),
    )
    # A helper returns None when no line has its field count.
    frames = [frame for frame in candidates if frame is not None]

    combined = pd.concat(frames).reset_index(drop=True)
    combined.to_parquet(basename + "/posts.parquet")

    length_counts = lines_df["len"].value_counts()
    pairs = list(zip(length_counts.index, length_counts.values))
    stats_df = pd.DataFrame(pairs)
    stats_df.columns = ["columns", "count"]
    return stats_df

# Arriendo

## Arriendos manizales

In [4]:
# Load the raw lines of the Manizales apartment-rental listings and preview them.
filename = "../posts/apartamentos/manizales-arriendo/apartments.csv"
manizales_apartments_raw = read_lines_with_len(filename)
manizales_apartments_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,12,"[ 75,00 m², Habitaciones: 2, Baños: 2, Parq..."
2,12,"[ 60,00 m², Habitaciones: 3, Baños: 2, Sin ..."
3,12,"[ 35,00 m², Habitaciones: 1, Baños: 1, Sin ..."
4,12,"[ 50,00 m², Habitaciones: 2, Baños: 1, Parq..."


In [5]:
# Write posts.parquet into the listing folder and show the line count per field count.
manizales_apartments_stats = fill_missing_columns(manizales_apartments_raw, 
                                                  "../posts/apartamentos/manizales-arriendo/")
manizales_apartments_stats.head()

Unnamed: 0,columns,count
0,12,1238
1,11,52
2,13,1
3,1,1


## Arriendos villavicencio

In [6]:
# Load the raw lines of the Villavicencio apartment-rental listings and preview them.
filename = "../posts/apartamentos/villavicencio-arriendo/apartments.csv"
villavicencio_apartments_raw = read_lines_with_len(filename)
villavicencio_apartments_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,12,"[ 50,00 m², Habitaciones: 3, Baños: 2, Sin ..."
2,12,"[ 58,00 m², Habitaciones: 2, Baños: 1, Sin ..."
3,12,"[ 73,00 m², Habitaciones: 3, Baños: 2, Sin ..."
4,12,"[ 54,00 m², Habitaciones: 3, Baños: 1, Parq..."


In [7]:
# Write posts.parquet into the listing folder and show the line count per field count.
villavicencio_apartments_stats = fill_missing_columns(villavicencio_apartments_raw, 
                                                  "../posts/apartamentos/villavicencio-arriendo/")
villavicencio_apartments_stats.head()

Unnamed: 0,columns,count
0,12,187
1,11,6
2,13,1
3,1,1


## Arriendos fusagasuga

In [8]:
# Load the raw lines of the Fusagasuga apartment-rental listings and preview them.
filename = "../posts/apartamentos/fusagasuga-arriendo/apartments.csv"
fusagasuga_apartments_raw = read_lines_with_len(filename)
fusagasuga_apartments_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,12,"[ 78,00 m², Habitaciones: 2, Baños: 2, Sin ..."
2,12,"[ 78,00 m², Habitaciones: 3, Baños: 2, Parq..."
3,12,"[ 60,00 m², Habitaciones: 3, Baños: 2, Parq..."
4,12,"[ 76,00 m², Habitaciones: 3, Baños: 1, Sin ..."


In [9]:
# Write posts.parquet into the listing folder and show the line count per field count.
fusagasuga_apartments_stats = fill_missing_columns(fusagasuga_apartments_raw, 
                                                  "../posts/apartamentos/fusagasuga-arriendo/")
fusagasuga_apartments_stats.head()

Unnamed: 0,columns,count
0,12,14
1,13,1
2,1,1


# Casas

## Arriendos manizales

In [10]:
# Load the raw lines of the Manizales house-rental listings and preview them.
filename = "../posts/casas/manizales-arriendo/casas.csv"
manizales_houses_raw = read_lines_with_len(filename)
manizales_houses_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,12,"[ 330,00 m², Habitaciones: 7, Baños: 4, Sin..."
2,12,"[ 120,00 m², Habitaciones: 4, Baños: 3, Par..."
3,12,"[ 560,00 m², Habitaciones: 5, Baños: 3, Sin..."
4,12,"[ 78,00 m², Habitaciones: 3, Baños: 3, Sin ..."


In [11]:
# Write posts.parquet into the listing folder and show the line count per field count.
manizales_houses_stats = fill_missing_columns(manizales_houses_raw, 
                                                  "../posts/casas/manizales-arriendo/")
manizales_houses_stats.head()

Unnamed: 0,columns,count
0,12,121
1,11,3
2,13,1
3,1,1


## Arriendos villavicencio

In [12]:
# Load the raw lines of the Villavicencio house-rental listings and preview them.
filename = "../posts/casas/villavicencio-arriendo/casas.csv"
villavicencio_houses_raw = read_lines_with_len(filename)
villavicencio_houses_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,12,"[ 72,00 m², Habitaciones: 2, Baños: 1, Sin ..."
2,12,"[ 1.200,00 m², Habitaciones: 4, Baños: 4, S..."
3,12,"[ 60,00 m², Habitaciones: 2, Baños: 1, Sin ..."
4,12,"[ 200,00 m², Habitaciones: 4, Baños: 4, Sin..."


In [13]:
# Write posts.parquet into the listing folder and show the line count per field count.
villavicencio_houses_stats = fill_missing_columns(villavicencio_houses_raw, 
                                                  "../posts/casas/villavicencio-arriendo/")
villavicencio_houses_stats.head()

Unnamed: 0,columns,count
0,12,69
1,11,2
2,13,1
3,1,1


## Arriendos fusagasuga

In [14]:
# Load the raw lines of the Fusagasuga house-rental listings and preview them.
filename = "../posts/casas/fusagasuga-arriendo/casas.csv"
fusagasuga_houses_raw = read_lines_with_len(filename)
fusagasuga_houses_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,12,"[ 280,00 m², Habitaciones: 3, Baños: 4, Par..."
2,12,"[ 81,00 m², Habitaciones: 3, Baños: 3, Parq..."
3,12,"[ 10.000,00 m², Habitaciones: 4, Baños: 4, ..."
4,12,"[ 54,00 m², Habitaciones: 2, Baños: 2, Parq..."


In [15]:
# Write posts.parquet into the listing folder and show the line count per field count.
fusagasuga_houses_stats = fill_missing_columns(fusagasuga_houses_raw, 
                                                  "../posts/casas/fusagasuga-arriendo/")
fusagasuga_houses_stats.head()

Unnamed: 0,columns,count
0,12,10
1,13,1
2,1,1


# Venta

## Venta manizales

In [16]:
# Load the raw lines of the Manizales apartment-sale listings and preview them.
# NOTE: this rebinds manizales_apartments_raw, replacing the rental frame loaded earlier.
filename = "../posts/apartamentos/manizales-venta/apartments.csv"
manizales_apartments_raw = read_lines_with_len(filename)
manizales_apartments_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,13,"[ 54,00 m², Habitaciones: 2, Baños: 2, Parq..."
2,13,"[ 72,00 m², Habitaciones: 3, Baños: 3, Parq..."
3,13,"[ 160,00 m², Habitaciones: 4, Baños: 3, Par..."
4,13,"[ 93,00 m², Habitaciones: 3, Baños: 2, Parq..."


In [17]:
# Write posts.parquet into the listing folder and show the line count per field count.
# NOTE: this rebinds manizales_apartments_stats, replacing the rental stats computed earlier.
manizales_apartments_stats = fill_missing_columns(manizales_apartments_raw, 
                                                  "../posts/apartamentos/manizales-venta/")
manizales_apartments_stats.head()

Unnamed: 0,columns,count
0,13,3604
1,12,60
2,1,1


## Venta villavicencio

In [18]:
# Load the raw lines of the Villavicencio apartment-sale listings and preview them.
# NOTE: this rebinds villavicencio_apartments_raw, replacing the rental frame loaded earlier.
filename = "../posts/apartamentos/villavicencio-venta/apartments.csv"
villavicencio_apartments_raw = read_lines_with_len(filename)
villavicencio_apartments_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,13,"[ 89,80 m², Habitaciones: 3, Baños: 2, Parq..."
2,13,"[ 86,00 m², Habitaciones: 4, Baños: 3, Parq..."
3,13,"[ 92,00 m², Habitaciones: 3, Baños: 3, Parq..."
4,13,"[ 54,00 m², Habitaciones: 3, Baños: 1, Parq..."


In [19]:
# Write posts.parquet into the listing folder and show the line count per field count.
# NOTE: this rebinds villavicencio_apartments_stats, replacing the rental stats computed earlier.
villavicencio_apartments_stats = fill_missing_columns(villavicencio_apartments_raw, 
                                                  "../posts/apartamentos/villavicencio-venta/")
villavicencio_apartments_stats.head()

Unnamed: 0,columns,count
0,13,429
1,12,6
2,1,1


## Venta fusagasuga

In [20]:
# Load the raw lines of the Fusagasuga apartment-sale listings and preview them.
# NOTE: this rebinds fusagasuga_apartments_raw, replacing the rental frame loaded earlier.
filename = "../posts/apartamentos/fusagasuga-venta/apartments.csv"
fusagasuga_apartments_raw = read_lines_with_len(filename)
fusagasuga_apartments_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,13,"[ 85,00 m², Habitaciones: 3, Baños: 2, Parq..."
2,13,"[ 109,00 m², Habitaciones: 4, Baños: 2, Par..."
3,13,"[ 110,00 m², Habitaciones: 3, Baños: 2, Par..."
4,13,"[ 57,00 m², Habitaciones: 2, Baños: 1, Sin ..."


In [21]:
# Write posts.parquet into the listing folder and show the line count per field count.
# NOTE: this rebinds fusagasuga_apartments_stats, replacing the rental stats computed earlier.
fusagasuga_apartments_stats = fill_missing_columns(fusagasuga_apartments_raw, 
                                                  "../posts/apartamentos/fusagasuga-venta/")
fusagasuga_apartments_stats.head()

Unnamed: 0,columns,count
0,13,234
1,12,9
2,1,1


# Casas

## Venta manizales

In [22]:
# Load the raw lines of the Manizales house-sale listings and preview them.
# NOTE: this rebinds manizales_houses_raw, replacing the rental frame loaded earlier.
filename = "../posts/casas/manizales-venta/casas.csv"
manizales_houses_raw = read_lines_with_len(filename)
manizales_houses_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,13,"[ 180,00 m², Habitaciones: 4, Baños: 1, Par..."
2,13,"[ 130,00 m², Habitaciones: 4, Baños: 2, Sin..."
3,13,"[ 407,00 m², Habitaciones: 9, Baños: 6, Par..."
4,12,"[ 90,00 m², Habitaciones: 3, Baños: 2, Parq..."


In [23]:
# Write posts.parquet into the listing folder and show the line count per field count.
# NOTE: this rebinds manizales_houses_stats, replacing the rental stats computed earlier.
manizales_houses_stats = fill_missing_columns(manizales_houses_raw, 
                                                  "../posts/casas/manizales-venta/")
manizales_houses_stats.head()

Unnamed: 0,columns,count
0,13,2521
1,12,48
2,1,1


## Venta villavicencio

In [24]:
# Load the raw lines of the Villavicencio house-sale listings and preview them.
# NOTE: this rebinds villavicencio_houses_raw, replacing the rental frame loaded earlier.
filename = "../posts/casas/villavicencio-venta/casas.csv"
villavicencio_houses_raw = read_lines_with_len(filename)
villavicencio_houses_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,13,"[ 180,00 m², Habitaciones: 7, Baños: 4, Par..."
2,13,"[ 250,00 m², Habitaciones: 5, Baños: 5, Par..."
3,13,"[ 130,00 m², Habitaciones: 3, Baños: 3, Par..."
4,13,"[ 72,00 m², Habitaciones: 3, Baños: 1, Sin ..."


In [25]:
# Write posts.parquet into the listing folder and show the line count per field count.
# NOTE: this rebinds villavicencio_houses_stats, replacing the rental stats computed earlier.
villavicencio_houses_stats = fill_missing_columns(villavicencio_houses_raw, 
                                                  "../posts/casas/villavicencio-venta/")
villavicencio_houses_stats.head()

Unnamed: 0,columns,count
0,13,1048
1,12,19
2,1,1


## Venta fusagasuga

In [26]:
# Load the raw lines of the Fusagasuga house-sale listings and preview them.
# NOTE: this rebinds fusagasuga_houses_raw, replacing the rental frame loaded earlier.
filename = "../posts/casas/fusagasuga-venta/casas.csv"
fusagasuga_houses_raw = read_lines_with_len(filename)
fusagasuga_houses_raw.head()

Unnamed: 0,len,data
0,13,"[surface, rooms, baths, garages, price, locati..."
1,13,"[ 129,00 m², Habitaciones: 4, Baños: 4, Par..."
2,13,"[ 210,00 m², Habitaciones: 5, Baños: 4, Par..."
3,13,"[ 140,00 m², Habitaciones: 3, Baños: 4, Par..."
4,13,"[ 75,00 m², Habitaciones: 3, Baños: 2, Sin ..."


In [27]:
# Write posts.parquet into the listing folder and show the line count per field count.
# NOTE: this rebinds fusagasuga_houses_stats, replacing the rental stats computed earlier.
fusagasuga_houses_stats = fill_missing_columns(fusagasuga_houses_raw, 
                                                  "../posts/casas/fusagasuga-venta/")
fusagasuga_houses_stats.head()

Unnamed: 0,columns,count
0,13,582
1,12,12
2,1,1
