In [1]:
import os
import requests
import json
import re
import time
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs

# Directory (relative to the working directory) that receives every CSV
# written by this notebook.
base_dir = "vuelax"
# exist_ok=True makes the cell idempotent and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(base_dir, exist_ok=True)

In [2]:
# URL template for the paginated "oportunidades" category listing; the %d
# placeholder is filled with the page number when each page is requested.
oportunidades_url = "http://www.vuelax.com/category/oportunidades/page/%d/"

In [37]:
# Walk the paginated listing and collect, for every article found, its title
# text, its link and the stripped text of its post-meta box. The rows are then
# stored in the `data` DataFrame and written to original.csv.
MAX_PAGES = 10000      # exclusive upper bound on the page number; the loop normally stops earlier
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests.get() may block indefinitely

content = []

for page in range(1, MAX_PAGES):
    url = oportunidades_url % page
    # Progress message every tenth page, emitted before the request is sent.
    if page % 10 == 0:
        print("Requesting", url)
    op_page = requests.get(url, timeout=REQUEST_TIMEOUT)
    if op_page.status_code != 200:
        # Any non-200 answer is treated as the end of the listing.
        break
    op_soup = BeautifulSoup(op_page.text, "lxml")
    main_ul = op_soup.find("ul", {"class": "penci-grid"})
    if main_ul is None:
        # A page without the article grid cannot be parsed; stop here instead
        # of raising AttributeError and discarding the rows gathered so far.
        print("No article grid found, stopping at", url)
        break
    for article in main_ul.find_all("article", {"class": "item"}):
        grid_title = article.find("h2", {"class": "grid-title"})
        link = grid_title.find("a") if grid_title is not None else None
        grid_post_box_meta = article.find("div", {"class": "grid-post-box-meta"})
        if link is None or grid_post_box_meta is None:
            # Skip articles that lack the title link or the meta box.
            continue
        content.append([link.text, link.get('href'), grid_post_box_meta.text.strip()])

data = pd.DataFrame(content, columns=["label", "url", "date"])
print(data.head())
print(data.info())

data.to_csv(join(base_dir, "original.csv"))

Requesting http://www.vuelax.com/category/oportunidades/page/10/
Requesting http://www.vuelax.com/category/oportunidades/page/20/
Requesting http://www.vuelax.com/category/oportunidades/page/30/
Requesting http://www.vuelax.com/category/oportunidades/page/40/
Requesting http://www.vuelax.com/category/oportunidades/page/50/
Requesting http://www.vuelax.com/category/oportunidades/page/60/
Requesting http://www.vuelax.com/category/oportunidades/page/70/
Requesting http://www.vuelax.com/category/oportunidades/page/80/
Requesting http://www.vuelax.com/category/oportunidades/page/90/
Requesting http://www.vuelax.com/category/oportunidades/page/100/
Requesting http://www.vuelax.com/category/oportunidades/page/110/
Requesting http://www.vuelax.com/category/oportunidades/page/120/
Requesting http://www.vuelax.com/category/oportunidades/page/130/
Requesting http://www.vuelax.com/category/oportunidades/page/140/
Requesting http://www.vuelax.com/category/oportunidades/page/150/
Requesting http://w

In [42]:
# Pattern for labels of the form "<origin> a <destination> – $<price>".
#  * Raw string: \w, \s and \$ are regex escapes, not string escapes, and a
#    non-raw literal triggers invalid-escape warnings on current Python.
#  * [aA] instead of [a|A]: inside a character class "|" is a literal
#    character, so the old class also accepted "|" as the separator.
#  * \w already covers digits and "." needs no escaping inside a class.
location_regex = re.compile(r'([\w,\s.]+) [aA] ([\w,\s.]+)\s–\s\$([0-9.,]+)')


clean_values = []       # rows whose label matched: origin, destination, price, url, date
non_clean_values = []   # rows whose label did not match, kept unchanged

for _index, row in data.iterrows():
    match = location_regex.search(row['label'])
    if match:
        origin, destination, price = match.groups()
        clean_values.append([origin, destination, price, row["url"], row["date"]])
    else:
        non_clean_values.append(row.values)

content = pd.DataFrame(clean_values, columns=["origin", "destination", "price", "url", "date"])
content_not_clean = pd.DataFrame(non_clean_values, columns=["label", "url", "date"])

# Show a sample of each frame and persist both next to the original dump.
print("== Clean ==")
print(content.head())
print(content.info())
content.to_csv(join(base_dir, "clean.csv"))
print()
print("== Dirty ==")
print(content_not_clean.head())
print(content_not_clean.info())
content_not_clean.to_csv(join(base_dir, "still_dirty.csv"))

== Clean ==
     origin destination   price  \
0      CDMX       Tokyo  10,972   
1      CDMX        Lima   5,059   
2       CUN     Bélgica   9,731   
3    Canadá    Islandia   4,425   
4  Islandia  Inglaterra   1,156   

                                                 url            date  
0  http://www.vuelax.com/2018/01/14/cdmx-a-tokyo-...  enero 14, 2018  
1  http://www.vuelax.com/2018/01/13/cdmx-a-lima-5...  enero 13, 2018  
2  http://www.vuelax.com/2018/01/13/cun-a-belgica...  enero 13, 2018  
3  http://www.vuelax.com/2018/01/12/canada-a-isla...  enero 12, 2018  
4  http://www.vuelax.com/2018/01/12/islandia-a-in...  enero 12, 2018  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1242 entries, 0 to 1241
Data columns (total 5 columns):
origin         1242 non-null object
destination    1242 non-null object
price          1242 non-null object
url            1242 non-null object
date           1242 non-null object
dtypes: object(5)
memory usage: 48.6+ KB
None

== Dirty ==
      