In [None]:
import os
import requests
import json
import re
import time
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs

# Directory that holds every file this notebook reads or writes.
base_dir = "vuelax"
# exist_ok=True makes the cell idempotent and avoids the gap between
# "does it exist?" and "create it" that a separate existence check leaves open.
os.makedirs(base_dir, exist_ok=True)

In [None]:
# Spanish month names (lower-case) mapped to their month number.
months = {'enero':1, 'febrero':2, 'marzo':3,
          'abril':4, 'mayo':5, 'junio':6,
          'julio':7,'agosto':8, 'septiembre':9,
          'octubre':10, 'noviembre':11, 'diciembre':12}

# "<word> <day>, <four-digit year>", e.g. "enero 5, 2018".
# Raw string so that \w reaches the regex engine without an invalid-escape warning.
date_regex = re.compile(r'(\w+) ([0-9]+), ([0-9]{4})')

def date_converter(date):
    """Parse a Spanish "<month> <day>, <year>" date found anywhere in ``date``.

    Parameters
    ----------
    date : str
        Text that may contain a date such as "enero 5, 2018".

    Returns
    -------
    datetime.datetime or float
        The parsed date, or ``np.nan`` when no date is found, the word before
        the day is not a known month name, or the day/year do not form a valid
        calendar date.
    """
    found = date_regex.search(date)
    if not found:
        return np.nan
    # Month names are matched case-insensitively ("Enero" == "enero").
    month = months.get(found.group(1).lower())
    if month is None:
        return np.nan
    try:
        return datetime.datetime(year=int(found.group(3)), month=month, day=int(found.group(2)))
    except ValueError:
        # e.g. "febrero 31, 2018" or year 0000
        return np.nan

In [None]:
# Listing-page URL template: %s receives the category slug, %d the page number.
page_url = "http://www.vuelax.com/category/%s/page/%d/"

# Category slugs that are scraped, in this order.
categories = [
    'oportunidades',
    'uncategorized',
]

In [None]:
# State recovered from a previous run, keyed by category slug:
#   category_dates           -> newest post date already stored on disk
#   category_previous_frames -> the stored DataFrame itself
category_dates = { }
category_previous_frames = {}
data = None
last_date = None
for category in categories:
    original_file = join(base_dir, "%s.csv" % category)
    if not os.path.exists(original_file):
        continue
    data = pd.read_csv(original_file, index_col=0, parse_dates=['date'], encoding='utf-8')
    category_previous_frames[category] = data
    if data.empty:
        # A header-only file carries no cut-off date; indexing row 0 would fail.
        continue
    # max() gives the newest date without relying on the row order of the file.
    last_date = data['date'].max()
    category_dates[category] = last_date
    print("Last date for \"%s\"" % category, last_date)

In [None]:
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30

# Walk the listing pages of every category, newest first, collecting
# [label, url, date] rows until an already-stored date is reached, then merge
# the new rows with the rows of the previous run and rewrite the category CSV.
for category in categories:
    content = []
    # Newest post date stored by a previous run. None when the category has no
    # stored date yet, in which case every page is collected.
    known_date = category_dates.get(category)
    break_category = False
    for page in range(1, 1000000):
        url = page_url % (category, page)
        if page % 10 == 0:
            print("Requesting", url)
        op_page = requests.get(url, timeout=REQUEST_TIMEOUT)
        if op_page.status_code != 200:
            # Any non-200 answer ends the category.
            break
        op_soup = BeautifulSoup(op_page.text, "html.parser")
        main_ul = op_soup.find("ul", {"class":"penci-grid"})
        if main_ul is None:
            # No post grid on this page: nothing left to collect.
            break
        articles = main_ul.find_all("article", {"class":"item"})
        for article in articles:
            grid_title = article.find("h2", {"class":"grid-title"})
            a = grid_title.find("a") if grid_title is not None else None
            grid_post_box_meta = article.find("div", {"class":"grid-post-box-meta"})
            if a is None or grid_post_box_meta is None:
                # Entry without a title link or a meta box: skip it.
                continue
            date = date_converter(grid_post_box_meta.text.strip())
            # date_converter yields NaN for text it cannot parse; NaN cannot be
            # ordered against a timestamp, so such rows never trigger the cut-off.
            already_stored = (known_date is not None
                              and not pd.isna(date)
                              and date <= known_date)
            if already_stored:
                print("I already have the date", date, "for the category \"%s\""%category)
                break_category = True
                break
            content.append([a.text, a.get('href'), date])
        if break_category:
            break

    data = pd.DataFrame(content, columns= ["label", "url", "date"])

    if category in category_previous_frames:
        data = pd.concat([category_previous_frames[category], data])
    # Newest first; rebinding instead of inplace keeps the step re-runnable.
    data = data.sort_values(by=['date', 'label'], ascending= False)
    data.to_csv(join(base_dir, "%s.csv" % category), encoding='utf-8')

print("Done")

In [None]:
# Read every per-category CSV back from disk and stack them into one frame
# with a fresh 0..n-1 index.
frames = []
for category in categories:
    csv_path = join(base_dir, "%s.csv" % category)
    frame = pd.read_csv(csv_path, index_col=0, encoding='utf-8')
    frames.append(frame)
data = pd.concat(frames, ignore_index=True)

print(len(data))
# Keep only the first row seen for each URL.
not_duplicated = data.drop_duplicates(subset='url', keep='first')
print(len(not_duplicated))
not_duplicated.head()

## Simple text processing
(lots of regex)

In [None]:
# Three title layouts are recognised. In all of them the route is written
# "<origin> a <destination>" and the price "$<amount>"; a dash or the word
# "desde" may sit between the route and the price.
# Patterns are raw strings so that \w, \s and \$ reach the regex engine as
# written, and the separator is a real alternation rather than a character
# class (inside [...] the '|' and the quoted words are just single characters).

# "[¡]Origin a Destination[!] [- | – | desde] $price"
location_regex = re.compile(r'¡?([\w0-9,\s\.]+)' + # Origin, group 1
                            r'\s+[aA]\s+' +
                            r'([\w0-9,\s\.]+)!?' + # Destination, group 2
                            r'\s*(?:[-–]|desde|DESDE)?\s*' +
                            r'\$([0-9\.,]+)') # Price, group 3

# "¡Origin a Destination [- | – | desde] $price! note"
compact_location = re.compile(r'¡([\w0-9,\s\.]+)' + # Origin, group 1
                              r'\s+[aA]\s' +
                              r'([\w0-9,\s\.]+)' + # Destination, group 2
                              r'\s+(?:[-–]|desde|DESDE)?\s+' +
                              r'\$([0-9\.,]+)!' + # Price, group 3
                              r'\s*\(?([\w\s\+]+)\)?\s*') # Note, group 4

# "[¡]Origin a Destination[!] note [- | – | desde] $price"
location_regex_note = re.compile(r'¡?([\w0-9,\s\.]+)' + # Origin, group 1
                                 r'\s+[aA]\s' +
                                 r'([\w0-9,\s\.]+)!?' + # Destination, group 2
                                 r'\s*\(?([\w\s\+]+)\)?\s*' +  # Note, group 3
                                 r'(?:[-–]|desde|DESDE)?\s*' +
                                 r'\$([0-9\.,]+)') # Price, group 4

# The destination group is greedy and accepts letters and spaces, so a
# "desde" placed right before the price ends up inside it; this removes it.
_trailing_desde = re.compile(r'\s*\bdesde\s*$', re.IGNORECASE)


def find_info(label):
    """Extract the route, price and note from a deal title.

    The patterns are tried in this order: ``compact_location``,
    ``location_regex``, ``location_regex_note``; the first one that matches
    anywhere in ``label`` wins.

    Parameters
    ----------
    label : str
        Title of a deal, e.g. "¡CDMX a Madrid! – $11,866.".

    Returns
    -------
    tuple
        ``(origin, destination, price, note)``. ``price`` is the matched
        amount as text (thousands separators kept, surrounding dots removed).
        ``note`` is ``None`` when the title carries no note. All four values
        are ``None`` when no pattern matches.
    """
    from_, to, price, note = None, None, None, None
    find = compact_location.search(label)
    if find: # compact
        from_, to, price, note = find.groups()
    else:
        find = location_regex.search(label)
        if find: # without note
            from_, to, price = find.groups()
        else:
            find = location_regex_note.search(label)
            if find:
                from_, to, note, price = find.groups()
    if not find:
        return from_, to, price, note

    from_ = from_.strip()
    to = _trailing_desde.sub('', to).strip()
    price = price.strip('.')
    # A whitespace-only note carries no information.
    note = (note.strip() or None) if note else None
    return from_, to, price, note

# Titles that are handy for checking the patterns by hand:
# print(find_info("¡CDMX a Madrid! – $11,866. ¡Opciones disponibles en verano!"))
# print(find_info("¡CDMX a Cancún! Vuelos + Hotel Todo Incluido – $5,258"))
# print(find_info("CDMX y GDL a Busan, Corea. ¡Opciones de hospedaje desde 189 MXN la noche!"))
# print(find_info("CUN a Bélgica – $7,807"))
# print(find_info("¡CDMX, MTY, GDL, TIJ, Silao, CUL y más a Lima + Cusco – $7,125! Opción de hotel, 8 días y 7 noches por $2,685 por persona (hab doble)"))
# print(find_info("¡CDMX a Bogotá + Cartagena $5,200! (Y por sólo $3,213 adicionales agrega 10 noches de hospedaje)"))
# print(find_info("CDMX y muchas ciudades a Panamá – $4,566. ¡Opción con hospedaje y desayuno desde 330 MXN la noche!"))

In [None]:
# Split the de-duplicated titles into rows whose route and price could be
# parsed ("clean") and rows that still need work ("dirty").
clean_values = []
non_clean_values = []

for index, row in not_duplicated.iterrows():
    label = row['label']
    # An empty label comes back from the CSV as NaN (a float), which the
    # regexes cannot search; treat it as unparsed.
    if isinstance(label, str):
        from_, to, price, note = find_info(label)
    else:
        from_, to, price, note = None, None, None, None

    amount = None
    if from_:
        try:
            amount = float(price.replace(",",""))
        except ValueError:
            # The price pattern also accepts text such as "1.234.567" or ",",
            # which is not a number; send the row to the dirty set.
            amount = None

    if amount is not None:
        clean_values.append([from_, to, amount, note, row["url"], row["date"], label])
    else:
        non_clean_values.append(row.values)


clean = pd.DataFrame(clean_values, columns= ["origin", "destination", "price", "note", "url", "date", "label"])
still_dirty_df = pd.DataFrame(non_clean_values, columns= ["label", "url", "date"])


print("== Clean ==")
print(clean[pd.notna(clean.note)].iloc[:3][["note", "label"]].values)
print()
print("== Dirty ==")
print(len(still_dirty_df.label.values))

In [None]:
import nltk
from nltk.tokenize.regexp import regexp_tokenize
# Separators between the places of a multi-city origin: the standalone word
# "y", or one of , . ? ! " -- each followed by whitespace. The lookbehind
# keeps a "y" that ends a word ("Monterrey") from being taken as a separator.
_origin_separator = re.compile(r'(?<!\w)y\s+|[,\.\?!"]\s+')

def tolist(origin):
    """Split a multi-city origin such as "CDMX, MTY y GDL" into its places.

    Parameters
    ----------
    origin : str
        Origin text as captured from a deal title.

    Returns
    -------
    list of str
        The pieces between separators, each stripped of surrounding
        whitespace; empty pieces are dropped.
    """
    return [t.strip() for t in _origin_separator.split(origin) if t]

# Airport table read from <base_dir>/airports.csv, and a lookup from its
# "IATA" column to its "Location served" column.
airports_df = pd.read_csv(join(base_dir, "airports.csv"), index_col=0)
airports = {
    iata: served
    for iata, served in zip(airports_df["IATA"], airports_df["Location served"])
}

# Abbreviations used in deal titles mapped to the full place name.
real_locations = {
    'CDMX': 'Ciudad de México',
    'CUN': 'Cancún',
    'GDL': 'Guadalajara',
    'L.A.': 'Los Angeles',
    'LA': 'Los Angeles',
    'NYC': 'New York City',
    'MTY': 'Monterrey',
    'PUE': 'Puebla',
    'QRO': 'Querétaro',
    'SLP': 'San Luis Potosí',
    'TIJ': 'Tijuana',
    'VER': 'Veracruz',
}


In [None]:
# Rows of [origin, destination, date, price, note, url], one per single origin.
separate_origins = []

# One or more digits, optionally wrapped in dots (raw string: no invalid escapes).
year = re.compile(r"\.*[0-9]+\.*")

def has_numbers(f):
    """Return True when ``f`` contains at least one digit 0-9, else False."""
    return year.search(f) is not None

# Turn every clean row into one row per individual origin, expanding known
# abbreviations on both ends of the route.
for index, row in clean.iterrows():
    origins = tolist(row['origin'])
    # The destination does not depend on the origin, so resolve it once per row.
    destination = row['destination'].strip()
    full_destination = real_locations.get(destination, destination)
    for origin in origins:
        if has_numbers(origin): # filter out stuff like 23 cities more...
            continue
        origin_name = origin.strip()
        separate_origins.append([
            real_locations.get(origin_name, origin_name),
            full_destination,
            row['date'], row['price'],
            row['note'], row['url']])

separa_origin_df = pd.DataFrame(separate_origins, columns=['origin', 'destination',
                                                           'date', 'price',
                                                           'note', 'url'])



separa_origin_df.to_csv(join(base_dir, "separate_origins.csv"), encoding='utf-8')
# sample() raises when asked for more rows than the frame has.
separa_origin_df.sample(min(10, len(separa_origin_df)))

In [None]:
# Every distinct place name, whether it occurs as an origin or as a destination.
_origin_names = set(separa_origin_df.origin.unique())
_all_names = _origin_names.union(separa_origin_df.destination.unique())
unique_locations = sorted(_all_names)
print(len(unique_locations))

In [None]:
# Geocoding answers saved by a previous run, keyed by place name; the
# dictionary starts empty when the cache file does not exist.
json_locations = join(base_dir, "location_data.json")
location_dic = {}
cache_found = os.path.exists(json_locations)
if cache_found:
    with open(json_locations, "r") as s:
        location_dic = json.loads(s.read())

In [None]:
import time
import requests

# Query parameters sent with every geocoding request; "address" is filled in
# per place inside the loop.
parameters = {
    'bounds':'',
    'components':'',
    'region': '',
    'language':'en',
    # The API key is read from the environment so it never lands in the notebook.
    'key': os.environ.get('GOOGLE_MAPS_API_KEY', '')
}

# Payload statuses that are final answers and therefore safe to cache. Any
# other status (quota exceeded, request denied, ...) arrives with HTTP 200 too
# and must not be stored, or the place would never be asked for again.
CACHEABLE_STATUSES = ("OK", "ZERO_RESULTS")

for place in unique_locations:
    place = real_locations.get(place, place)
    if place in location_dic:
        continue
    print("Reading %s" % place)
    parameters["address"] = place
    mapinfo = requests.get("https://maps.googleapis.com/maps/api/geocode/json",
                           params=parameters, timeout=30)
    if mapinfo.status_code == 200:
        payload = json.loads(mapinfo.text)
        if payload.get("status") in CACHEABLE_STATUSES:
            location_dic[place] = payload
        else:
            print("Not caching %s: status %s" % (place, payload.get("status")))
    # Pause between requests.
    time.sleep(2)
    

In [None]:
# Write the geocoding cache back to disk as indented JSON.
cache_path = join(base_dir, "location_data.json")
with open(cache_path, "w") as s:
    s.write(json.dumps(location_dic, indent=2))

In [None]:
def _first_geocode_result(location):
    """Return the first cached geocoding result for ``location``, or None.

    None is returned when ``location`` is not in ``location_dic`` or when its
    cached payload has no results.
    """
    if location not in location_dic:
        return None
    results = location_dic[location].get("results") or []
    return results[0] if results else None


def _address_component(location, component_type):
    """Return "<long_name> (<short_name>)" of the first address component of
    ``location`` whose types include ``component_type``, or None."""
    result = _first_geocode_result(location)
    if result is None:
        return None
    for component in result["address_components"]:
        if component_type in component["types"]:
            return component["long_name"] + " (" + component["short_name"] + ")"
    return None


def get_original_location(location):
    """Return the formatted address cached for ``location``.

    Falls back to ``location`` itself when nothing is cached for it.
    """
    result = _first_geocode_result(location)
    if result is None:
        return location
    return result["formatted_address"]


def get_original_country(location):
    """Return the country of ``location`` as "<long name> (<short name>)", or None."""
    return _address_component(location, "country")


def get_original_administrative_area_level_1(location):
    """Return the level-1 administrative area of ``location`` as
    "<long name> (<short name>)", or None."""
    return _address_component(location, "administrative_area_level_1")


def get_lat(location):
    """Return the cached latitude of ``location``, or NaN when unknown."""
    result = _first_geocode_result(location)
    if result is None:
        return np.nan
    return result["geometry"]["location"]["lat"]


def get_long(location):
    """Return the cached longitude of ``location``, or NaN when unknown."""
    result = _first_geocode_result(location)
    if result is None:
        return np.nan
    # The payload stores the longitude under "lng"; reading "lat" here would
    # duplicate the latitude.
    return result["geometry"]["location"]["lng"]

# (new column, source column, lookup), listed in the order the columns are added.
derived_columns = [
    ("clean_origin", "origin", get_original_location),
    ("clean_destination", "destination", get_original_location),
    ("country_origin", "origin", get_original_country),
    ("country_destination", "destination", get_original_country),
    ("area_level_1_origin", "origin", get_original_administrative_area_level_1),
    ("area_level_1_destination", "destination", get_original_administrative_area_level_1),
    ("origin_lat", "origin", get_lat),
    ("origin_long", "origin", get_long),
    ("destination_lat", "destination", get_lat),
    ("destination_long", "destination", get_long),
]

for new_column, source_column, lookup in derived_columns:
    separa_origin_df[new_column] = separa_origin_df[source_column].apply(lookup)

# sample() raises when asked for more rows than the frame has.
separa_origin_df.sample(min(10, len(separa_origin_df)))

In [None]:
# Final column layout: origin fields, destination fields, then the deal itself.
origin_columns = ['origin', 'country_origin', 'area_level_1_origin',
                  'clean_origin', 'origin_lat', 'origin_long']
destination_columns = ['destination', 'country_destination', 'area_level_1_destination',
                       'clean_destination', 'destination_lat', 'destination_long']
deal_columns = ['date', 'price', 'url', 'note']

separa_origin_df = separa_origin_df[origin_columns + destination_columns + deal_columns]
separa_origin_df.to_csv(join(base_dir, "separate_origins.csv"), encoding='utf-8')
print(len(separa_origin_df))
separa_origin_df.head()

In [None]:
# Reload the saved table from disk and list the distinct origins it contains.
separate_origins_path = join(base_dir, "separate_origins.csv")
separa_origin_df = pd.read_csv(separate_origins_path, index_col=0, encoding='utf-8')
separa_origin_df["origin"].unique()

In [None]:
# Show the rows whose origin is exactly 'CHI'.
separa_origin_df.loc[separa_origin_df["origin"] == 'CHI']