In [None]:
import os
import requests
import json
import re
import time
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs

# Directory where every intermediate CSV of this notebook is written.
base_dir = "vuelax"
# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that another process could race with.
os.makedirs(base_dir, exist_ok=True)

# Path of the CSV holding the original data set inside base_dir.
original_file = join(base_dir, "original.csv")

In [None]:
# Spanish month names (lower-case) mapped to their month number.
months = {'enero':1, 'febrero':2, 'marzo':3,
          'abril':4, 'mayo':5, 'junio':6,
          'julio':7,'agosto':8, 'septiembre':9,
          'octubre':10, 'noviembre':11, 'diciembre':12}

# Matches "<word> <day>, <4-digit year>", e.g. "enero 5, 2017".
# Raw string so that \w reaches the regex engine instead of being treated
# as an (invalid) string escape.
date_regex = re.compile(r'(\w+) ([0-9]+), ([0-9]{4})')

def date_converter(date):
    """Convert a Spanish textual date into a ``datetime.datetime``.

    Parameters
    ----------
    date : str
        Text containing a date such as ``"enero 5, 2017"``. The month name
        is matched case-insensitively.

    Returns
    -------
    datetime.datetime or float
        The parsed date, or ``np.nan`` when the input is not a string, no
        date pattern is found, the month word is unknown, or the day does
        not exist in that month.
    """
    if not isinstance(date, str):
        return np.nan
    found = date_regex.search(date)
    if not found:
        return np.nan
    # Lower-case the captured word so "Enero" and "ENERO" resolve too.
    month = months.get(found.group(1).lower())
    if month is None:
        return np.nan
    try:
        return datetime.datetime(year=int(found.group(3)), month=month, day=int(found.group(2)))
    except ValueError:
        # Impossible calendar date (e.g. day 0 or February 31).
        return np.nan

In [None]:
# Load the previously saved data set, if there is one, and remember the date
# found in its first row.
data = None
last_date = None
if os.path.exists(original_file):
    data = pd.read_csv(original_file, index_col=0, parse_dates=['date'], encoding='utf-8')
    # An existing but empty CSV has no first row; leave last_date as None
    # instead of raising IndexError.
    # NOTE(review): taking row 0 presumably relies on the file being sorted
    # newest-first — confirm against the cell that writes it.
    if not data.empty:
        last_date = data.iloc[0]['date']
    print("Last date", last_date)
    print(data.head())

In [None]:
# Both listings use the same paginated layout: the category slug first,
# then the page number.
_category_page_url = "http://www.vuelax.com/category/%s/page/%d/"
oportunidades_url = _category_page_url
uncategorized_url = _category_page_url

# Category slugs that get substituted into the URL templates above.
categories = ["oportunidades", "uncategorized"]

In [None]:
# Crawl every category page by page, collect (label, url, date) for each
# article and store one CSV per category inside base_dir.
for category in categories:
    content = []
    for page in range(1, 1000000):
        url = uncategorized_url % (category, page)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        op_page = requests.get(url, timeout=30)
        if page % 10 == 0:
            print("Requesting", url)
        # Any non-200 answer is taken as "no more pages" for this category.
        if op_page.status_code != 200:
            break
        op_soup = BeautifulSoup(op_page.text, "html.parser")
        main_ul = op_soup.find("ul", {"class":"penci-grid"})
        if main_ul is None:
            # The page came back without the article grid: nothing left to read.
            break
        for article in main_ul.find_all("article", {"class":"item"}):
            grid_title = article.find("h2", {"class":"grid-title"})
            a = grid_title.find("a") if grid_title is not None else None
            if a is None:
                # Without a title link there is neither a label nor a URL to keep.
                continue
            grid_post_box_meta = article.find("div", {"class":"grid-post-box-meta"})
            if grid_post_box_meta is None:
                date = np.nan
            else:
                date = date_converter(grid_post_box_meta.text.strip())
            content.append([a.text, a.get('href'), date])

    data = pd.DataFrame(content, columns= ["label", "url", "date"])
    # Newest first; ties on the date are broken by label.
    data = data.sort_values(by=['date', 'label'], ascending= False)
    data.to_csv(join(base_dir, "%s.csv" % category), encoding='utf-8')
    # info() prints by itself and returns None, so it is not wrapped in print().
    data.info()

print("Done")

In [None]:
# Read back the CSV written for each category and stack them into a single
# frame with a fresh 0..n-1 index.
frames = [
    pd.read_csv(join(base_dir, "%s.csv" % category), index_col=0, encoding='utf-8')
    for category in categories
]
data = pd.concat(frames).reset_index(drop=True)
data.head()

In [None]:
# "<origin> a <destination> <separator> $<price>"
# The separator is a hyphen, an en dash or the word "desde". It is written as
# an alternation group: inside square brackets it would match one single
# character out of - | – " d e s D E S and swallow part of the destination.
# Likewise [aA] replaces [a|A], which also accepted a literal "|".
# Raw strings keep \w, \s, \. and \$ from being read as string escapes.
location_regex = re.compile(r'([\w0-9,\s\.]+) [aA] ([\w0-9,\s\.]+)\s*(?:-|–|desde|Desde|DESDE)\s*\$([0-9\.,]+)')
# Same layout with a parenthesised note between the destination and the separator.
location_regex_note = re.compile(r'([\w0-9,\s\.]+) [aA] ([\w0-9,\s\.]+)\s*\(([\w\s]+)\)\s*(?:-|–|desde|Desde|DESDE)\s*\$([0-9\.,]+)')


def _parse_label(label):
    """Split a post label into ``(origin, destination, price, note)``.

    The plain pattern is tried first and the pattern with a note second.
    ``note`` is ``None`` when the plain pattern matched. Returns ``None``
    when the label is not a string or matches neither pattern.
    """
    if not isinstance(label, str):
        return None
    found = location_regex.search(label)
    if found:
        origin, destination, price = found.groups()
        return origin, destination, price, None
    found = location_regex_note.search(label)
    if found:
        origin, destination, note, price = found.groups()
        return origin, destination, price, note
    return None


clean_values = []
non_clean_values = []

for index, row in data.iterrows():
    parsed = _parse_label(row['label'])
    if parsed is None:
        # Keep the untouched row so it can be inspected later.
        non_clean_values.append(row.values)
        continue
    de, a, por, note = parsed
    clean_values.append([de, a, por, note, row["url"], row["date"]])

clean = pd.DataFrame(clean_values, columns= ["origin", "destination", "price", "note", "url", "date"])
still_dirty_df = pd.DataFrame(non_clean_values, columns= ["label", "url", "date"])

print("== Clean ==")
print(clean.head())
# info() prints by itself and returns None, so it is not wrapped in print().
clean.info()
clean.to_csv(join(base_dir, "clean.csv"), encoding='utf-8')
print()
print("== Dirty ==")
print(still_dirty_df.head())
still_dirty_df.info()
still_dirty_df.to_csv(join(base_dir, "still_dirty.csv"), encoding='utf-8')

In [None]:
def strip_blanks(x):
    """Return ``x`` without leading or trailing whitespace."""
    return x.strip()


def strip_dot(x):
    """Return ``x`` without leading or trailing '.' characters."""
    return x.strip('.')


# Reload the parsed rows, trim the text columns and write the file back.
clean = pd.read_csv(join(base_dir, "clean.csv"), index_col = 0)

preview_columns = ['origin', 'destination', 'price']

# Show the last rows before and after trimming to eyeball the effect.
print(clean[preview_columns].tail(30))
clean['origin'] = clean['origin'].apply(strip_blanks)
clean['destination'] = clean['destination'].apply(strip_blanks)
clean['price'] = clean['price'].apply(strip_dot)
print(clean[preview_columns].tail(30))

clean.to_csv(join(base_dir, "clean.csv"), encoding='utf-8')

In [None]:
# Re-read the trimmed CSV from disk and print the frame's column summary.
clean_csv_path = join(base_dir, "clean.csv")
clean = pd.read_csv(clean_csv_path, encoding='utf-8', index_col=0)
clean.info()

In [None]:
def convert_price(price):
    """Turn a price such as ``"1,200"`` into the float ``1200.0``.

    Commas are removed before the conversion. The value is passed through
    ``str()`` first, so a price that is already numeric (for example after
    re-running this cell, or when the CSV was saved with converted prices)
    is returned as a float instead of raising AttributeError.
    """
    return float(str(price).replace(',',''))


clean.price = clean['price'].apply(convert_price)
clean.to_csv(join(base_dir, "clean.csv"), encoding='utf-8')

In [None]:
import nltk

# Separators between origins: punctuation, or the stand-alone conjunction
# "y"/"Y". The \b word boundaries stop a "y" that is part of a name
# (e.g. "Monterrey") from being treated as a separator and cutting the name.
ORIGIN_SEPARATOR = r'(?:\b[yY]\b|[,\.\?!"])\s*'


def tolist(origin):
    """Split a text listing several origins into a list of clean names.

    The text is cut at every ``ORIGIN_SEPARATOR`` match; each piece is
    stripped of surrounding whitespace and empty pieces are dropped.
    """
    pieces = nltk.tokenize.regexp.regexp_tokenize(origin, ORIGIN_SEPARATOR, gaps=True)
    return [piece.strip() for piece in pieces if piece.strip()]


# One output row per individual origin; the other fields are repeated.
separate_origins = [
    [origin, row['destination'],
     row['date'], row['price'],
     row['note'], row['url']]
    for index, row in clean.iterrows()
    for origin in tolist(row['origin'])
]

separa_origin_df = pd.DataFrame(separate_origins, columns=['origin', 'destination',
                                                           'date', 'price',
                                                           'note', 'url'])

separa_origin_df.to_csv(join(base_dir, "separate_origins.csv"), encoding='utf-8')

In [None]:
# Every distinct name that appears as an origin or as a destination, sorted.
origin_names = set(separa_origin_df.origin.unique())
destination_names = set(separa_origin_df.destination.unique())
unique_locations = sorted(origin_names | destination_names)
print(unique_locations)

In [None]:
import geocoder

# Abbreviations found in the labels mapped to the full place name that is
# sent to the geocoder.
real_locations = {'CDMX': 'Ciudad de México', 
                  'CUN': 'Cancún', 'GDL': 'Guadalajara',
                  'L.A.': 'Los Angeles', 'LA': 'Los Angeles',
                  'MTY': 'Monterrey', 'NYC': 'New York City', 
                  'PUE': 'Puebla', 'QRO': 'Querétaro',
                  'SLP': 'San Luis Potosí',
                  'TIJ': 'Tijuana', 'VER': 'Veracruz'
                 }

# The key is read from the GOOGLE_API_KEY environment variable so that a real
# credential never has to be written into the notebook; the original
# placeholder text is kept as the fallback.
google_api_key = os.environ.get('GOOGLE_API_KEY', 'GOOGLE KEY API')

# Resolved place name -> coordinates returned by the geocoder (g.latlng).
location_dic = {}
# The loop used to read an undefined name `locations`; `unique_locations` is
# the list of place names built earlier in the notebook.
# NOTE(review): the [21:] offset is kept from the original — presumably it
# skips leading entries that are not places or were already handled; confirm.
for location in unique_locations[21:]:
    location = real_locations.get(location, location)
    if location in location_dic:
        # Several abbreviations resolve to the same name; geocode it once.
        continue
    g = geocoder.google(location, key=google_api_key)
    if g.ok:
        location_dic[location] = g.latlng

In [None]:
def get_real_name(l):
    """Return the full place name for an abbreviation, or ``l`` unchanged."""
    return real_locations.get(l, l)


def get_lat(l):
    """Return the first coordinate stored for place ``l``, or NaN when unknown."""
    return location_dic.get(l, (np.nan,np.nan))[0]


def get_long(l):
    """Return the second coordinate stored for place ``l``, or NaN when unknown."""
    return location_dic.get(l, (np.nan,np.nan))[1]


# location_dic is keyed by the resolved names, so both columns have to be
# resolved before the lookup; otherwise an abbreviated destination such as
# 'CUN' never finds its coordinates and ends up as NaN.
separa_origin_df['origin'] = separa_origin_df.origin.apply(get_real_name)
separa_origin_df['destination'] = separa_origin_df.destination.apply(get_real_name)
separa_origin_df['origin_lat'] = separa_origin_df.origin.apply(get_lat)
separa_origin_df['origin_long'] = separa_origin_df.origin.apply(get_long)
separa_origin_df['destination_lat'] = separa_origin_df.destination.apply(get_lat)
separa_origin_df['destination_long'] = separa_origin_df.destination.apply(get_long)

# Put each place next to its coordinates in the final column order.
separa_origin_df = separa_origin_df[['origin','origin_lat','origin_long',
                                     'destination','destination_lat','destination_long',
                                     'date', 'price', 'url','note']]
separa_origin_df.to_csv(join(base_dir, "separate_origins.csv"), encoding='utf-8')
separa_origin_df.head()

In [None]:
# Display pandas' descriptive statistics for the `clean` frame.
clean.describe()

In [None]:
# Show the row of `clean` that holds the lowest price, looked up by position.
cheapest_position = clean.price.argmin()
clean.iloc[cheapest_position]

In [None]:
# Collect every individual token that appears in the `origin` column.
# NOTE(review): the variables are named after destinations but the values
# come from clean.origin — confirm which column was intended.
unique_destinations = clean.origin.unique()

dest = set()

for u in unique_destinations:
    # Tokens are runs of word characters, dots and apostrophes.
    parts = re.findall(r"[\w\.']+", u)
    print(u, parts)
    for part in parts:
        # Skip only the conjunction itself. The original `break` stopped at
        # the first "y" and silently dropped every token that came after it.
        if part in ('', 'y', 'Y'):
            continue
        dest.add(part.strip())
print(dest)