In [6]:
import os
import requests
import json
import re
import time
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs

base_dir = "vuelax"
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

In [7]:
months = {'enero':1, 'febrero':2, 'marzo':3,
          'abril':4, 'mayo':5, 'junio':6,
          'julio':7,'agosto':8, 'septiembre':9,
          'octubre':10, 'noviembre':11, 'diciembre':12}

date_regex = re.compile('(\w+) ([0-9]+), ([0-9]{4})')

def date_converter(date):
    found = date_regex.search(date)
    if found:
        return datetime.datetime(year=int(found.group(3)), month=months[found.group(1)], day=int(found.group(2)))
    else:
        return np.nan

In [8]:
page_url = "http://www.vuelax.com/category/%s/page/%d/"

categories = ['oportunidades', 'uncategorized']

In [9]:
category_dates = { }
category_previous_frames = {}
data = None
last_date = None
for category in categories:
    original_file = join(base_dir, "%s.csv" % category)
    if os.path.exists(original_file):
        data = pd.read_csv(original_file, index_col=0, parse_dates=['date'], encoding='utf-8')
        last_date = data.iloc[0]['date']
        category_previous_frames[category] = data
        category_dates[category] = last_date
        print("Last date for \"%s\"" % category, last_date)

Last date for "oportunidades" 2018-02-10 00:00:00
Last date for "uncategorized" 2018-01-22 00:00:00


In [10]:
for category in categories:
    content = []
    break_category = False
    for page in range(1, 1000000):
        url = page_url % (category, page)
        op_page = requests.get(url)
        if page % 10 == 0:
            print("Requesting", url)
        if op_page.status_code != 200:
            break
        op_soup = BeautifulSoup(op_page.text, "html.parser")
        main_ul = op_soup.find("ul", {"class":"penci-grid"})
        articles = main_ul.findAll("article", {"class":"item"})
        for article in articles:
            grid_title = article.find("h2", {"class":"grid-title"})
            a = grid_title.find("a")
            grid_post_box_meta = article.find("div", {"class":"grid-post-box-meta"})
            date = date_converter(grid_post_box_meta.text.strip())
            if date <= category_dates[category]:
                print("I already have the date", date, "for the category \"%s\""%category)
                break_category = True
                break
            content.append([a.text, a.get('href'), date])
        if break_category:
            break
        
    data =  pd.DataFrame(content, columns= ["label", "url", "date"])
    
    
    if category in category_previous_frames:
        data = pd.concat([category_previous_frames[category], data])
    data.sort_values(by=['date', 'label'], ascending= False, inplace=True)
    data.to_csv(join(base_dir, "%s.csv" % category), encoding='utf-8')

print("Done")

Requesting http://www.vuelax.com/category/oportunidades/page/10/
Requesting http://www.vuelax.com/category/oportunidades/page/20/
I already have the date 2018-02-10 00:00:00 for the category "oportunidades"
I already have the date 2018-01-22 00:00:00 for the category "uncategorized"
Done


In [7]:
frames = []
for category in categories:
    frame = pd.read_csv(join(base_dir, "%s.csv" % category), index_col=0, encoding='utf-8')
    frames.append(frame)
data = pd.concat(frames).reset_index(drop=True)
data.head()

Unnamed: 0,label,url,date
0,"¡Vuelo Directo! CDMX a Calgary, Canadá – $5,540",http://www.vuelax.com/2018/02/10/vuelo-directo...,2018-02-10
1,"¡CDMX a Yellowknife, Ártico Canadiense! – $5,961",http://www.vuelax.com/2018/02/09/cdmx-a-yellow...,2018-02-09
2,"¡CDMX a Santa Marta, Colombia! – $6,867",http://www.vuelax.com/2018/02/09/cdmx-a-santa-...,2018-02-09
3,"L.A. a China – $7,239",http://www.vuelax.com/2018/02/09/l-a-a-china-7...,2018-02-09
4,"CDMX a India – $8,339",http://www.vuelax.com/2018/02/09/cdmx-a-india-...,2018-02-09


In [8]:
location_regex = re.compile('([\w0-9,\s\.]+) [a|A] ([\w0-9,\s\.]+)\s*[-|–|"desde"|"DESDE"]\s*\$([0-9\.,]+)')
location_regex_note = re.compile('([\w0-9,\s\.]+) [a|A] ([\w0-9,\s\.]+)\s*\(([\w\s]+)\)\s*[-|–|"desde"|"DESDE"]\s*\$([0-9\.,]+)')

clean_values = []
non_clean_values = []

for index, row in data.iterrows():
    label = row['label']
    de = None
    a = None
    price = None
    note = None
    
    find_simple = location_regex.search(label)
    if find_simple:
        de = find_simple.group(1)
        a = find_simple.group(2)
        por = find_simple.group(3)
    else:
        find_note = location_regex_note.search(label)
        if find_note:
            de = find_note.group(1)
            a = find_note.group(2)
            note = find_note.group(3)
            por = find_note.group(4)
        else:
            non_clean_values.append(row.values)
            
    if de is not None:
        clean_values.append([de, a, por,note, row["url"], row["date"]])

clean = pd.DataFrame(clean_values, columns= ["origin", "destination", "price", "note", "url", "date"])
still_dirty_df = pd.DataFrame(non_clean_values, columns= ["label", "url", "date"])



print("== Clean ==")
print(clean.head())
print(clean.info())
clean.to_csv(join(base_dir, "clean.csv"), encoding='utf-8')
print()
print("== Dirty ==")
print(still_dirty_df.head())
print(still_dirty_df.info())
still_dirty_df.to_csv(join(base_dir, "still_dirty.csv"), encoding='utf-8')

== Clean ==
       origin       destination  price  note  \
0        CDMX  Calgary, Canadá   5,540  None   
1        L.A.            China   7,239  None   
2        CDMX            India   8,339  None   
3  GDL y CDMX       Costa Rica   4,075  None   
4         NYC         Islandia   8,539  None   

                                                 url        date  
0  http://www.vuelax.com/2018/02/10/vuelo-directo...  2018-02-10  
1  http://www.vuelax.com/2018/02/09/l-a-a-china-7...  2018-02-09  
2  http://www.vuelax.com/2018/02/09/cdmx-a-india-...  2018-02-09  
3  http://www.vuelax.com/2018/02/08/gdl-y-cdmx-a-...  2018-02-08  
4  http://www.vuelax.com/2018/02/08/cdmx-a-nyc-ny...  2018-02-08  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1357 entries, 0 to 1356
Data columns (total 6 columns):
origin         1357 non-null object
destination    1357 non-null object
price          1357 non-null object
note           11 non-null object
url            1357 non-null object
date         

In [9]:
clean = pd.read_csv(join(base_dir, "clean.csv"), index_col = 0)

strip_blanks = lambda x: x.strip()
strip_dot = lambda x: x.strip('.')

print(clean[['origin','destination','price']].tail(30))
clean.origin = clean.origin.apply(strip_blanks)
clean.destination = clean.destination.apply(strip_blanks)
clean.price = clean.price.apply(strip_dot)
print(clean[['origin','destination','price']].tail(30))


clean.to_csv(join(base_dir, "clean.csv"), encoding='utf-8')

                       origin         destination   price
1327                     CDMX                NYC    4,008
1328                     CDMX          Guatemala    3,148
1329             Mérida y CUN          La Habana    2,661
1330                     CDMX   Puerto Escondido    1,157
1331                     CDMX          Bangalore    8,506
1332               GDL y CDMX           Shanghai   11,125
1333                   Cancún         Manchester    9,866
1334                     CDMX               Lima    4,991
1335                     CDMX               Lima    5,581
1336               CUN y CDMX            Orlando    2,271
1337                     CDMX                NYC    4,322
1338                   Canadá           Alemania    6,632
1339                     CDMX    Calgary, Canadá   5,318.
1340                     CDMX                NYC    5,126
1341                     L.A.          Tailandia    7,991
1342                      MTY             Panamá   7,043.
1343          

In [10]:
clean = pd.read_csv(join(base_dir, "clean.csv"), index_col = 0, encoding='utf-8')
clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1357 entries, 0 to 1356
Data columns (total 6 columns):
origin         1357 non-null object
destination    1357 non-null object
price          1357 non-null object
note           11 non-null object
url            1357 non-null object
date           1357 non-null object
dtypes: object(6)
memory usage: 74.2+ KB


In [11]:
convert_price = lambda price: float(price.replace(',',''))
clean.price = clean['price'].apply(convert_price)
clean.to_csv(join(base_dir, "clean.csv"), encoding='utf-8')

In [29]:
import nltk
tolist  = lambda origin: [t.strip() for t in nltk.tokenize.regexp.regexp_tokenize(origin, r'[y,\.\?!"]\s+', gaps=True)]

real_locations = {'CDMX': 'Ciudad de México', 
                  'CUN': 'Cancún', 'GDL': 'Guadalajara',
                  'L.A.': 'Los Angeles', 'LA': 'Los Angeles',
                  'MTY': 'Monterrey', 'NYC': 'New York City', 
                  'PUE': 'Puebla', 'QRO': 'Querétaro',
                  'SLP': 'San Luis Potosí',
                  'TIJ': 'Tijuana', 'VER': 'Veracruz'
                 }

separate_origins = []

for index, row in clean.iterrows():
    origins = tolist(row['origin'])
    for origin in origins:
        separate_origins.append([ real_locations.get(origin.strip(), origin.strip()), real_locations.get(row['destination'], row['destination']),
                                row['date'], row['price'],
                                row['note'], row['url']])
        
separa_origin_df = pd.DataFrame(separate_origins, columns=['origin', 'destination',
                                                           'date', 'price', 
                                                           'note', 'url'])



separa_origin_df.to_csv(join(base_dir, "separate_origins.csv"), encoding='utf-8')

In [30]:
unique_locations = sorted(set(list(separa_origin_df.origin.unique()) + list(separa_origin_df.destination.unique())))
print(len(unique_locations))

390


In [31]:
json_locations = join(base_dir, "location_data.json")
location_dic = {}
if os.path.exists(json_locations):
    with open(json_locations, "r") as s:
        location_dic = json.load(s)

In [32]:
import geocoder
import time
import requests

parameters = {
    'bounds':'',
    'components':'',
    'region': '',
    'language':'', 
    'key': 'GOOGLE API KEY'
}

for l in unique_locations:
    l = real_locations.get(l, l)
    if l in location_dic:
        continue
    print("Reading %s" % l)
    parameters["address"] = l
    mapinfo = requests.get("https://maps.googleapis.com/maps/api/geocode/json", parameters)
    if mapinfo.status_code == 200:
        location_dic[l] = mapinfo.text
    time.sleep(2)
    
with open(join(base_dir, "location_data.json"), "w") as s:
    json.dump(location_dic, s)

Reading VER,
Reading o Monterrey


In [33]:
def get_original_location(location):
    if location in location_dic:
        jj = json.loads(location_dic[location])
        if len(jj["results"]) > 0:
            return jj["results"][0]["formatted_address"];
    return location

def get_lat(location):
    if location in location_dic:
        jj = json.loads(location_dic[location])
        if len(jj["results"]) > 0:
            return jj["results"][0]["geometry"]["location"]["lat"];
    return np.nan

def get_long(location):
    if location in location_dic:
        jj = json.loads(location_dic[location])
        if len(jj["results"]) > 0:
            return jj["results"][0]["geometry"]["location"]["lat"];
    return np.nan

separa_origin_df["clean_origin"] = separa_origin_df.origin.apply(get_original_location)
separa_origin_df["clean_destination"] = separa_origin_df.destination.apply(get_original_location)

separa_origin_df['origin_lat'] = separa_origin_df.origin.apply(get_lat)
separa_origin_df['origin_long'] = separa_origin_df.origin.apply(get_long)
separa_origin_df['destination_lat'] = separa_origin_df.destination.apply(get_lat)
separa_origin_df['destination_long'] = separa_origin_df.destination.apply(get_long)

separa_origin_df.head(10)

Unnamed: 0,origin,destination,date,price,note,url,clean_origin,clean_destination,origin_lat,origin_long,destination_lat,destination_long
0,Ciudad de México,"Calgary, Canadá",2018-02-10,5540.0,,http://www.vuelax.com/2018/02/10/vuelo-directo...,"Mexico City, CDMX, Mexico","Calgary, AB, Canada",19.432608,19.432608,51.048615,51.048615
1,Los Angeles,China,2018-02-09,7239.0,,http://www.vuelax.com/2018/02/09/l-a-a-china-7...,"Los Angeles, CA, USA",China,34.052234,34.052234,35.86166,35.86166
2,Ciudad de México,India,2018-02-09,8339.0,,http://www.vuelax.com/2018/02/09/cdmx-a-india-...,"Mexico City, CDMX, Mexico",India,19.432608,19.432608,20.593684,20.593684
3,Guadalajara,Costa Rica,2018-02-08,4075.0,,http://www.vuelax.com/2018/02/08/gdl-y-cdmx-a-...,"Guadalajara, Jalisco, Mexico",Costa Rica,20.659699,20.659699,9.748917,9.748917
4,Ciudad de México,Costa Rica,2018-02-08,4075.0,,http://www.vuelax.com/2018/02/08/gdl-y-cdmx-a-...,"Mexico City, CDMX, Mexico",Costa Rica,19.432608,19.432608,9.748917,9.748917
5,New York City,Islandia,2018-02-08,8539.0,,http://www.vuelax.com/2018/02/08/cdmx-a-nyc-ny...,"New York, NY, USA",Iceland,40.712775,40.712775,64.963051,64.963051
6,Ciudad de México,"Milán, Italia",2018-02-08,13030.0,,http://www.vuelax.com/2018/02/08/cdmx-a-milan-...,"Mexico City, CDMX, Mexico","Milan, Metropolitan City of Milan, Italy",19.432608,19.432608,45.464204,45.464204
7,Ciudad de México,Lima,2018-02-08,5573.0,,http://www.vuelax.com/2018/02/08/cdmx-a-lima-5...,"Mexico City, CDMX, Mexico","Lima, Peru",19.432608,19.432608,-12.046373,-12.046373
8,Cancún,Dubái,2018-02-07,12685.0,,http://www.vuelax.com/2018/02/07/cun-y-cdmx-a-...,"Cancún, Quintana Roo, Mexico",Dubai - United Arab Emirates,21.161908,21.161908,25.204849,25.204849
9,Ciudad de México,Dubái,2018-02-07,12685.0,,http://www.vuelax.com/2018/02/07/cun-y-cdmx-a-...,"Mexico City, CDMX, Mexico",Dubai - United Arab Emirates,19.432608,19.432608,25.204849,25.204849


In [34]:
separa_origin_df = separa_origin_df[['origin','clean_origin','origin_lat','origin_long',
                                     'destination', 'clean_destination','destination_lat','destination_long',
                                     'date', 'price', 'url','note']]
separa_origin_df.to_csv(join(base_dir, "separate_origins.csv"), encoding='utf-8')
separa_origin_df.head()

Unnamed: 0,origin,clean_origin,origin_lat,origin_long,destination,clean_destination,destination_lat,destination_long,date,price,url,note
0,Ciudad de México,"Mexico City, CDMX, Mexico",19.432608,19.432608,"Calgary, Canadá","Calgary, AB, Canada",51.048615,51.048615,2018-02-10,5540.0,http://www.vuelax.com/2018/02/10/vuelo-directo...,
1,Los Angeles,"Los Angeles, CA, USA",34.052234,34.052234,China,China,35.86166,35.86166,2018-02-09,7239.0,http://www.vuelax.com/2018/02/09/l-a-a-china-7...,
2,Ciudad de México,"Mexico City, CDMX, Mexico",19.432608,19.432608,India,India,20.593684,20.593684,2018-02-09,8339.0,http://www.vuelax.com/2018/02/09/cdmx-a-india-...,
3,Guadalajara,"Guadalajara, Jalisco, Mexico",20.659699,20.659699,Costa Rica,Costa Rica,9.748917,9.748917,2018-02-08,4075.0,http://www.vuelax.com/2018/02/08/gdl-y-cdmx-a-...,
4,Ciudad de México,"Mexico City, CDMX, Mexico",19.432608,19.432608,Costa Rica,Costa Rica,9.748917,9.748917,2018-02-08,4075.0,http://www.vuelax.com/2018/02/08/gdl-y-cdmx-a-...,


In [35]:
separa_origin_df = pd.read_csv(join(base_dir, "separate_origins.csv"), encoding='utf-8')
uk = separa_origin_df[separa_origin_df.clean_destination.str.contains("UK") & separa_origin_df.clean_origin.str.contains("Mexico")]
mexico_uk = uk.loc[:,(['date','clean_origin', 'clean_destination', 'price'])]
mexico_uk.date = pd.to_datetime(mexico_uk.date)
mexico_uk.set_index('date', inplace=True)
mexico_uk.index.name = 'date'
mexico_uk.sort_index(inplace=True)

In [36]:
origins = mexico_uk.clean_origin.unique().tolist()
destination = mexico_uk.clean_destination.unique().tolist()
print(mexico_uk.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52 entries, 2016-10-20 to 2018-01-29
Data columns (total 3 columns):
clean_origin         52 non-null object
clean_destination    52 non-null object
price                52 non-null float64
dtypes: float64(1), object(2)
memory usage: 1.6+ KB
None
