**README**:

This code scrapes http://ojp.nationalrail.co.uk for journey times for each station in a list. Station codes must be provided.

**TODO**:

 - Get off-peak times as well (e.g. weekend / off-peak)
 - Get frequencies from departure times

In [1]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd
import datetime

# National Rail Enquiries

## Hardcoded

In [2]:
# Base URL template for an NRE journey query. The placeholders are filled in
# by get_st_webpage_soup via str.format:
#   {O}, {D}    - origin / destination station codes
#   {DateCode}  - travel date code (DateCode is built as ddmmyy)
#   {TimeCode}  - time-of-day code (e.g. '0900')
#   {schedule}  - 'arr' or 'dep'
burl = 'http://ojp.nationalrail.co.uk/service/timesandfares/{O}/{D}/{DateCode}/{TimeCode}/{schedule}'

In [3]:
# Query the day after the notebook is run, so results depend on the run date
# (tomorrow may fall on a weekday or a weekend).
tomorrow = datetime.date.today() + datetime.timedelta(days=1)
DateCode = tomorrow.strftime("%d%m%y")  # two-digit day, month, year: ddmmyy

# Time of day used in the query URL; presumably HHMM, i.e. 09:00 - TODO confirm.
TimeCode = '0900'

# Presumably selects whether TimeCode is an arrival or a departure time - TODO confirm.
schedule = 'arr'  # {'arr': 'Arrival', 'dep': 'Departure'}

In [4]:
# For NRE: substrings searched for in the text of the returned page.
# If any of them is found, get_st_webpage_soup raises WrongStationCode.
# The two commented-out messages are currently not treated as errors.
errors = [
    #"Sorry, no stations found.", 
    #"Sorry, we can't plan this journey.", 
    "You need to correct the fields marked with errors before continuing.", 
    "No matching station found.", 
    "error!", 
]

In [5]:
# Input CSV with the station pairs to look up; it is read with pd.read_csv
# and its O and D columns are passed to the scraper.
s_sts_fp = Path(r'stations.csv')

In [6]:
# Output CSV path; the table with the scraped NRE columns is written here.
ofp = 'NRE_JT.csv'

## Aux functions

In [7]:
class WrongStationCode(Exception):
    '''Raised when the NRE page contains one of the known error messages.'''

In [8]:
def get_st_webpage_soup(O, D, DateCode=DateCode, TimeCode=TimeCode, burl=burl, errs=errors, schedule=schedule, timeout=30):
    '''Retrieve and parse the NRE website for O, D station codes.

    Parameters:
        O, D: origin and destination station codes substituted into burl.
        DateCode, TimeCode, schedule: remaining URL fields. Their defaults
            are the module-level values as they were when this function was
            defined; re-run this cell after changing them.
        burl: URL template with {O}, {D}, {DateCode}, {TimeCode} and
            {schedule} placeholders.
        errs: iterable of messages that mark the page as an error page.
        timeout: seconds to wait for the server, passed to requests.get.
            Without a timeout a stalled connection blocks the notebook
            indefinitely.

    Returns:
        The page parsed as a BeautifulSoup object.

    Raises:
        WrongStationCode: if any message in errs appears in the page text.
        requests.exceptions.RequestException: propagated from requests.get
            on connection failure or timeout.
    '''

    url = burl.format(O=O, D=D, DateCode=DateCode, TimeCode=TimeCode, schedule=schedule)
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract the page text once instead of once per candidate message.
    page_text = soup.text
    for err in errs:
        if err in page_text:
            raise WrongStationCode(f"{O}, {D}, {DateCode}, or {TimeCode} are not a valid. Found:\n{err}")

    return soup

In [9]:
def mins_from_tag(tag, num_conv=int):
    '''Return the duration in minutes read from an HTML tag.

    The tag's stripped strings are treated as alternating value/unit
    pieces, e.g. '1', 'h', '3', 'm'. Only the 'h' and 'm' units are used;
    a unit that is absent counts as 0.

    num_conv is a function to deal with string to number conversion. It is
    also called on the integer 0 when a unit is missing.
    '''

    pieces = list(tag.stripped_strings)
    values, units = pieces[::2], pieces[1::2]
    value_by_unit = {unit: value for unit, value in zip(units, values)}  # {'h': '1', 'm': '3'}

    hours = num_conv(value_by_unit.get('h', 0))
    minutes = num_conv(value_by_unit.get('m', 0))
    return minutes + 60 * hours

In [10]:
def get_jny_durs(soup):
    '''Return the journey durations, in minutes, found on an NRE page.

    One entry is produced for every <td class="dur"> cell, in document
    order, each converted with mins_from_tag.
    '''
    durations = []
    for cell in soup.find_all('td', attrs={'class': 'dur'}):
        durations.append(mins_from_tag(cell))
    return durations

In [11]:
def joined_strings(tag):
    '''Concatenate the stripped strings of a tag with no separator.'''
    return ''.join(tag.stripped_strings)

In [12]:
def get_from_st(soup):
    '''Return the "From" station text from an NRE page, or '' if absent.

    The "from" cell of the first result row is tried first; if nothing
    matches, the last result row is tried instead.
    '''
    selectors = (
        '#oft > tbody > tr.first.mtx > td.from',
        '#oft > tbody > tr.last.mtx > td.from',
    )

    for selector in selectors:
        cells = soup.select(selector)
        if cells:
            return joined_strings(cells[0])

    return ''

In [13]:
def get_to_st(soup):
    '''Return the "To" station text from an NRE page, or '' if absent.

    The "to" cell of the first result row is tried first; if nothing
    matches, the last result row is tried instead.
    '''
    selectors = (
        '#oft > tbody > tr.first.mtx > td.to',
        '#oft > tbody > tr.last.mtx > td.to',
    )

    for selector in selectors:
        cells = soup.select(selector)
        if cells:
            return joined_strings(cells[0])

    return ''

In [14]:
def scrapped_O_D_time(O, D, **kwargs):
    '''Scrape one O/D pair and return (from_station, to_station, minutes).

    The station names are the ones shown on the NRE page and the minutes
    are the first journey duration listed there. Extra keyword arguments
    are forwarded to get_st_webpage_soup.

    Returns None if the page is reported as an error page
    (WrongStationCode) or if it lists no journey durations.
    '''

    try:
        page = get_st_webpage_soup(O, D, **kwargs)
    except WrongStationCode:
        return None

    durations = get_jny_durs(page)
    if not durations:
        return None

    origin_name = str(get_from_st(page))
    destination_name = str(get_to_st(page))

    return origin_name, destination_name, durations[0]

## Results

In [15]:
# Load the station pairs to look up; the O and D columns of each row are
# passed to the scraper further down.
sts = pd.read_csv(s_sts_fp)

In [16]:
# Display the input table before scraping.
sts

Unnamed: 0,O,D
0,CanaryWarf,DEP
1,CanaryWarf,GNW
2,CanaryWarf,CTN
3,CanaryWarf,WWA
4,CanaryWarf,ABW
5,CanaryWarf,SGR
6,CanaryWarf,NWX
7,CanaryWarf,LEW
8,CanaryWarf,BKH
9,CanaryWarf,BNH


In [17]:
# Columns that receive the scraped origin name, destination name and duration.
nre_cols = ['NRE_O', 'NRE_D', 'NRE_Minutes']


def _nre_lookup(row):
    '''Scrape one row's O/D pair and return the result labelled with nre_cols.

    scrapped_O_D_time returns None when a lookup fails, and pd.Series(None)
    is an empty Series. If every lookup failed, the applied result would have
    no columns and the three-column assignment below would raise a length
    mismatch. Failed lookups are therefore returned as a row of NaN so the
    result always has exactly three columns.
    '''
    found = scrapped_O_D_time(row.O, row.D)
    if found is None:
        found = (float('nan'),) * len(nre_cols)
    return pd.Series(found, index=nre_cols)


# One HTTP request is made per row, so this cell is slow for long station lists.
sts[nre_cols] = sts.apply(_nre_lookup, axis=1)

In [18]:
# Write the table, including the scraped NRE columns, to ofp;
# index=False leaves the row index out of the file.
sts.to_csv(ofp, index=False)

In [19]:
# Display the table with the scraped NRE columns added.
sts

Unnamed: 0,O,D,NRE_O,NRE_D,NRE_Minutes
0,CanaryWarf,DEP,Canary WharfDLR,Deptford [DEP],24
1,CanaryWarf,GNW,Canary WharfDLR,Greenwich [GNW],18
2,CanaryWarf,CTN,Canary WharfDLR,Charlton [CTN],34
3,CanaryWarf,WWA,Canary WharfDLR,Woolwich Arsenal [WWA],34
4,CanaryWarf,ABW,Canary WharfDLR,Abbey Wood [ABW],40
5,CanaryWarf,SGR,Canary WharfDLR,Slade Green [SGR],57
6,CanaryWarf,NWX,Canary WharfDLR,New Cross [NWX],26
7,CanaryWarf,LEW,Canary WharfDLR,Lewisham [LEW],23
8,CanaryWarf,BKH,Canary WharfDLR,Blackheath [BKH],29
9,CanaryWarf,BNH,Canary WharfDLR,Barnehurst [BNH],46
