In [38]:
library(XML)

In [42]:
# Relative paths (below the site root) of the women's results page for each
# race year, 1999 through 2012, in chronological order.
WomenURLs <- c(
  "results/1999/cb99f.html",
  "results/2000/Cb003f.htm",
  "results/2001/oof_f.html",
  "results/2002/ooff.htm",
  "results/2003/CB03-F.HTM",
  "results/2004/women.htm",
  "results/2005/CB05-F.htm",
  "results/2006/women.htm",
  "results/2007/women.htm",
  "results/2008/women.htm",
  "results/2009/09cucb-F.htm",
  "results/2010/2010cucb10m-F.htm",
  "results/2011/2011cucb10m-F.htm",
  "results/2012/2012cucb10m-F.htm"
)

# Site root that every relative path is appended to.
# NOTE(review): the fetch cell in this notebook recorded
# "failed to load HTTP resource", while extractResTable's default url points
# at a web.archive.org snapshot of this same host -- confirm whether the
# live site is still the right base.
ubase <- "http://www.cherryblossom.org/"

# Full URL for every year (character vector, same order as WomenURLs)
urls <- paste0(ubase, WomenURLs)

# Show the first three URLs as a sanity check
urls[1:3]

# Race years, index-aligned with urls
years <- 1999:2012

In [40]:
extractResTable =
  #
  # Retrieve one year's race-results page, locate the preformatted results
  # text, and either return it as a character vector (one element per line)
  # or write those lines to a file.
  #
  # Arguments:
  #   url  - location handed to htmlParse().
  #   year - race year; selects which page-specific extraction rule is used.
  #   sex  - "female" enables the 1999 and 2009 special cases below.
  #   file - NULL to return the lines; otherwise passed to writeLines() as
  #          its `con` argument.
  #
  # Value:
  #   The character vector of lines when `file` is NULL; otherwise whatever
  #   writeLines() returns, after the lines have been written.
  #
  function(url = "http://web.archive.org/web/20180803073407/http://www.cherryblossom.org/results/2009/09cucb-F.htm",
           year = 1999, sex = "female", file = NULL)
  {
    doc = htmlParse(url)

    # `&&` is used in the conditions below because `if` needs a single
    # TRUE/FALSE; the element-wise `&` is meant for vectors.
    if (year == 2000) {
      # Get preformatted text from 4th font element.
      # The top of the file is ill formed so the <pre> search doesn't work.
      ff = getNodeSet(doc, "//font")
      txt = xmlValue(ff[[4]])
      els = strsplit(txt, "\r\n")[[1]]
    }
    else if (year == 2009 && sex == "female") {
      # Get preformatted text from the <div class="Section1"> element.
      # Each line of results is in its own <pre> element.
      div1 = getNodeSet(doc, "//div[@class='Section1']")
      # The leading "." makes the search relative to the div; a bare "//pre"
      # is an absolute path and would scan the entire document instead.
      pres = getNodeSet(div1[[1]], ".//pre")
      els = sapply(pres, xmlValue)
    }
    else if (year == 1999 && sex == "female") {
      # Get preformatted text from the first <pre> element.
      # This page is split on bare "\n" rather than "\r\n".
      pres = getNodeSet(doc, "//pre")
      txt = xmlValue(pres[[1]])
      els = strsplit(txt, "\n")[[1]]
    }
    else {
      # Default: first <pre> element, lines separated by "\r\n".
      pres = getNodeSet(doc, "//pre")
      txt = xmlValue(pres[[1]])
      els = strsplit(txt, "\r\n")[[1]]
    }

    if (is.null(file)) return(els)
    # Write the lines as a text file.
    writeLines(els, con = file)
  }

In [41]:
# Fetch every year's page and extract its lines: one extractResTable call
# per (url, year) pair.
WomenTables <- mapply(FUN = extractResTable, url = urls, year = years)

# Label each year's result with its race year
names(WomenTables) <- years

# Number of lines retrieved for each year
vapply(WomenTables, length, integer(1))

ERROR: Error: failed to load HTTP resource



In [None]:
# 2012 lines with the first eight entries dropped
w2012 <- WomenTables[["2012"]][-(1:8)]

# Complete, untrimmed 2012 lines
els <- WomenTables[["2012"]]

# Peek at the first ten lines
els[1:10]

In [None]:
# Pull out the 2011 lines and peek at the first ten
els2011 <- WomenTables[["2011"]]
els2011[1:10]

In [None]:
# Index of every line in `els` that begins with "===" (the spacer row)
eqIndex <- grep(pattern = "^===", x = els)
eqIndex

In [None]:
# Same lookup without a regular expression: take the first three characters
# of every line and compare them with "===".
first3 <- substr(els, start = 1, stop = 3)
which(first3 == "===")

In [None]:
# The "===" spacer line itself
spacerRow <- els[eqIndex]

# The line directly above the spacer is taken as the header; it is stored
# lower-cased.
headerRow <- tolower(els[eqIndex - 1])

# Every line after the spacer is kept as the body
body <- els[-(1:eqIndex)]

In [None]:
# Character position of the first "ag" in the (lower-cased) header row
ageStart <- regexpr(pattern = "ag", text = headerRow)
ageStart

# Take the two characters of each body line starting at that position
age <- substr(body, start = ageStart, stop = ageStart + 1)
head(age)

# Numeric summary of the extracted values
summary(as.numeric(age))

In [None]:
# Positions of the blanks in the spacer row
blankLocs <- gregexpr(pattern = " ", text = spacerRow)
blankLocs

# Prepend 0 so consecutive entries bracket each field
searchLocs <- c(0, blankLocs[[1]])

# Cut every body line into fields: each field runs from one past a boundary
# to one before the next boundary. mapply makes one substr() call per field
# over the whole body.
Values <- mapply(substr, list(body),
                 start = head(searchLocs, -1) + 1,
                 stop = tail(searchLocs, -1) - 1)