-
Notifications
You must be signed in to change notification settings - Fork 4
/
swimmers.R
112 lines (99 loc) · 3.65 KB
/
swimmers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
library(RCurl)
library(XML)
library(stringr)
library(jsonlite)
# Build the catalogue of major swimming meets to scrape from swimrankings.net.
# Meet types (the `meetType` query parameter) and their schedules:
#   1        Olympics (every 4th year, from 2000)
#   2        World Championships (every odd year)
#   3        European Championships (every even year)
#   5        Commonwealth Games (non-Olympic 4th years: 2006, 2010, 2014, 2018)
#   7450054  Pan Pacific Championships (non-Olympic 4th years: 2006, 2010,
#            2014, 2018)
# The collected meet info is used later for data generation with Python and
# for the web app visualisation.
meetTypes <- c("1", "2", "3", "5", "7450054")
meetList <- list()  # meet id -> list(type, year, location, name, id)
meetIdsAll <- c()   # ids of every meet that was kept, in discovery order

for (mt in meetTypes) {
  print(mt)
  html <- getURL(paste0(
    "https://www.swimrankings.net/index.php?page=meetSelect&selectPage=BYTYPE&nationId=0&meetType=",
    mt
  ))
  doc <- htmlParse(html, asText = TRUE)

  # Data-quality check: count how many of the first 10 icons are the
  # meetQuality5.png indicator. head() is used instead of [1:10] so that a
  # page with fewer than 10 icons is not padded with NA (which made the
  # count NA and broke the indexing below).
  qualityIcons <- as.character(unlist(
    xpathSApply(doc, "//td[@class='name']/img", xmlGetAttr, "src")
  ))
  qualityCount <- sum(str_detect(head(qualityIcons, 10), "5"), na.rm = TRUE)
  count <- qualityCount + 1  # +1 accounts for the table header row
  print(count)

  # Meet ids: third "="-separated piece of each link's href.
  links <- as.character(unlist(
    xpathSApply(doc, "//td[@class='name']/a", xmlGetAttr, "href")
  ))
  meetIds <- vapply(
    str_split(head(links, qualityCount), "="),
    function(parts) parts[3],
    character(1)
  )

  # Meet rows: drop the header row, then keep the first qualityCount rows.
  # Using head() rather than 2:count avoids the backwards sequence 2:1
  # (header row included) when no meet passes the quality check.
  rows <- as.character(unlist(
    xpathSApply(doc, "//table[@class='meetSearch']/tr", xmlValue)
  ))
  meets <- head(rows[-1], qualityCount)

  for (i in seq_along(meets)) {
    meet <- meets[i]
    meetId <- meetIds[i]
    year <- str_extract(meet, "(1|2)[0-9]{3}")
    print(as.integer(year))

    # Keep only meets from 2017 onwards. Rows without a recognisable year or
    # without a matching link id are skipped instead of raising an error.
    if (is.na(year) || is.na(meetId) || as.integer(year) < 2017) {
      next
    }

    # Append meet id to all meet ids
    meetIdsAll <- c(meetIdsAll, meetId)

    # The text after the first "50m" holds "<location> (<NATION>)<name>".
    remains <- unlist(str_split(meet, "50m"))[2]
    location <- str_extract(remains, "^.*\\([A-Z]*\\)")
    location <- str_replace(location, "\u00a0", " ")
    name <- str_trim(unlist(str_split(remains, "\\)"))[2])
    print(name)
    print(meetId)

    meetList[[meetId]] <- list(
      type = as.character(mt),
      year = year,
      location = location,
      name = name,
      id = meetId
    )
  }
}
# Fetch the result page for every (meet, gender, style) combination, parse it,
# and save the parsed HTML to disk; the files are later used in Python.
genders <- c(1, 2)
# Style ids to request, keyed by gender code. Named with `=` so the list is
# actually keyed ("1", "2"); the previous `'1' <- c(...)` form created stray
# global variables and an unnamed list that only worked positionally.
styles <- list(
  "1" = c(1, 2, 3, 5, 8, 10, 11, 13, 14, 16, 17, 18, 19, 27, 29, 40),
  "2" = c(1, 2, 3, 5, 6, 10, 11, 13, 14, 16, 17, 18, 19, 27, 29, 40)
)
# Text of the first paragraph on pages that require a login.
blockedMessage <- "You need a valid Swimrankings account in order to access this site."

for (meet in meetIdsAll) {
  for (gender in genders) {
    for (style in styles[[as.character(gender)]]) {
      # Only handle meets still present in meetList; a meet is removed below
      # as soon as one of its pages turns out to be inaccessible.
      if (is.null(meetList[[meet]])) {
        next
      }
      print(meet)
      url <- paste0(
        "https://www.swimrankings.net/index.php?page=meetDetail&meetId=",
        meet,
        "&gender=",
        gender,
        "&styleId=",
        style
      )
      html <- getURL(url)
      doc <- htmlParse(html, asText = TRUE)

      # isTRUE() keeps the check well-defined when the page has no <p>
      # element (zero-length comparison) or the value is NA.
      firstParagraph <- unlist(xpathSApply(doc, "//p", xmlValue))[1]
      if (isTRUE(firstParagraph == blockedMessage)) {
        # Remove the meet so it is neither fetched again nor exported.
        meetList[meet] <- NULL
        print(c("not accessible", meet))
      } else {
        fileName <- paste0(
          "../python/R_results/html/", meet, "-", gender, "-", style, ".html"
        )
        # Divert console output into the file while the document is printed;
        # `finally` guarantees the diversion is lifted even if print() fails.
        sink(fileName)
        tryCatch(print(doc, type = "html"), finally = sink())
        print(fileName)
      }
    }
  }
}
# Convert the id-keyed meet list into a plain ordered (unnamed) list so that
# it is serialised as a JSON array of meet objects.
meetListArray <- unname(meetList)

# Save as a JSON file.
meetsJson <- toJSON(meetListArray)
write(meetsJson, "../python/R_results/json/meets.json")