Skip to content

Commit

Permalink
making 15.1 analysis slightly more reproducible
Browse files Browse the repository at this point in the history
script to combine multiple scrapes of the same leaderboard
making ScrapeLeaderboard a function
  • Loading branch information
swiftsam committed Mar 7, 2015
1 parent 7f0580d commit bd6eaf3
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 42 deletions.
18 changes: 16 additions & 2 deletions analysis_2015_1.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
library(data.table)
library(ggplot2)
library(scales)

# leaderboard <- QueryDB("SELECT * FROM leaderboard WHERE year = 15;")
load("data/leaderboard.15.RData")
load("data/leaderboard.combined.pull.RData")

# create factors
leaderboard[, gender := factor(division, levels=2:1, labels=c("Women","Men"), ordered=TRUE)]
Expand All @@ -13,7 +14,17 @@ leaderboard[, participated := factor(is.na(score), levels=c(F,T), labels=c("WOD'
setkeyv(leaderboard, c("wod", "participated", "scaled", "athlete_id"))
leaderboard <- unique(leaderboard)

# Headline counts: number of distinct athletes overall and for each gender.
n.athletes <- leaderboard[, uniqueN(athlete_id)]
n.men <- leaderboard[gender == "Men", uniqueN(athlete_id)]
n.women <- leaderboard[gender == "Women", uniqueN(athlete_id)]
# Auto-printed when the script is run: total athletes, then the share of women.
n.athletes
n.women / n.athletes

# drop outs
# Share of distinct athletes found at each level of `participated`,
# first over everyone, then separately for men and for women.
leaderboard[, .(uniqueN(athlete_id) / n.athletes), by = participated]
leaderboard[gender == "Men", .(uniqueN(athlete_id) / n.men), by = participated]
leaderboard[gender == "Women", .(uniqueN(athlete_id) / n.women), by = participated]

ggplot(leaderboard[wod == "15.1",
list( N = length(unique(athlete_id))),
by=list(gender, participated)],
Expand All @@ -31,12 +42,15 @@ ggplot(leaderboard[wod == "15.1",
ggsave(filename="~/Desktop/crossfit_15.1_athlete_count.png",width=10, height=4)

# plot Scaled vs RX
leaderboard[participated == "WOD'd", list(length(unique(athlete_id)) / n.athletes), by=scaled]

ggplot(leaderboard[!is.na(score) & wod == "15.1",
.N, by=list(scaled, gender, wod)],
aes(gender, N, fill=scaled)) +
geom_bar( stat="identity", position = "stack") +
geom_text(aes(label=N),position = "stack", hjust=1.2) +
scale_fill_manual(values=c("#a6cee3","#b2df8a")) +
scale_y_continuous(breaks=seq(0,150000, 25000),labels=comma)+
coord_flip() +
labs(y="Number of Athletes",
x="",
Expand Down Expand Up @@ -97,7 +111,7 @@ ggplot(score.wide[!is.na(W15_1) & !is.na(W15_1A)],
limits=c(0,250)) +
facet_grid(gender~., scales="free") +
theme_bw(base_size = 18)
ggsave(filename="~/Desktop/crossfit_15.1v1A_scatter.png",width=10, height=6)
ggsave(filename="~/Desktop/crossfit_15.1v1A_scatter.png",width=8, height=7)



Expand Down
18 changes: 18 additions & 0 deletions combine_leaderboard_scrapes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Combine two scrapes of the same leaderboard into one table.
#
# Both pulls are stacked, ordered by retrieval time, and de-duplicated so that
# for every (athlete_id, stage, score) combination only the most recently
# retrieved row survives. The result is saved under the name `leaderboard` in
# data/leaderboard.combined.pull.RData.
library(data.table)

# Load one scrape from an .RData file into a private environment and return it.
#
# path:      path of the .RData file.
# preferred: object names to look for, in order of preference.
#
# Loading into a private environment keeps one file from silently overwriting
# an object restored from another, and removes the dependency on the exact
# variable name the scrape was saved under. If none of the preferred names is
# present, the file must contain exactly one object, which is returned.
LoadScrape <- function(path, preferred = "leaderboard") {
  scrape.env <- new.env()
  loaded <- load(path, envir = scrape.env, verbose = TRUE)
  hit <- intersect(preferred, loaded)
  if (length(hit) == 0L) {
    if (length(loaded) != 1L) {
      stop("Cannot tell which object in ", path, " is the leaderboard: ",
           paste(loaded, collapse = ", "), call. = FALSE)
    }
    hit <- loaded
  }
  get(hit[1L], envir = scrape.env)
}

leaderboard.15a <- LoadScrape("data/leaderboard.15.RData")

# NOTE(review): this script previously read the second pull from a variable
# called `leaderboard15.b`, which nothing here defines. That name is tried
# first in case the file really stores it; otherwise `leaderboard` is used.
leaderboard.15b <- LoadScrape("data/leaderboard.15b.RData",
                              preferred = c("leaderboard15.b", "leaderboard"))

leaderboard.meta <- rbind(leaderboard.15a, leaderboard.15b)
leaderboard.meta <- leaderboard.meta[order(retrieved_datetime), ]

# Rows per pull date before de-duplication (printed for inspection).
leaderboard.meta[, .N, by = list(pull_date = as.Date(retrieved_datetime))]

setkeyv(leaderboard.meta, c("athlete_id", "stage", "score"))

# fromLast = TRUE keeps the last row of each duplicate group. Rows were ordered
# by retrieved_datetime before keying, so that is the most recent retrieval.
leaderboard.meta <- unique(leaderboard.meta, fromLast = TRUE,
                           by = c("athlete_id", "stage", "score"))
# Rows per pull date after de-duplication (printed for inspection).
leaderboard.meta[, .N, by = list(pull_date = as.Date(retrieved_datetime))]

leaderboard <- leaderboard.meta

save(leaderboard, file = "data/leaderboard.combined.pull.RData")
86 changes: 46 additions & 40 deletions scrape_leaderboard.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,52 @@ source("process_fns.R")
source("scrape_fns.R")
source("db_query.R")

# Scrape every leaderboard page that has not been retrieved yet.
#
# Reads the rows of leaderboard_pages whose retrieved_datetime is NULL, fetches
# each one with GetLeaderboardPage(), appends the fetched records to the
# `leaderboard` table and stamps the page row with the retrieval time.
# One status line per page (or a single "No pages to scrape" line) is emitted
# with message().
#
# Takes no arguments. Called for its side effects; returns NULL invisibly.
# Assumes QueryDB() returns a data.table (it is indexed with `[i, col]` and
# `.N`) -- TODO confirm against db_query.R.
ScrapeLeaderboard <- function() {
  pages <- QueryDB("SELECT * FROM leaderboard_pages WHERE retrieved_datetime IS NULL;")
  n.pages <- pages[, .N]

  if (n.pages == 0) {
    message(Sys.time(), " No pages to scrape")
    return(invisible(NULL))
  }

  for (i in seq_len(n.pages)) {
    year <- pages[i, year]
    division <- pages[i, division]
    stage <- pages[i, stage]
    page <- pages[i, pages]

    # Every stage is scored in points, except year 14 stage 5 which is timed.
    score.type <- "points"
    if (year == 14 && stage == 5) {
      score.type <- "time"
    }

    lb.page <- GetLeaderboardPage(year, division, stage, page, score.type)

    if (is.null(lb.page)) {
      message(Sys.time(), " ** Fail! year: ",year," div: ",division," stage: ",stage," page: ",page)
      next
    }

    # write new leaderboard records
    db.con <- dbConnect(RMySQL::MySQL(),
                        dbname = "crossfit",
                        user = "crossfit",
                        password = "",
                        host = "127.0.0.1")

    # `finally` closes the connection even when the write or the bookkeeping
    # UPDATE fails, so an error no longer leaks an open MySQL connection.
    tryCatch({
      dbWriteTable(db.con, name = "leaderboard", value = lb.page,
                   row.names = FALSE, append = TRUE)

      # Explicit format keeps the timestamp literal independent of how the
      # running R version converts POSIXct to character.
      retrieved.at <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
      QueryDB(paste0(
        "UPDATE leaderboard_pages
        SET retrieved_datetime = '",retrieved.at,"'
        WHERE year = ",year,"
        AND division = ",division,"
        AND stage = ", stage, "
        AND pages = ",page))
    }, finally = dbDisconnect(db.con))

    message(Sys.time(), " Success! year: ",year," div: ",division," stage: ",stage," page: ",page)
  }

  invisible(NULL)
}




0 comments on commit bd6eaf3

Please sign in to comment.