In [2]:
library(tidyverse)
library(data.table)
library(foreach)
library(doParallel)

In [3]:
ex = "/n/holylfs/LABS/hoekstra_lab/brock/daymet/rawdata/lat_39.8171000_lon_-99.9428200.csv"
a=read.csv(ex,skip=7)

In [8]:
head(a)

year,yday,dayl..s.,prcp..mm.day.,srad..W.m.2.,swe..kg.m.2.,tmax..deg.c.,tmin..deg.c.,vp..Pa.
1980,1,34905.6,0,272.0,0,20.0,8.5,400
1980,2,34905.6,0,169.6,0,14.5,8.5,720
1980,3,34905.6,0,268.8,0,15.0,4.0,320
1980,4,34905.6,0,300.8,0,15.5,1.5,240
1980,5,34905.6,0,259.2,0,13.0,3.0,320
1980,6,34905.6,0,243.2,0,14.0,5.0,400


In [40]:
y = a %>% filter(yday %in% july) %>% summarize(mean_sum=mean(c(tmax..deg.c.,tmin..deg.c.)))
z = a %>% filter(yday %in% january) %>% summarize(mean_win=mean(c(tmax..deg.c.,tmin..deg.c.)))
x = mean(c(a$tmax..deg.c.,a$tmin..deg.c.))
data.frame(x,y,z) %>% magrittr::set_colnames(c("mean","mean_july","mean_january"))

mean,mean_july,mean_january
18.28362,27.79708,9.240846


In [5]:
numextract <- function(string){ 
  str_extract(string, "\\-*\\d+\\.*\\d*")
} 

get_following_num = function(string,pattern){
    expr = paste0("(",pattern,").+([0-9])")
    val = numextract(str_extract(string=string, pattern=regex(expr,ignore_case = TRUE)))[1]
    return(as.numeric(val))
}

get_metadata = function(csv_path){
    x = readLines(csv_path,n=7) 
    lat = get_following_num(x[1],"Latitude")
    lon = get_following_num(x[1],"Longitude")
    elevation = get_following_num(x[4],"Elevation")
    tile = get_following_num(x[3],"Tile")
    vals = c(lat,lon,elevation,tile)
    names(vals) = c("lat","lon","elevation","tile")
    return(vals)
}

get_tempdata = function(csv_path){
    july = seq(181,211)
    january = seq(1,31)
    dat = read.csv(csv_path,skip=7)
    mean_july = dat %>% filter(yday %in% july) %>% summarize(mean_sum=mean(c(tmax..deg.c.,tmin..deg.c.))) %>% .[1,1]
    mean_january = dat %>% filter(yday %in% january) %>% summarize(mean_win=mean(c(tmax..deg.c.,tmin..deg.c.))) %>% .[1,1]
    mean_temp = mean(c(dat$tmax..deg.c.,dat$tmin..deg.c.))
    mean_prec = mean(dat$prcp..mm.day.)
    mean_rad = mean(dat$srad..W.m.2.)
    vals = c(mean_temp,mean_july,mean_january,mean_prec,mean_rad)
    names(vals) = c("mean_annual_temp","mean_july_temp","mean_january_temp","mean_prec","mean_rad")
    return(vals)
}

get_alldata = function(csv_path){
    a = get_metadata(csv_path)
    b = get_tempdata(csv_path)
    df = data.frame(matrix(c(a,b),nrow=1,ncol=9))
    return(df)
}

get_alldata(ex)


X1,X2,X3,X4,X5,X6,X7,X8,X9
39.8171,-99.94282,719,11561,11.20508,25.29579,-2.242456,1.726734,346.0423


In [6]:
files = list.files("/n/holylfs/LABS/hoekstra_lab/brock/daymet/rawdata/",full.names=TRUE)
length(files)
#files = files[1:5000]

In [7]:
files = list.files("/n/holylfs/LABS/hoekstra_lab/brock/daymet/rawdata/",full.names=TRUE)
length(files)
ptm = proc.time()
r=lapply(files,function(path){get_alldata(path)})
proc.time() - ptm

cl <- parallel::makeCluster(4)
doParallel::registerDoParallel(cl)
ptm <- proc.time()
lst = foreach(i=1:length(files),.packages=c("tidyverse","magrittr")) %dopar% {
    get_alldata(files[i])
}
proc.time() - ptm

In [75]:
a = rbindlist(lst)
head(a)

X1,X2,X3,X4,X5,X6,X7
35.40019,-119.4658,84,11191,18.28362,27.79708,9.240846
35.40019,-119.4658,84,11191,18.28362,27.79708,9.240846
35.40019,-119.4658,84,11191,18.28362,27.79708,9.240846
35.40019,-119.4658,84,11191,18.28362,27.79708,9.240846
35.40019,-119.4658,84,11191,18.28362,27.79708,9.240846
35.40019,-119.4658,84,11191,18.28362,27.79708,9.240846
