# EIMS Data
This script will read in the individual EIMS data files from the given directory and compile/prase the data for the model.

In [None]:
source('source.r')

# File Input

First we need to set the directory and file name criteria of the data to load. We also set the date-time format that the parser will be expecting.

In [8]:
input.dir = 'Raw Data/EIMS/'
output.dir = 'RStates/EIMS/'
dt.format = "%m/%d/%y %I:%M %p"
dt.format2 = "%m/%d/%y %I:%M"

## Generate list of files in the directory
file.list = list.files(input.dir)

## split file.list into usable lists
file.list.nu = file.list[grepl('not usable', file.list)] # not usable
file.list.2 = file.list[grepl('corr.txt', file.list)] # usable

## Print total number of files in each category
print(paste('Number of files in file.list:', length(file.list)))
print(paste('Number of files in file.list.2:', length(file.list.2)))
print(paste('Number of files in file.list.nu:', length(file.list.nu)))


[1] "Number of files in file.list: 177"
[1] "Number of files in file.list.2: 76"
[1] "Number of files in file.list.nu: 5"


In [9]:
file.list.2

In [10]:
## Read in first data file and configure formatting around it
data = read.csv(paste0(input.dir, file.list.2[12]), sep='\t')
file.list.2[10]
data$time = strptime(data$time, format = dt.format2, tz='UTC')

data$time[10] # Sample value

[1] "2021-06-05 04:26:00 UTC"

# Main Functions

These functions will do the heavy lifting for the EIMS preparation. The __avg.eims()__ generates the average of N minutes of samples while __filter.eims()__ goes QC type of work by removing high/low pressure samples and handles missing values.

In [11]:
## Average
avg.eims = function(x, N, flag = 1) {
    i = 1
    
    while (i < nrow(x)) {
        dt = as.numeric(difftime(x$time, x$time[i], unit='secs'))
        l = which(dt >= 0 & dt < N*60)
        
        ## Calculate the mean of the relevant columns, removing na's
        if (flag == 1) {
            x[i, 3:ncol(x)] = apply(x[l, 3:ncol(x)], 2, function(x) {mean(x, na.rm = TRUE)})
        } else {
            x[i, 3:ncol(x)] = apply(x[l, 3:ncol(x)], 2, function(x) {median(x, na.rm = TRUE)})
        }
        
        if (max(dt[l], na.rm = TRUE, 0) <= N / 2) {
            x$Flag[i] = 3
        }
    
        if (length(l) > 1) {
            l = l[l != i]
            if (length(l) > 1) {
                x = x[-l,]
            }
        }
        i = i + 1
    }
    x
}

## Prepare filter function
filter.eims = function(x) {
    x$Flag = 1
    
    ## Remove entries where flow not within 5% of 100
    x$Flag[x$flow > 105 | x$flow < 95] = 3 # Reject
    x$Flag[x$O2.Ar > 35 | x$O2.Ar < 16] = 3 # Reject
    
    ## Remove first 2 minute (safer to remove bad data than to keep it)
    l = which(as.numeric(difftime(x$time, x$time[1], unit='mins'))^2 < 2)
    x$Flag[l] = 3 # Reject
    
    ## Remove last 2 minute (safer to remove bad data than to keep it)
    l = which(as.numeric(difftime(x$time, x$time[nrow(x)], unit='mins'))^2 < 2)
    x$Flag[l] = 3 # Reject
    
    ## find all valve switches and remove all entries within a miute of it.
    l = which(diff(x$Valve) != 0)
    for (i in l) {
        dt = abs(as.numeric(difftime(x$time[i], x$time, unit='mins')))     
        x$Flag[which(dt <= 1)] = 3 # Reject
    }
    ## Remove NA O2Ar entries
    x$Flag[is.na(x$O2.Ar)] = 3 # Reject
    
    x$Flag[x$Valve == 2] = 2 # Calibration
    #x$Flag[is.na(x$Valve)] = 3 # Reject unknown valve
    
    x
}

## Check the filter and avg functions

In [12]:
## Copy of data to run through filter (for tuning the filter parameters)
temp = data
temp = filter.eims(temp)
temp.avg = avg.eims(temp[temp$Flag == 1,], 120, flag = 1) # N = 2 min = 120 seconds
temp.cal = avg.eims(temp[temp$Flag == 2,], 120, flag = 2) # N = 2 min = 120 seconds

## Plot preliminary figures for visual check.
plot(temp$time, temp$O2.Ar, col = temp$Flag, pch=20, ylab='O2/Ar', xlab='', yaxs='i', main='Flag Values')

plot(temp.avg$time, temp.avg$O2.Ar, col = "blue", pch=16, ylab='O2/Ar', xlab='', yaxs='i', main='Averaging')
points(temp$time, temp$O2.Ar, col = "#00000020", pch=20, cex=0.3)
points(temp.cal$time, temp.cal$O2.Ar, col = "red", pch=20, cex=1.5)

ERROR: Error in png(tf, width, height, "in", pointsize, bg, res, antialias = antialias): unable to start png() device


Plot with title "Flag Values"

ERROR: Error in png(tf, width, height, "in", pointsize, bg, res, antialias = antialias): unable to start png() device


Plot with title "Averaging"

In [14]:
read.eims = function(valve = 1, input.dir, file.list, N = 2, verbose = FALSE, template = data, make.img = FALSE) {
    ## Create template data structure to add new entries to:
    dat = template[1,] # valve = 1
    dat$Flag = 0
    
    for (i in file.list) {
        
        #### load file and parse the time and filter the values
        data = read.csv(paste0(input.dir, i), sep='\t')
        if (grepl('M', data$time[1])) {
            data$time = strptime(data$time, format = dt.format, tz='GMT')
        } else {
            data$time = strptime(data$time, format = dt.format2, tz='GMT')
        }
        
        data$Flag = 0
        
        ## Filter
        data = filter.eims(data)
        
        if (make.img) {
            cols = c('black', 'orange', 'red')
    
            png(file = paste0('Output/EIMS/', i, '-valve-', valve, '.png'))
            try({plot(data$time, data$O2.Ar, col=cols[data$Flag], pch=20, ylab='O2/Ar', xlab='', main=i, cex=0.2)})
            
        }
        
        ## Verbose
        if (verbose) { print(paste("Loaded file:", i)) }
        
        #### determine valve  
        l = which(data$Valve == valve)
        temp = avg.eims(data[l,], N, valve)
        #temp = data[l,]
        
        if (make.img) {
            if (valve == 2) {
                points(temp$time, temp$O2.Ar, pch=7)
                lines(temp$time, temp$O2.Ar, col='#00000080')
            }
            dev.off()
        }
        
        if (length(names(temp)) == length(names(dat))) {
            ## Do the column names match?
            names(temp) = names(dat) # Force colname match (dangerous), but will check that lengths are the same
            dat = rbind(dat, temp)
            
            if (verbose) { print(paste('  Added', nrow(temp),'to valve 1.')) }
            
        }
        else {print('error'); print(i)}
    }


    ## Remove the first row used to initiate the table
    dat = dat[-1,]
    dat = dat[,-c(13, 14, 15, 23, 24, 25)]
    
    dat
}

## Run the parser
This will initialize the parser for valve = 1 (sampling) and then valve = 2 (calibration). Included in this is the ___filter.eims()___ and ___take.avg()___ functions.

__NB This will take a while...__

In [15]:
data.v1 = read.eims(1, input.dir, file.list.2, N = 2, verbose = TRUE, template = data, make.img = TRUE)
#data.v1 = read.eims(1, input.dir, file.list.2, N = 30, verbose = FALSE, template = data, make.img = FALSE)

In [16]:
data.v2 = read.eims(2, input.dir, file.list.2, N = 120, verbose = TRUE, template = data, make.img = FALSE)

[1] "Loaded file: 2017-0601-1200 MID-valve corr.txt"
[1] "  Added 1 to valve 1."
[1] "Loaded file: 2017-0601-2031 MID-valve corr.txt"
[1] "  Added 2 to valve 1."
[1] "Loaded file: 2017-0602-0000 MID-valve corr.txt"
[1] "  Added 3 to valve 1."
[1] "Loaded file: 2017-0602-1200 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0602-1449 MID-valve corr.txt"
[1] "  Added 2 to valve 1."
[1] "Loaded file: 2017-0603-1200 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0603-1347 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0603-1548 MID-valve corr.txt"
[1] "  Added 2 to valve 1."
[1] "Loaded file: 2017-0603-2102 MID-valve corr.txt"
[1] "  Added 1310 to valve 1."
[1] "Loaded file: 2017-0604-0000 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0604-0448 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0604-1200 MID-valve corr.txt"
[1] "  Added 2 to valve 1."
[1] "Loaded file: 2017-06

In [17]:
data.v3 = read.eims(3, input.dir, file.list.2, N = 2, verbose = TRUE, template = data, make.img = FALSE)

[1] "Loaded file: 2017-0601-1200 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0601-2031 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0602-0000 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0602-1200 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0602-1449 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0603-1200 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0603-1347 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0603-1548 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0603-2102 MID-valve corr.txt"
[1] "  Added 1403 to valve 1."
[1] "Loaded file: 2017-0604-0000 MID-valve corr.txt"
[1] "  Added 191 to valve 1."
[1] "Loaded file: 2017-0604-0448 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-0604-1200 MID-valve corr.txt"
[1] "  Added 0 to valve 1."
[1] "Loaded file: 2017-

In [18]:
## Also, correct the time since the year is not actually 2021...
data.v1$time = data.v1$time - 4 * 365.25 * 86400 - 86400
data.v2$time = data.v2$time - 4 * 365.25 * 86400 - 86400
data.v3$time = data.v3$time - 4 * 365.25 * 86400 - 86400

## Remove NA O2:Ar values
data.v1 = data.v1[which(!is.na(data.v1$O2.Ar)),]
data.v2 = data.v2[which(!is.na(data.v2$O2.Ar)),]
data.v3 = data.v3[which(!is.na(data.v3$O2.Ar)),]

## Number of entries, do they seem right?
print(paste0('Valve 1 has ', nrow(data.v1), ' entries with ', length(which(data.v1$Flag == 3)), ' flagged values.'))
print(paste0('Valve 2 has ', nrow(data.v2), ' entries with ', length(which(data.v2$Flag == 3)), ' flagged values.'))
print(paste0('Valve 3 has ', nrow(data.v3), ' entries with ', length(which(data.v3$Flag == 3)), ' flagged values.'))

eims = list(sample = data.v1, cal = data.v2, error = data.v3)

[1] "Valve 1 has 1127 entries with 23 flagged values."
[1] "Valve 2 has 143 entries with 2 flagged values."
[1] "Valve 3 has 25 entries with 13 flagged values."


## Review results

In [20]:
#pdf('./Output/Generate EIMS - Review 1.pdf')
#png('./Output/Generate EIMS - Review 1.png')

par(mfrow=c(2,1))
cols = c('black', 'dark green', 'red')

plot(eims$sample$time, eims$sample$O2.Ar, pch=16, cex=0.3, ylab='O2/Ar', xlab='', col=cols[eims$sample$Flag])
points(eims$cal$time, eims$cal$O2.Ar, pch=16, cex=0.2, col=cols[eims$cal$Flag])

legend(data.v1$time[1], 19, c('Measurements', 'Calibration'), col=c('black', 'green'), pch=16, cex=0.6)

plot(eims$sample$time, eims$sample$N2.Ar, pch=16, cex=0.3, ylab='N2/Ar', xlab='', col=cols[eims$sample$Flag])
points(eims$cal$time, eims$cal$N2.Ar, pch=16, cex=0.2, col=cols[eims$cal$Flag])

plot(eims$sample$time, eims$sample$time, pch=16, cex=0.3, ylab='Time', xlab='', col=cols[eims$sample$Flag])
points(eims$cal$time, eims$cal$time+86400, cex=0.2, col=cols[eims$cal$Flag])

plot(eims$sample$time, eims$sample$O2..uM., pch=16, cex=0.3, ylab='O2', xlab='', col=cols[eims$sample$Flag])
points(eims$cal$time, eims$cal$O2..uM., pch=16, cex=0.2, col=cols[eims$cal$Flag])

legend(eims$cal$time[1], 470, c('Measurements', 'Calibration'), col=c('black', 'dark green'), pch=16, cex=0.6)

dev.off()

ERROR: Error in png(tf, width, height, "in", pointsize, bg, res, antialias = antialias): unable to start png() device


plot without title

ERROR: Error in png(tf, width, height, "in", pointsize, bg, res, antialias = antialias): unable to start png() device


plot without title

In [None]:
#pdf('./Output/Generate EIMS - Review 2.pdf')
#png('./Output/Generate EIMS - Review 2.png')

plot(eims$sample$time, eims$sample$time, pch=16, cex=0.3, ylab='Time', xlab='', col = cols[eims$sample$Flag])
points(eims$cal$time, eims$cal$time+86400, pch=16, cex=0.5, col = cols[eims$cal$Flag])
points(eims$error$time, eims$error$time+86400*2, pch=16, cex=0.5, col = cols[eims$error$Flag])
legend(eims$sample$time[1], eims$sample$time[nrow(eims$sample)], c('Measurements', 'Calibration', 'Reject'),
       col=cols, pch=16)

dev.off()

In [None]:
plot(data.v1$time, data.v1$TP, col='white', ylim=c(3.2e-6, 5.5e-6), ylab='Total Pressure', xlab='')

col = c('Black', 'blue', 'grey', 'dark green')

for (i in 1:length(unique(eims$sample[,1]))) {
    l = which(eims$sample[,1] == unique(data.v1[,1])[i])
    points(eims$sample$time[l], eims$sample$TP[l], col=col[i %% 2 + 1], pch=20, cex=0.6)
}

for (i in 1:length(unique(eims$cal[,1]))) {
    l = which(eims$cal[,1] == unique(eims$cal[,1])[i])
    points(eims$cal$time[l], eims$cal$TP[l], col=col[i %% 2 + 3], pch=20, cex=0.6)
}


In [None]:
#pdf('Output/Generate EIMS - Calibration Filter.pdf')

plot(eims$cal$time, eims$cal$O2.Ar, pch=20, ylab='O2/Ar', xlab='', main='EIMS Calibration Filter')
lines(eims$cal$time, runmed(eims$cal$O2.Ar, 5), col='red', lwd=2)

delta = abs(runmed(eims$cal$O2.Ar, 5) - eims$cal$O2.Ar)
l = which(delta > 0.05)
points(eims$cal$time[l], eims$cal$O2.Ar[l], pch=16, col='red')

lines(eims$cal$time[-l], runmed(eims$cal$O2.Ar[-l], 5))

dev.off()

In [None]:
## Apply Calibration Filter
eims$cal = eims$cal[-l,]

---
# Save

In [19]:
## Save the data
#save(eims, file='./RStates/EIMS.2min.120min.rdata')
save(eims, file='./RStates/EIMS.30min.120min.rdata')

## Optional
#write.xlsx(data.v1, './Input Data/EIMS-Valve1.2min.xlsx')
#write.xlsx(data.v2, './Input Data/EIMS-Valve2.60min.xlsx')

---
# Review

In [None]:
load('./RStates/EIMS.2min.120min.rdata')

In [None]:
par(mfrow=c(3,1))

#pdf('Output/Generate EIMS - 24H Plots.pdf')

dt = floor(as.numeric(difftime(eims$sample$time, min(eims$sample$time, na.rm = TRUE), units='days')))

for (i in 1:floor(max(dt, na.rm = TRUE))) {
    k = which(dt ==i)
    if(length(k) < 1) {
        k = 1
    }
    plot(eims$sample$time[k], eims$sample$O2.Ar[k], pch=20, main=paste('Day', i, '-', eims$sample$time[k[1]]), xlab='',
         ylab='O2/Ar', ylim=c(18,33), cex=0.5, yaxs='i')
    points(eims$cal$time, eims$cal$O2.Ar, pch=4, col='red')
}

dev.off()