In [15]:
pollutantmean <- function(directory, pollutant, id = 1:332) {
        ## 'directory' is a character vector of length 1 indicating
        ## the location of the CSV files
        
        ## 'pollutant' is a character vector of length 1 indicating
        ## the name of the pollutant for which we will calculate the
        ## mean; either "sulfate" or "nitrate".
        
        ## 'id' is an integer vector indicating the monitor ID numbers
        ## to be used
        
        ## Return the mean of the pollutant across all monitors list
        ## in the 'id' vector (ignoring NA values)
        accum <- c() # Create an empty vector
        
        for(i in id)
        {
                file_name <- sprintf("%03d.csv", i)
                file_name <- paste0(directory,"/",file_name)
                                
                heisenberg <- read.csv(file=file_name,head=TRUE,sep=",")
                vec <- heisenberg[,pollutant] ## create a vector with the values only
                val <- vec[!is.na(vec)]       ## extract the values of the vector       
                accum <- c(accum,val)
        }
        mean(accum)
}
pollutantmean("specdata","sulfate")

In [18]:
pollutantmean("specdata", "sulfate", 1:10)

In [19]:
pollutantmean("specdata", "nitrate", 70:72)

In [20]:
pollutantmean("specdata", "sulfate", 34)

In [21]:
pollutantmean("specdata", "nitrate")

In [16]:
complete <- function(directory, id = 1:332) {
        ## 'directory' is a character vector of length 1 indicating
        ## the location of the CSV files
        
        ## 'id' is an integer vector indicating the monitor ID numbers
        ## to be used
        
        ## Return a data frame of the form:
        ## id nobs
        ## 1  117
        ## 2  1041
        ## ...
        ## where 'id' is the monitor ID number and 'nobs' is the
        ## number of complete cases
        output <- data.frame()
        for(i in id)
        {
                file_name <- sprintf("%03d.csv", i)
                file_name <- paste0(directory,"/",file_name)
                
                heisenberg <- read.csv(file=file_name,head=TRUE,sep=",")
                valSul <- !is.na(heisenberg[,"sulfate"])  # Gets  where are the values in the sulfate col
                valNut <- !is.na(heisenberg[,"nitrate"])  # Gets where are the values in the nitrade col
                nobs <- sum(valSul & valNut)
                newRow <- data.frame(i,nobs)
                output <- rbind(output,newRow)
        }
        names(output) <- c("id","nobs")
        print(output)
}
complete("specdata")

     id nobs
1     1  117
2     2 1041
3     3  243
4     4  474
5     5  402
6     6  228
7     7  442
8     8  192
9     9  275
10   10  148
11   11  443
12   12   96
13   13   46
14   14   96
15   15   83
16   16   60
17   17  927
18   18   84
19   19  353
20   20  124
21   21  426
22   22  135
23   23  492
24   24  885
25   25  463
26   26  586
27   27  338
28   28  475
29   29  711
30   30  932
31   31  483
32   32  616
33   33  466
34   34  165
35   35  509
36   36  495
37   37  497
38   38  491
39   39  734
40   40   21
41   41  227
42   42   60
43   43   74
44   44  283
45   45  424
46   46   89
47   47  540
48   48   62
49   49  473
50   50  459
51   51  193
52   52  812
53   53  342
54   54  219
55   55  372
56   56  642
57   57  452
58   58  391
59   59  445
60   60  448
61   61  155
62   62  414
63   63  403
64   64  932
65   65   66
66   66  374
67   67  436
68   68  418
69   69   15
70   70  124
71   71  360
72   72  406
73   73   60
74   74  462
75   75  779
76   76  385

In [22]:
cc <- complete("specdata", c(6, 10, 20, 34, 100, 200, 310))
print(cc$nobs)

   id nobs
1   6  228
2  10  148
3  20  124
4  34  165
5 100  104
6 200  460
7 310  232
[1] 228 148 124 165 104 460 232


In [23]:
cc <- complete("specdata", 54)
print(cc$nobs)

  id nobs
1 54  219
[1] 219


In [24]:
set.seed(42)
cc <- complete("specdata", 332:1)
use <- sample(332, 10)
print(cc[use, "nobs"])

     id nobs
1   332   16
2   331  284
3   330  447
4   329  439
5   328  967
6   327  162
7   326  215
8   325  817
9   324   34
10  323   34
11  322  301
12  321  353
13  320  627
14  319  113
15  318  200
16  317   47
17  316   77
18  315  183
19  314  888
20  313  368
21  312  216
22  311   65
23  310  232
24  309  213
25  308   79
26  307  174
27  306  203
28  305  263
29  304  135
30  303  585
31  302  937
32  301  438
33  300  927
34  299  331
35  298   66
36  297   10
37  296   14
38  295   75
39  294    0
40  293    0
41  292    0
42  291    0
43  290   91
44  289    0
45  288   40
46  287  812
47  286    0
48  285   83
49  284   87
50  283   90
51  282   92
52  281   81
53  280   15
54  279  822
55  278    0
56  277  908
57  276    0
58  275    0
59  274    4
60  273  203
61  272  253
62  271  499
63  270  411
64  269  191
65  268  424
66  267  403
67  266  439
68  265  438
69  264   44
70  263  357
71  262  245
72  261   50
73  260  386
74  259   76
75  258  444
76  257  886

In [25]:
cr <- corr("specdata")                
cr <- sort(cr)                
set.seed(868)                
out <- round(cr[sample(length(cr), 5)], 4)
print(out)

[1]  0.2688  0.1127 -0.0085  0.4586  0.0447


In [26]:
cr <- corr("specdata", 129)                
cr <- sort(cr)                
n <- length(cr)                
set.seed(197)                
out <- c(n, round(cr[sample(n, 5)], 4))
print(out)

[1] 243.0000   0.2540   0.0504  -0.1462  -0.1680   0.5969


In [27]:
cr <- corr("specdata", 2000)                
n <- length(cr)                
cr <- corr("specdata", 1000)                
cr <- sort(cr)
print(c(n, round(cr, 4)))

[1]  0.0000 -0.0190  0.0419  0.1901


In [17]:
corr <- function(directory, threshold = 0) {
        ## 'directory' is a character vector of length 1 indicating
        ## the location of the CSV files
        
        ## 'threshold' is a numeric vector of length 1 indicating the
        ## number of completely observed observations (on all
        ## variables) required to compute the correlation between
        ## nitrate and sulfate; the default is 0
        
        ## Return a numeric vector of correlations
        id <- 1:332
        correl <- c()
        for(i in id)
        {
                file_name <- sprintf("%03d.csv", i)
                file_name <- paste0(directory,"/",file_name)
                
                heisenberg <- read.csv(file=file_name,head=TRUE,sep=",")
                valSul <- !is.na(heisenberg[,"sulfate"])  # Gets where are the values in the sulfate col
                valNut <- !is.na(heisenberg[,"nitrate"])  # Gets where are the values in the nitrade col
                completeVal <- valSul & valNut # Gets where both sulfate & nitrate have values
                # if the # of complete cases is equal or greater than the threshold
                if(sum(completeVal) >= threshold)
                {
                        sulfate <- heisenberg[,"sulfate"][completeVal] # Complete Values of sulfate
                        nitrade <- heisenberg[,"nitrate"][completeVal] # Complete Values of nitrate
                        correl <- c(correl, cor(sulfate,nitrade))
                }
        }
        correl
}
corr("specdata",150)