This repository has been archived by the owner on May 23, 2023. It is now read-only.
/
Analytics.scala
541 lines (477 loc) · 19.5 KB
/
Analytics.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
/*
* Copyright (c) 2014 Oculus Info Inc.
* http://www.oculusinfo.com/
*
* Released under the MIT License.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.oculusinfo.tilegen.tiling.analytics
import java.lang.{Double => JavaDouble}
import java.lang.{Integer => JavaInt}
import java.lang.{Long => JavaLong}
import java.util.{List => JavaList}
import com.oculusinfo.binning.util.JsonUtilities
import org.json.{JSONArray, JSONObject}
import scala.collection.mutable.MutableList
import scala.collection.mutable.{Map => MutableMap}
import scala.reflect.ClassTag
import org.apache.spark.SparkContext
import org.apache.spark.Accumulator
import org.apache.spark.AccumulatorParam
import com.oculusinfo.binning.TileData
import com.oculusinfo.binning.TileIndex
import com.oculusinfo.binning.metadata.PyramidMetaData
import scala.util.Try
/**
* An Analytic is basically an aggregation function, to describe how to
* aggregate the results of some analysis across bins, tiles, or anything
* else.
*
* It also encapsulates some simple knowledge of default values.
*
* @tparam T The type of value this analytic processes or aggregates.
*/
trait Analytic[T] extends Serializable
{
	/**
	 * Used to combine two values during processing.  No assumptions
	 * should be made regarding the cardinality of either
	 * value - i.e., don't assume that the first is the aggregate, and
	 * the second, the new value, or anything like that.  Either or
	 * both may be a primary or aggregate value.
	 *
	 * NOTE(review): values are also merged through Spark accumulators
	 * (see AnalyticAccumulatorParam below), so in practice this should
	 * be associative and commutative - confirm for new implementations.
	 */
	def aggregate (a: T, b: T): T
	/**
	 * The default processing value to use for an analytic group known
	 * to have no value.
	 */
	def defaultProcessedValue: T
	/**
	 * The default processing value to use for an analytic group whose
	 * value is unknown, so as to initialize it for aggregation with
	 * any known values.  AnalyticAccumulatorParam uses this as the
	 * accumulator zero, so it should act as an identity for aggregate.
	 */
	def defaultUnprocessedValue: T
}
/**
* BinningAnalytic extends Analytic specifically for binning in tiles; it
* allows a two-stage processing of values, with a processing type that is
* aggregated, and converted to a result type just before writing out a final
* value.
*
* There are two driving cases behind breaking the calculation into processing
* and final components. [1] The first case is where the result type is
* complex, and can be represented more succinctly and efficiently during
* processing (such as, for instance, using an array instead of a list, or a
* native integer instead of a JavaInt). [2] The second case is analytics
* (like mean or standard deviation) where different values are needed at
* processing than are written out (for example, when calculating a mean, one
* needs to record total number and running sum, but only writes a single
* number)
*
* @tparam PROCESSING_TYPE An intermediate type used to store data
* needed to produce the result type when the
* analytic is complete.
* @tparam RESULT_TYPE The final type of value to be written out as the
* results of this analytic.
*/
trait BinningAnalytic[PROCESSING_TYPE, RESULT_TYPE] extends Analytic[PROCESSING_TYPE] {
	/**
	 * Finish off a processing value, converting it to a result value.
	 * Per the class documentation, this is applied just before a final
	 * value is written out, after all aggregation is complete.
	 */
	def finish (value: PROCESSING_TYPE): RESULT_TYPE
}
/**
* A TileAnalytic extends Analytic with a few simple pieces that allow values to
* be written to metadata, both on each individual tile, and globally.
*
* Note that, while binning analytics have a final form to which values must be
* converted, tile analytics all convert their values to strings for writing to
* metadata, and therefore don't need an explicit final form type; this
* conversion is inherent in the valueToString method.
*
* @tparam T The type of value this analytic processes or aggregates.
*/
trait TileAnalytic[T] extends Analytic[T] {
	/**
	 * This is ignored if the analytic is the main value used for
	 * tiling; otherwise, it will be used to label the results in
	 * tables or metadata where the result value is written.
	 */
	def name: String
	/**
	 * Convert the analytic value to a JSON representation, that can be written into tile or pyramid metadata by
	 * simple string conversion. This method should capture everything about the analytic, including its name.
	 *
	 * The default implementation stores the raw value under this analytic's name,
	 * regardless of location; subclasses may override to filter by location
	 * (see CustomMetadataAnalytic) or to merge component results (see ComposedTileAnalytic).
	 *
	 * @param value The collected value to be stored
	 * @param location Where the value is to be stored. Ordinarily, most implementations will ignore this, but they may
	 *                 use it to specify only writing to pyramid metadata, or only writing to tile metadata
	 * @return The same value, converted to JSON, and wrapped in Some, or None if nothing is to be stored
	 */
	def storableValue (value: T, location: TileAnalytic.Locations.Value): Option[JSONObject] = {
		val result = new JSONObject()
		result.put(name, value)
		Some(result)
	}
	override def toString = "["+name+"]"
}
object TileAnalytic {
	/** An enum of possible locations to which to store TileAnalytic data */
	object Locations extends Enumeration {val Tile, Pyramid = Value}

	/**
	 * Flatten the top level of a JSON object into a string-to-string map.
	 *
	 * JSONObject.getNames returns null (not an empty array) when the object
	 * has no keys, so it must be wrapped in Option to avoid an NPE on an
	 * empty analytic result.
	 */
	private[analytics] def topLevelMap (value: JSONObject): Map[String, String] =
		Option(JSONObject.getNames(value)).getOrElse(Array[String]())
			.map(key => (key, value.get(key).toString)).toMap
}
/**
* An analytic that combines two other analytics into one.
*
* @see CompositeAnalysisDescription
*/
class ComposedTileAnalytic[T1, T2]
	(val1: TileAnalytic[T1],
	 val2: TileAnalytic[T2])
		extends TileAnalytic[(T1, T2)]
{
	def name: String = val1.name+","+val2.name
	def aggregate (a: (T1, T2), b: (T1, T2)): (T1, T2) =
		(val1.aggregate(a._1, b._1), val2.aggregate(a._2, b._2))
	def defaultProcessedValue: (T1, T2) =
		(val1.defaultProcessedValue, val2.defaultProcessedValue)
	def defaultUnprocessedValue: (T1, T2) =
		(val1.defaultUnprocessedValue, val2.defaultUnprocessedValue)
	/**
	 * Combine the results of both components into a single JSON object.
	 *
	 * Where the components' keys don't collide, each key is used as is; where
	 * they do collide, each is prefixed with its component analytic's name.
	 */
	override def storableValue (value: (T1, T2), location: TileAnalytic.Locations.Value): Option[JSONObject] = {
		val v1 = val1.storableValue(value._1, location)
		val v2 = val2.storableValue(value._2, location)
		// JSONObject.getNames returns null for an empty object, so guard with
		// Option to avoid an NPE when a component stores an empty result.
		def keysOf (json: Option[JSONObject]): Set[String] =
			json.flatMap(j => Option(JSONObject.getNames(j))).map(_.toSet).getOrElse(Set[String]())
		val keys = keysOf(v1) ++ keysOf(v2)
		val result = new JSONObject()
		keys.foreach{key =>
			val value1 = v1.filter(_.has(key)).map(_.get(key))
			val value2 = v2.filter(_.has(key)).map(_.get(key))
			(value1, value2) match {
				case (Some(s1), Some(s2)) =>
					// sadly, keys are interfering; preface them with the analytic name
					result.put(val1.name+"."+key, s1)
					result.put(val2.name+"."+key, s2)
				case (Some(s1), None) => result.put(key, s1)
				case (None, Some(s2)) => result.put(key, s2)
				case (None, None) => // unreachable: every key came from v1 or v2
			}
		}
		Some(result)
	}
	override def toString = "["+val1+" + "+val2+"]"
}
/**
* An accumulator that accumulates a TileAnalytic across multiple tiles
*
* @param analytic An analytic defining an aggregation function to be used to
* accumulate values
*
* @tparam T The type of value to be accumulated
*/
class AnalyticAccumulatorParam[T] (analytic: Analytic[T]) extends AccumulatorParam[T] {
	// AccumulatorParam implementation

	/** The neutral starting element: the analytic's unprocessed default. */
	def zero (initialValue: T): T = {
		analytic.defaultUnprocessedValue
	}

	/** Merge two partial values using the analytic's aggregation function. */
	def addInPlace (a: T, b: T): T = {
		analytic.aggregate(a, b)
	}
}
object AnalysisDescription {
	/**
	 * Build the standard metadata map: one entry per level in [min, max],
	 * plus the always-true global entry.
	 */
	def getStandardLevelMetadataMap (name: String, min: Int, max: Int):
			Map[String, TileIndex => Boolean] =
	{
		val perLevelEntries = (min to max).map(level =>
			(level+"."+name) -> ((t: TileIndex) => level == t.getLevel)
		)
		getGlobalOnlyMetadataMap(name) ++ perLevelEntries
	}

	/** Build a metadata map with only the global (every-tile) entry. */
	def getGlobalOnlyMetadataMap (name: String) :
			Map[String, TileIndex => Boolean] =
		Map("global."+name -> ((t: TileIndex) => true))

	/*
	 * Take the old metadata value and overlay the new one on top of it.
	 * Falls back to simple replacement if either value cannot be handled as JSON.
	 */
	private def combineMetaData (origValue: Any, newValue: Any): Any = {
		// Best-effort conversion of an arbitrary value into a JSON object
		def asJson (v: Any): JSONObject = v match {
			case json: JSONObject => json
			case null => new JSONObject("{}")
			case other => new JSONObject(other.toString())
		}
		Try {
			// Deep-clone an original JSON object so the in-place overlay
			// cannot mutate the caller's value
			val base = origValue match {
				case json: JSONObject => JsonUtilities.deepClone(json)
				case other => asJson(other)
			}
			JsonUtilities.overlayInPlace(base, asJson(newValue))
		}.getOrElse(
			// Default to simple replacement if there is any failure in overlaying.
			newValue
		)
	}

	// Apply accumulated metadata info to actual global metadata for a pyramid
	def record[T] (analysis: AnalysisDescription[_, T], metaData: PyramidMetaData): Unit =
		metaData.setCustomMetaData(analysis.accumulatedResults)

	// Apply accumulated metadata info to an actual tile, overlaying onto any
	// metadata the tile already carries under the same keys
	def record[T] (value: T, analysis: AnalysisDescription[_, T], tile: TileData[_]): Unit =
		for (json <- analysis.analytic.storableValue(value, TileAnalytic.Locations.Tile);
		     (key, entry) <- TileAnalytic.topLevelMap(json))
			tile.setMetaData(key, combineMetaData(tile.getMetaData(key), entry))
}
/**
* An AnalysisDescription describes an analysis that needs to be executed
* during the tiling process. Both tile and data analytics use this class
* as their basic description.
*
* There are three main parts to an analysis description:
* <ol>
* <li>The conversion function, convert, which extracts the needed values for
* analysis from the raw data</li>
* <li>The analytic, which describes how analytic values are treated</li>
* <li>Accumulator-related methods, which describe how analyses are aggregated
* across the data set.</li>
* </ol>
*
* @tparam RT The raw type of data on which this analysis takes place
* @tparam AT The type of data collected by this analysis, to be aggregated and
* stored. In an AnalyticDescription, the result of analyzing a
* single record of type RT is a single record of type AT, and this
* analysis is captured by the convert method.
*/
trait AnalysisDescription[RT, AT] extends Serializable {
	// Class tag for the analysis type AT; presumably kept so tiling code can
	// materialize typed collections of analysis values - TODO confirm against callers
	val analysisTypeTag: ClassTag[AT]
	/** Extract the value to be analyzed from a single raw data record. */
	def convert: RT => AT
	/** The analytic describing how extracted values are aggregated and stored. */
	def analytic: TileAnalytic[AT]
	// Add a data point to appropriate accumulators
	def accumulate (tile: TileIndex, data: AT): Unit
	// Deal with accumulators
	/** Register a named accumulator fed by every tile for which test returns true. */
	def addAccumulator (sc: SparkContext, name: String, test: (TileIndex) => Boolean): Unit
	/** All accumulated values so far, as a JSON object keyed by accumulator name. */
	def accumulatedResults: JSONObject
	// Standard accumulators
	/** Register an accumulator that only collects from tiles on the given level. */
	def addLevelAccumulator (sc: SparkContext, level: Int): Unit =
		addAccumulator(sc, ""+level, (test: TileIndex) => (level == test.getLevel))
	/** Register an accumulator that collects from every tile. */
	def addGlobalAccumulator (sc: SparkContext): Unit =
		addAccumulator(sc, "global", (test: TileIndex) => true)
}
/**
* A small, simple data class to store what an analysis description needs to
* remember about a single cross-dataset accumulator of its analysis.
*
* @tparam AT The analysis type being accumulated
*/
case class MetaDataAccumulatorInfo[AT] (name: String,                    // label under which results are stored ("global" or a level number)
                                        test: TileIndex => Boolean,      // whether a given tile contributes to this accumulator
                                        accumulator: Accumulator[AT]) {} // the Spark accumulator holding the running value
/**
* A standard analysis description parent class for descriptions of a single,
* monolithic analysis (as opposed to a composite analysis).
*
* See AnalysisDescription for descriptions of the generic type parameters.
*/
class MonolithicAnalysisDescription[RT, AT: ClassTag]
	(convertParam: RT => AT,
	 analyticParam: TileAnalytic[AT])
		extends AnalysisDescription[RT, AT]
{
	val analysisTypeTag = implicitly[ClassTag[AT]]

	def convert = convertParam

	def analytic = analyticParam

	/** Add a data point to every registered accumulator whose test accepts this tile. */
	def accumulate (tile: TileIndex, data: AT): Unit =
		accumulatorInfos.values.foreach(info =>
			if (info.test(tile))
				info.accumulator += data
		)

	// Deal with accumulators
	protected val accumulatorInfos = MutableMap[String, MetaDataAccumulatorInfo[AT]]()

	def addAccumulator (sc: SparkContext, name: String, test: (TileIndex) => Boolean): Unit =
		// Don't add anything twice
		if (!accumulatorInfos.contains(name)) {
			val defaultValue = analytic.defaultUnprocessedValue
			val accumulator = sc.accumulator(defaultValue)(new AnalyticAccumulatorParam(analytic))
			accumulatorInfos(name) =
				new MetaDataAccumulatorInfo(name, test, accumulator)
		}

	/** Collect all accumulated values into a single JSON object, keyed by accumulator name. */
	def accumulatedResults: JSONObject = {
		val result = new JSONObject
		// foreach, not map: we only want the side effect of populating result
		accumulatorInfos.foreach{case (key, info) =>
			analytic.storableValue(info.accumulator.value, TileAnalytic.Locations.Pyramid).foreach(value =>
				result.put(key, value)
			)
		}
		result
	}

	override def toString = analyticParam.toString
}
/**
* A description of a single analysis which is not aggregated into the global
* metadata, but only exists on tiles
*
* See AnalysisDescription for descriptions of the generic type parameters.
*/
class TileOnlyMonolithicAnalysisDescription[RT, AT: ClassTag]
	(convertParam: RT => AT,
	 analyticParam: TileAnalytic[AT])
		extends MonolithicAnalysisDescription[RT, AT](convertParam, analyticParam)
{
	// Ignores requests to add standard accumulators - this analysis is never
	// aggregated into pyramid-level metadata, so no accumulators are needed
	override def addLevelAccumulator (sc: SparkContext, level: Int): Unit = {}
	override def addGlobalAccumulator (sc: SparkContext): Unit = {}
	// With no accumulators, the global results are always an empty JSON object
	override def accumulatedResults: JSONObject = new JSONObject
}
/**
* A class to combine two analyses into a single analysis.
*
* This is needed because, for reasons of type generification, we can
* only pass in a single tile analytic, and a single data analytic,
* into the binning process.
*
* If we actually need more than one analytic, we combine them into
* one using this class.
*
* See AnalysisDescription for descriptions of the generic type parameters.
*/
class CompositeAnalysisDescription[RT, AT1: ClassTag, AT2: ClassTag]
	(analysis1: AnalysisDescription[RT, AT1],
	 analysis2: AnalysisDescription[RT, AT2])
		extends AnalysisDescription[RT, (AT1, AT2)]
{
	val analysisTypeTag = implicitly[ClassTag[(AT1, AT2)]]

	private val composedConversion = (raw: RT) => (analysis1.convert(raw), analysis2.convert(raw))
	private val composedAnalytic = new ComposedTileAnalytic(analysis1.analytic, analysis2.analytic)

	def convert = composedConversion

	def analytic = composedAnalytic

	/** Forward each half of the composed data point to its own analysis. */
	def accumulate (tile: TileIndex, data: (AT1, AT2)): Unit = {
		analysis1.accumulate(tile, data._1)
		analysis2.accumulate(tile, data._2)
	}

	def addAccumulator (sc: SparkContext, name: String, test: (TileIndex) => Boolean): Unit = {
		analysis1.addAccumulator(sc, name, test)
		analysis2.addAccumulator(sc, name, test)
	}

	def accumulatedResults: JSONObject = {
		val res1 = analysis1.accumulatedResults
		val res2 = analysis2.accumulatedResults
		// overlayInPlace mutates res1; both objects are freshly created above, so this is safe
		JsonUtilities.overlayInPlace(res1, res2)
	}

	// Helper functions for testing purposes only
	/** Count all composed sub-components. For testing purposes only. */
	def countComponents: Int = {
		// Pattern match replaces the original isInstanceOf/asInstanceOf pairs;
		// the type arguments are erased either way, so behavior is identical
		def componentCount (analysis: AnalysisDescription[RT, _]): Int = analysis match {
			case composite: CompositeAnalysisDescription[RT, _, _] => composite.countComponents
			case _ => 1
		}
		componentCount(analysis1) + componentCount(analysis2)
	}

	/** Get the nth composed sub-component. For testing purposes only. */
	def getComponent (n: Int): AnalysisDescription[RT, _] =
		getComponentInternal(n)._1.get

	/**
	 * Depth-first search for the nth leaf component.
	 *
	 * @return (Some(component), -1) when found, or (None, remaining) where
	 *         remaining is the index left after skipping this subtree's leaves.
	 */
	protected def getComponentInternal (n: Int): (Option[AnalysisDescription[RT, _]], Int) = {
		def search (analysis: AnalysisDescription[RT, _],
		            remaining: Int): (Option[AnalysisDescription[RT, _]], Int) =
			analysis match {
				case composite: CompositeAnalysisDescription[RT, _, _] =>
					composite.getComponentInternal(remaining)
				case _ if 0 == remaining => (Some(analysis), -1)
				case _ => (None, remaining - 1)
			}

		val postFirst = search(analysis1, n)
		if (postFirst._1.isDefined) postFirst
		else search(analysis2, postFirst._2)
	}

	override def toString = "["+analysis1+","+analysis2+"]"
}
/**
* A class (and companion object) to take an analysis of bin values and convert
* it into an analysis of tiles.
*/
object AnalysisDescriptionTileWrapper {
	/**
	 * Create a function that applies a per-bin conversion to every bin of a
	 * tile and aggregates the converted values with the given analytic.
	 *
	 * @param convertFcn Converts a single bin value to an analytic value
	 * @param analytic The analytic used to aggregate converted bin values
	 * @return A function from a whole tile to its aggregate analytic value
	 */
	def acrossTile[BT, AT] (convertFcn: BT => AT,
	                        analytic: TileAnalytic[AT]): TileData[BT] => AT =
	{
		tile: TileData[BT] => {
			val index = tile.getDefinition
			val convertedBins =
				for (x <- 0 until index.getXBins;
				     y <- 0 until index.getYBins)
				yield convertFcn(tile.getBin(x, y))
			// reduceOption guards against a degenerate 0-bin tile, where a bare
			// reduce would throw; fall back to the analytic's processed default
			convertedBins.reduceOption((a, b) => analytic.aggregate(a, b))
				.getOrElse(analytic.defaultProcessedValue)
		}
	}
}
// Adapts a bin-level conversion into a tile-level analysis by applying it to
// every bin of a tile and aggregating with the given analytic (via the
// companion object's acrossTile); otherwise a plain monolithic description.
class AnalysisDescriptionTileWrapper[RT, AT: ClassTag]
	(convert: RT => AT,
	 analytic: TileAnalytic[AT])
		extends MonolithicAnalysisDescription[TileData[RT], AT](
	AnalysisDescriptionTileWrapper.acrossTile(convert, analytic),
	analytic)
{
}
// /////////////////////////////////////////////////////////////////////////////
// Some standard analytic functions
//
/**
* The Custom Metadata analytic is a dummy tile analytic that does not actually
* write anything to tiles. Its only purpose is to help
* CustomGlobalMetadata fulfill its interface without having any side-effects.
*
*/
class CustomMetadataAnalytic extends TileAnalytic[String]
{
	/** Aggregation keeps the first value; nothing is ever actually combined. */
	def aggregate (a: String, b: String): String = a
	def defaultProcessedValue: String = ""
	def defaultUnprocessedValue: String = ""
	def name: String = "VariableSeries"
	/** Only store to pyramid metadata; requests for tile storage yield nothing. */
	override def storableValue (value: String, location: TileAnalytic.Locations.Value): Option[JSONObject] =
		location match {
			case TileAnalytic.Locations.Pyramid => super.storableValue(value, location)
			case _ => None
		}
}
/**
* A very simple tile analytic that just writes custom metadata directly to the tile set
* metadata, and nowhere else.
*
* @tparam T The raw data type of input records. Nothing in this analytic uses
* this type, it just must match the dataset.
*/
class CustomGlobalMetadata[T] (customData: Map[String, Object])
		extends AnalysisDescription[T, String]
{
	val analysisTypeTag = implicitly[ClassTag[String]]

	// No per-record analysis; every record converts to the empty string
	def convert: T => String = (raw: T) => ""

	// A val satisfies the trait's def while avoiding a fresh
	// CustomMetadataAnalytic allocation on every call (the analytic is stateless)
	val analytic: TileAnalytic[String] = new CustomMetadataAnalytic

	def accumulate (tile: TileIndex, data: String): Unit = {}

	// Global metadata needs no accumulators - it doesn't actually have any data.
	def addAccumulator (sc: SparkContext, name: String, test: (TileIndex) => Boolean): Unit = {}

	/** Fold the fixed custom data directly into a JSON object. */
	def accumulatedResults: JSONObject =
		customData.foldLeft(new JSONObject()) { (res, curr) =>
			curr match {
				case (key: String, value: JSONObject) => res.put(key, value)
				// Values whose string form parses as JSON are stored as nested objects
				case (key: String, value: Object) if JsonUtilities.isJSON(value.toString) => res.put(key, new JSONObject(value.toString))
				case _ => res.put(curr._1, curr._2)
			}
		}
}