# filecounter.py
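# Flask service that tracks daily file counts and extractor coverage for
# TERRA-REF sensor pipelines: it writes one CSV per sensor and exposes
# endpoints for viewing counts and resubmitting missing extractions.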
import os
import json
import thread
import collections
import time
import datetime
import psycopg2
import re
import requests
import logging, logging.config, logstash
import pandas as pd
from flask import Flask, render_template, send_file, request, url_for, redirect, make_response
from flask_wtf import FlaskForm as Form
from wtforms import SubmitField, SelectMultipleField, widgets
from wtforms.fields.html5 import DateField
from wtforms.validators import DataRequired
from pyclowder.connectors import Connector
from pyclowder.datasets import submit_extraction, get_file_list
from pyclowder.files import submit_extraction as submit_file_extraction
from terrautils.extractors import load_json_file
from terrautils.sensors import Sensors
import utils
import counts
config = {}
app_dir = '/home/filecounter'
SCAN_LOCK = False
count_defs = counts.SENSOR_COUNT_DEFINITIONS
DEFAULT_COUNT_START = None
DEFAULT_COUNT_END = None
CLOWDER_HOST = "https://terraref.ncsa.illinois.edu/clowder/"
CLOWDER_KEY = os.getenv('CLOWDER_KEY', False)
CONN = Connector("", {}, mounted_paths={"/home/clowder/sites":"/home/clowder/sites"})
# UTILITIES ----------------------------
def update_nested_dict(existing, new):
"""Nested update of python dictionaries for config parsing
Adapted from http://stackoverflow.com/questions/3232943/update-value-of-a-nested-dictionary-of-varying-depth
"""
for k, v in new.iteritems():
if isinstance(existing, collections.Mapping):
if isinstance(v, collections.Mapping):
r = update_nested_dict(existing.get(k, {}), v)
existing[k] = r
else:
existing[k] = new[k]
else:
existing = {k: new[k]}
return existing
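# Illustrative usage (values are hypothetical):
#   update_nested_dict({"a": {"b": 1}}, {"a": {"c": 2}})
#   => {"a": {"b": 1, "c": 2}}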
def generate_dates_in_range(start_date_string, end_date_string=None):
"""Return list of date strings between start and end dates."""
start_date = datetime.datetime.strptime(start_date_string, '%Y-%m-%d')
if not end_date_string:
end_date = datetime.datetime.now()
else:
end_date = datetime.datetime.strptime(end_date_string, '%Y-%m-%d')
days_between = (end_date - start_date).days
date_strings = []
for i in range(0, days_between+1):
current_date = start_date + datetime.timedelta(days=i)
current_date_string = current_date.strftime('%Y-%m-%d')
date_strings.append(current_date_string)
return date_strings
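# Illustrative usage: generate_dates_in_range('2018-04-06', '2018-04-08')
# returns ['2018-04-06', '2018-04-07', '2018-04-08']; the end date is
# inclusive, and omitting it extends the range through today.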
def get_percent_columns(current_dataframe):
colnames = list(current_dataframe.columns.values)
percent_columns = []
for each in colnames:
if each.endswith('%'):
percent_columns.append(each)
return percent_columns
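# Illustrative usage: for a dataframe with (hypothetical) columns
# ['date', 'stereoTop', 'bin2tif%'], this returns ['bin2tif%'].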
def highlight_max(s):
'''
Highlight the maximum in a Series red.
'''
is_max = s == s.max()
return ['background-color: red' if v else '' for v in is_max]
def color_percents(val):
"""
Takes a scalar and returns a string with
the css property `'color: red'` for negative
strings, black otherwise.
"""
if val == 100:
color = 'green'
elif val >= 99:
color = 'greenyellow'
elif val >= 95:
color = 'yellow'
else:
color = 'lightcoral'
return 'background-color: %s' % color
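# Illustrative mapping: 100 => green, 99.5 => greenyellow, 97 => yellow,
# 42 => lightcoral (each returned as a 'background-color: ...' string).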
def render_date_entry(sensorname, columns, rowdata, rowindex):
html = '<div><br/><a style="font-size:18px"><b>%s</b></a>' % rowdata['date']
html += ' <a href="/newschedule/%s/%s/%s">(Recount)</a>' % (
sensorname, rowdata['date'], rowdata['date'])
html += '<br/><table style="border: solid 2px;border-spacing:0px">'
sensordef = count_defs[sensorname]
vals = {}
for colname in columns:
if not colname.endswith("%"):
if colname in sensordef:
if colname not in vals:
vals[colname] = {}
vals[colname]["count"] = rowdata[colname]
if colname in sensordef and "parent" in sensordef[colname]:
vals[colname]["parent"] = sensordef[colname]["parent"]
else:
parcol = colname.replace("%", "")
parname = sensordef[parcol]["parent"]
if parcol not in vals:
vals[parcol] = {}
vals[parcol]["%"] = rowdata[colname]
vals[parcol]["%str"] = "%s%% of %s" % (rowdata[colname], parname)
for group in sensordef:
api_link = ""
if group != sensorname:
group_cell = '<td style="border:solid 1px">...%s</td>' % group
if sensordef[group]["type"] == "timestamp":
if "%" in vals[group] and vals[group]["%"] < 100:
api_link = '<a href="/submitmissing/%s/%s/%s">Submit to %s</a>' % (
sensorname, group, rowdata['date'], sensordef[group]["extractor"])
elif sensordef[group]["type"] == "psql":
if "%" in vals[group] and vals[group]["%"] == 100:
api_link = '<a href="/submitrulecheck/%s/%s/%s">Retrigger ncsa.rulechecker.terra</a>' % (
sensorname, group, rowdata['date'])
elif "%" in vals[group] and vals[group]["%"] < 100:
api_link = '<a href="/submitmissingrulechecks/%s/%s/%s">Submit to ncsa.rulechecker.terra</a>' % (
sensorname, group, rowdata['date'])
elif sensordef[group]["type"] == "regex":
if "parent" in sensordef[group]:
api_link = '<a href="/submitmissingregex/%s/%s/%s">Submit to %s</a>' % (
sensorname, group, rowdata['date'], sensordef[group]["extractor"])
elif sensordef[group]["type"] == "plot":
if "parent" in sensordef[group]:
api_link = '<a href="/submitmissingplots/%s/%s/%s">Submit to %s</a>' % (
sensorname, group, rowdata['date'], sensordef[group]["extractor"])
else:
group_cell = '<td style="border:solid 1px"><b>raw data</b></td>'
api_cell = '<td style="border:solid 1px">%s</td>' % api_link
if group in vals:
if "%" in vals[group]:
count_cell = '<td style="border:solid 1px;%s"><a title="%s">%s</a></td>' % (
color_percents(vals[group]["%"]),
vals[group]["%str"],
vals[group]["count"])
else:
count_cell = '<td style="border:solid 1px"><a>%s</a></td>' % vals[group]["count"]
else:
count_cell = '<td style="border:solid 1px"><a>Missing</a></td>'
html += '<tr>'
html += group_cell
html += count_cell
html += api_cell
html += '</tr>'
html += '</table></div>'
return html
def get_dsid_by_name(dsname):
url = "%sapi/datasets?key=%s&title=%s&exact=true" % (CLOWDER_HOST, CLOWDER_KEY, dsname)
result = requests.get(url)
result.raise_for_status()
if len(result.json()) > 0:
ds_id = result.json()[0]['id']
return ds_id
else:
return None
def connect_to_psql():
psql_db = os.getenv("RULECHECKER_DATABASE", config['postgres']['database'])
psql_host = os.getenv("RULECHECKER_HOST", config['postgres']['host'])
psql_user = os.getenv("RULECHECKER_USER", config['postgres']['username'])
psql_pass = os.getenv("RULECHECKER_PASSWORD", config['postgres']['password'])
psql_conn = psycopg2.connect(dbname=psql_db, user=psql_user, host=psql_host, password=psql_pass)
return psql_conn
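# Connection settings come from the RULECHECKER_* environment variables when
# set (e.g. RULECHECKER_HOST), otherwise from the 'postgres' section of the
# loaded config file.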
# FLASK COMPONENTS ----------------------------
def create_app(test_config=None):
pipeline_csv = os.path.join(config['csv_path'], "{}.csv")
sensor_names = count_defs.keys()
# create and configure the app
app = Flask(__name__, instance_relative_config=True)
app.config.from_mapping(
SECRET_KEY='dev',
DATABASE=os.path.join(app.instance_path, 'flaskr.sqlite'),
)
if test_config is None:
# load the instance config, if it exists, when not testing
app.config.from_pyfile('config.py', silent=True)
else:
# load the test config if passed in
app.config.from_mapping(test_config)
# ensure the instance folder exists
try:
os.makedirs(app.instance_path)
except OSError:
pass
class MultiCheckboxField(SelectMultipleField):
widget = widgets.ListWidget(prefix_label=False)
option_widget = widgets.CheckboxInput()
class SensorDateSelectForm(Form):
sensor_names = count_defs.keys()
selects = [(x, x) for x in sensor_names]
sensors = MultiCheckboxField('Label', choices=selects)
start_date = DateField('Start', format='%Y-%m-%d', validators=[DataRequired()])
end_date = DateField('End', format='%Y-%m-%d')
submit = SubmitField('Count files for these days', validators=[DataRequired()])
@app.route('/sensors', defaults={'message': "Available Sensors and Options"})
@app.route('/sensors/<string:message>')
def sensors(message):
return render_template('sensors.html', sensors=sensor_names, message=message)
@app.route('/download/<sensor_name>')
def download(sensor_name):
current_csv = pipeline_csv.format(sensor_name)
current_csv_name = os.path.basename(current_csv)
return send_file(current_csv,
mimetype='text/csv',
attachment_filename=current_csv_name,
as_attachment=True)
@app.route('/showcsv/<sensor_name>', defaults={'days': 14})
@app.route('/showcsv/<sensor_name>/<int:days>')
def showcsv(sensor_name, days):
current_csv = pipeline_csv.format(sensor_name)
if not os.path.isfile(current_csv):
return "File does not exist"
df = pd.read_csv(current_csv, index_col=False)
if days == 0:
percent_columns = get_percent_columns(df)
for each in percent_columns:
df[each] = df[each].mul(100).astype(int)
dfs = df.style
dfs.applymap(color_percents, subset=percent_columns).set_table_attributes("border=1")
my_html = dfs.render()
return my_html
else:
return df.tail(days).to_html()
@app.route('/showcsvbyseason/<sensor_name>', defaults={'season': 6})
@app.route('/showcsvbyseason/<sensor_name>/<int:season>')
def showcsvbyseason(sensor_name, season):
if season == 6:
start = '2018-04-06'
end = '2018-08-01'
current_csv = pipeline_csv.format(sensor_name)
df = pd.read_csv(current_csv, index_col=False)
df_season = df.loc[(df['date'] >= start) & (df['date'] <= end)]
# Omit rows with zero count in raw_data
for sensorname in ['stereoTop', 'flirIrCamera', 'scanner3DTop']:
if sensorname in df_season.columns:
df_season = df_season[df_season[sensorname] != 0]
percent_columns = get_percent_columns(df_season)
for each in percent_columns:
df_season[each] = df_season[each].mul(100).astype(int)
dfs = df_season.style
dfs.applymap(color_percents, subset=percent_columns).set_table_attributes("border=1")
html = dfs.render()
return html
else:
current_csv = pipeline_csv.format(sensor_name)
df = pd.read_csv(current_csv, index_col=False)
percent_columns = get_percent_columns(df)
for each in percent_columns:
df[each] = df[each].mul(100).astype(int)
dfs = df.style
dfs.applymap(color_percents, subset=percent_columns).set_table_attributes("border=1")
my_html = dfs.render()
return my_html
@app.route('/resubmitbyseason/<sensor_name>', defaults={'season': 6})
@app.route('/resubmitbyseason/<sensor_name>/<int:season>')
def resubmitbyseason(sensor_name, season):
if season == 6:
start = '2018-04-06'
end = '2018-08-01'
current_csv = pipeline_csv.format(sensor_name)
df = pd.read_csv(current_csv, index_col=False)
df_season = df.loc[(df['date'] >= start) & (df['date'] <= end)]
# Omit rows with zero count in raw_data
primary_sensor = None
for sensorname in ['stereoTop', 'flirIrCamera', 'scanner3DTop', 'ps2Top', 'EnvironmentLogger']:
if sensorname in df_season.columns:
df_season = df_season[df_season[sensorname] != 0]
primary_sensor = sensorname
percent_columns = get_percent_columns(df_season)
for each in percent_columns:
df_season[each] = df_season[each].mul(100).astype(int)
# Create header and key
html = "<h1>Seasonal Counts: %s</h1><div>" % primary_sensor
html += '<a style="%s">%s</a><br/>' % (color_percents(100), ' 100% coverage')
html += '<a style="%s">%s</a><br/>' % (color_percents(99), '>=99% coverage')
html += '<a style="%s">%s</a><br/>' % (color_percents(95), '>=95% coverage')
html += '<a style="%s">%s</a><br/><br/>' % (color_percents(0), ' <95% coverage')
# Create daily entries
cols = list(df_season.columns.values)
for index, row in df_season.iterrows():
html += render_date_entry(primary_sensor, cols, row, index)
html += "</div>"
return html
else:
current_csv = pipeline_csv.format(sensor_name)
df = pd.read_csv(current_csv, index_col=False)
percent_columns = get_percent_columns(df)
for each in percent_columns:
df[each] = df[each].mul(100).astype(int)
dfs = df.style
dfs.applymap(color_percents, subset=percent_columns).set_table_attributes("border=1")
my_html = dfs.render()
return my_html
@app.route('/submitmissing/<sensor_name>/<target>/<date>')
@utils.requires_user("admin")
def submit_missing_timestamps(sensor_name, target, date):
sensordef = count_defs[sensor_name]
targetdef = sensordef[target]
extractorname = targetdef["extractor"]
submitted = []
notfound = []
if "parent" in targetdef:
# Count expected parent counts & actual current progress counts from filesystem
parentdef = sensordef[targetdef["parent"]]
parent_dir = os.path.join(parentdef["path"], date)
target_dir = os.path.join(targetdef["path"], date)
parent_timestamps = os.listdir(parent_dir)
if os.path.isdir(target_dir):
target_timestamps = os.listdir(target_dir)
else:
target_timestamps = []
disp_name = Sensors("", "ua-mac").get_display_name(targetdef["parent"])
missing = list(set(parent_timestamps)-set(target_timestamps))
for ts in missing:
if ts.find("-") > -1 and ts.find("__") > -1:
dataset_name = disp_name+" - "+ts
raw_dsid = get_dsid_by_name(dataset_name)
if raw_dsid:
submit_extraction(CONN, CLOWDER_HOST, CLOWDER_KEY, raw_dsid, extractorname)
submitted.append({"name": dataset_name, "id": raw_dsid})
else:
notfound.append({"name": dataset_name})
return json.dumps({
"extractor": extractorname,
"datasets submitted": submitted,
"datasets not found": notfound
})
@app.route('/submitrulecheck/<sensor_name>/<target>/<date>')
@utils.requires_user("admin")
def submit_rulecheck(sensor_name, target, date):
sensordef = count_defs[sensor_name]
targetdef = sensordef[target]
submitted = []
s = Sensors("", "ua-mac")
if "parent" in targetdef:
target_dir = os.path.join(sensordef[targetdef["parent"]]["path"], date)
target_timestamps = os.listdir(target_dir)
disp_name = s.get_display_name(targetdef["parent"])
for ts in target_timestamps:
if ts.find("-") > -1 and ts.find("__") > -1: # TODO: and os.listdir(os.path.join(target_dir, ts)):
# Get first populated timestamp for the date that has a Clowder ID
dataset_name = disp_name+" - "+ts
raw_dsid = get_dsid_by_name(dataset_name)
if raw_dsid:
# Submit associated Clowder ID to rulechecker
submit_extraction(CONN, CLOWDER_HOST, CLOWDER_KEY, raw_dsid, "ncsa.rulechecker.terra")
submitted.append({"name": dataset_name, "id": raw_dsid})
break
return json.dumps({
"extractor": "ncsa.rulechecker.terra",
"datasets submitted": submitted
})
@app.route('/submitmissingrulechecks/<sensor_name>/<target>/<date>')
@utils.requires_user("admin")
def submit_missing_timestamps_from_rulechecker(sensor_name, target, date):
sensordef = count_defs[sensor_name]
targetdef = sensordef[target]
extractorname = targetdef["extractor"]
submitted = []
notfound = []
if "parent" in targetdef:
# Count expected parent counts from filesystem
parentdef = sensordef[targetdef["parent"]]
parent_dir = os.path.join(parentdef["path"], date)
parent_timestamps = os.listdir(parent_dir)
# Count actual current progress counts from PSQL
psql_conn = connect_to_psql()
target_timestamps = []
query_string = targetdef["query_list"] % date
curs = psql_conn.cursor()
curs.execute(query_string)
for result in curs:
target_timestamps.append(result[0].split("/")[-2])
disp_name = Sensors("", "ua-mac").get_display_name(targetdef["parent"])
missing = list(set(parent_timestamps)-set(target_timestamps))
for ts in missing:
if ts.find("-") > -1 and ts.find("__") > -1:
dataset_name = disp_name+" - "+ts
raw_dsid = get_dsid_by_name(dataset_name)
if raw_dsid:
submit_extraction(CONN, CLOWDER_HOST, CLOWDER_KEY, raw_dsid, extractorname)
submitted.append({"name": dataset_name, "id": raw_dsid})
else:
notfound.append({"name": dataset_name})
return json.dumps({
"extractor": extractorname,
"datasets submitted": submitted,
"datasets not found": notfound
})
@app.route('/submitmissingregex/<sensor_name>/<target>/<date>')
@utils.requires_user("admin")
def submit_missing_regex(sensor_name, target, date):
sensordef = count_defs[sensor_name]
targetdef = sensordef[target]
extractorname = targetdef["extractor"]
submitted = []
notfound = []
if "parent" in targetdef:
# Count expected parent counts from filesystem
parentdef = sensordef[targetdef["parent"]]
parent_dir = os.path.join(parentdef["path"], date)
if parentdef["type"] == "regex" and parentdef["path"] == targetdef["path"]:
for file in os.listdir(parent_dir):
if re.match(parentdef["regex"], file):
expected_output = file.replace(targetdef["parent_replacer_check"][1],
targetdef["parent_replacer_check"][0])
if not os.path.isfile(os.path.join(parent_dir, expected_output)):
# Find the file ID of the parent file and submit it
dataset_name = parentdef["dispname"]+" - "+date
dsid = get_dsid_by_name(dataset_name)
if dsid:
parent_id = None
dsfiles = get_file_list(CONN, CLOWDER_HOST, CLOWDER_KEY, dsid)
matchfile = file.replace("_thumb.tif", ".tif")
for dsfile in dsfiles:
if dsfile["filename"] == matchfile:
parent_id = dsfile["id"]
break
if parent_id:
submit_file_extraction(CONN, CLOWDER_HOST, CLOWDER_KEY, parent_id, extractorname)
submitted.append({"name": matchfile, "id": parent_id})
else:
notfound.append({"name": matchfile})
else:
notfound.append({"name": dataset_name})
return json.dumps({
"extractor": extractorname,
"datasets submitted": submitted,
"datasets not found": notfound
})
@app.route('/submitmissingplots/<sensor_name>/<target>/<date>')
@utils.requires_user("admin")
def submit_missing_plots(sensor_name, target, date):
sensordef = count_defs[sensor_name]
targetdef = sensordef[target]
extractorname = targetdef["extractor"]
submitted = []
notfound = []
if "parent" in targetdef:
# Count expected parent counts from filesystem
parentdef = sensordef[targetdef["parent"]]
parent_dir = os.path.join(parentdef["path"], date)
target_dir = os.path.join(targetdef["path"], date)
parent_plots = os.listdir(parent_dir)
if os.path.isdir(target_dir):
target_plots = os.listdir(target_dir)
else:
target_plots = []
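# TODO: comparing parent_plots against target_plots and submitting the
# missing plot-level outputs is not implemented yet, so this endpoint
# currently reports empty lists.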
return json.dumps({
"extractor": extractorname,
"datasets submitted": submitted,
"datasets not found": notfound
})
@app.route('/dateoptions', methods=['POST','GET'])
@utils.requires_user("admin")
def dateoptions():
form = SensorDateSelectForm(request.form)
if form.validate_on_submit():
raw_selected_sensors = form.sensors.data
for r in raw_selected_sensors:
# TODO: Currently only one sensor can be scheduled at a time
return redirect(url_for('schedule_count',
sensor=str(r),
start_range=str(form.start_date.data.strftime('%Y-%m-%d')),
end_range=str(form.end_date.data.strftime('%Y-%m-%d'))))
return render_template('dateoptions.html', form=form)
@app.route('/archive')
def archive():
sensor_list = count_defs.keys()
current_time_stamp = str(datetime.datetime.now()).replace(' ', '_')
for sensor in sensor_list:
output_file = os.path.join(config['csv_path'], sensor + ".csv")
if os.path.exists(output_file):
archived_file = os.path.join(config['csv_path'], sensor + '_' + current_time_stamp + ".csv")
os.rename(output_file, archived_file)
if os.path.exists(output_file):
try:
os.remove(output_file)
except OSError as e:
logging.info(e)
message = "Archived existing count csvs"
logging.info("Archived existing count csvs")
return redirect(url_for('sensors', message=message))
@app.route('/newschedule/<sensor>/<start_range>/<end_range>')
@utils.requires_user("admin")
def schedule_count(sensor, start_range, end_range):
dates_in_range = generate_dates_in_range(start_range, end_range)
psql_conn = connect_to_psql()
thread.start_new_thread(update_file_count_csvs, (sensor, dates_in_range, psql_conn))
message = "Custom scan scheduled for %s on %s dates" % (sensor, len(dates_in_range))
return redirect(url_for('sensors', message=message))
return app
# COUNTING COMPONENTS ----------------------------
def run_regular_update(use_defaults=False):
"""Perform regular update of previous two weeks for all sensors"""
psql_conn = connect_to_psql()
while True:
# Determine two weeks before current date, or by defaults
if use_defaults:
logging.info("Using default values instead of previous 2 weeks")
start_date_string = DEFAULT_COUNT_START
end_date_string = DEFAULT_COUNT_END
dates_to_check = generate_dates_in_range(start_date_string, end_date_string)
else:
today = datetime.datetime.now()
two_weeks = today - datetime.timedelta(days=14)
start_date_string = os.getenv('START_SCAN_DATE', two_weeks.strftime("%Y-%m-%d"))
dates_to_check = generate_dates_in_range(start_date_string)
logging.info("Checking counts for all sensors for dates %s - %s" % (start_date_string, dates_to_check[-1]))
for s in count_defs.keys():
psql_conn = update_file_count_csvs(s, dates_to_check, psql_conn)
# Wait 1 hour for next iteration
time.sleep(3600)
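# Counting strategies used below, keyed by target "type" in
# counts.SENSOR_COUNT_DEFINITIONS:
#   timestamp - count timestamp subdirectories under <path>/<date>
#   plot      - count plot subdirectories under <path>/<date>
#   regex     - count regex-matching files, per timestamp dir or per date dir
#   psql      - run the target's query_count SQL against the rulechecker DB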
def retrieve_single_count(target_count, target_def, date, psql_conn):
"""Return count of specified type (see counts.py for types)"""
count = 0
if target_def["type"] == "timestamp":
date_dir = os.path.join(target_def["path"], date)
if os.path.exists(date_dir):
logging.info(" [%s] counting timestamps in %s" % (target_count, date_dir))
# TODO: Only count non-empty directories
"""count = 0
for sub in os.listdir(date_dir):
if os.listdir(os.path.join(date_dir, sub)):
count += 1"""
count = len(os.listdir(date_dir))
else:
logging.info(" [%s] directory not found: %s" % (target_count, date_dir))
elif target_def["type"] == "plot":
date_dir = os.path.join(target_def["path"], date)
if os.path.exists(date_dir):
logging.info(" [%s] counting plots in %s" % (target_count, date_dir))
# TODO: Only count non-empty directories
"""count = 0
for sub in os.listdir(date_dir):
if os.listdir(os.path.join(date_dir, sub)):
count += 1"""
count = len(os.listdir(date_dir))
else:
logging.info(" [%s] directory not found: %s" % (target_count, date_dir))
elif target_def["type"] == "regex":
date_dir = os.path.join(target_def["path"], date)
if os.path.exists(date_dir):
logging.info(" [%s] matching regex against %s" % (target_count, date_dir))
for date_content in os.listdir(date_dir):
if os.path.isdir(os.path.join(date_dir, date_content)):
# This is timestamp-level search
ts_dir = os.path.join(date_dir, date_content)
for file in os.listdir(ts_dir):
if re.match(target_def["regex"], file):
count += 1
else:
# No timestamp (e.g. fullfield)
if re.match(target_def["regex"], date_content):
count += 1
else:
logging.info(" [%s] directory not found: %s" % (target_count, date_dir))
elif target_def["type"] == "psql":
logging.info(" [%s] querying PSQL records for %s" % (target_count, date))
query_string = target_def["query_count"] % date
curs = psql_conn.cursor()
curs.execute(query_string)
for result in curs:
count = result[0]
return count
def update_file_count_csvs(sensor, dates_to_check, psql_conn):
"""Perform necessary counting on specified dates to update CSV for all sensors."""
global SCAN_LOCK
# SCAN_LOCK is a simple module-level flag used to serialize scans across
# threads, polled at a coarse 60-second interval.
while SCAN_LOCK:
logging.info("Another thread currently locking database; waiting 60 seconds to retry")
time.sleep(60)
logging.info("Locking scan for %s on %s dates" % (sensor, len(dates_to_check)))
SCAN_LOCK = True
output_file = os.path.join(config['csv_path'], sensor+".csv")
logging.info("Updating counts for %s into %s" % (sensor, output_file))
targets = count_defs[sensor]
cols = ["date"]
for target_count in targets:
target_def = targets[target_count]
cols.append(target_count)
if "parent" in target_def:
cols.append(target_count + '%')
# Load data frame from existing CSV or create a new one
if os.path.exists(output_file):
logging.info("csv exists for %s" % output_file)
try:
df = pd.read_csv(output_file)
except Exception as e:
logging.info(e)
logging.info("CSV existed but could not be read, created dataframe for %s " % sensor)
df = pd.DataFrame(columns=cols)
df_columns = list(df.columns.values)
if df_columns != cols:
logging.info("CSV existed but had malformed columns, created dataframe for %s " % sensor)
df = pd.DataFrame(columns=cols)
else:
logging.info("output file for %s does not exist" % sensor)
df = pd.DataFrame(columns=cols)
logging.info("CSV did not exist, created dataframe for %s " % sensor)
# Populate count and percentage (if applicable) for each target count
logging.info("the columns of the csv are %s " % str(df.columns.values))
for current_date in dates_to_check:
logging.info("[%s] %s" % (sensor, current_date))
counts = {}
percentages = {}
for target_count in targets:
target_def = targets[target_count]
try:
counts[target_count] = retrieve_single_count(target_count, target_def, current_date, psql_conn)
except Exception:
# Reconnect once in case the PSQL connection went stale, then retry
psql_conn = connect_to_psql()
counts[target_count] = retrieve_single_count(target_count, target_def, current_date, psql_conn)
if "parent" in target_def:
if target_def["parent"] not in counts:
counts[target_def["parent"]] = retrieve_single_count(target_def["parent"], targets[target_def["parent"]], current_date, psql_conn)
if counts[target_def["parent"]] > 0:
percentages[target_count] = (counts[target_count]*1.0)/(counts[target_def["parent"]]*1.0)
else:
percentages[target_count] = 0.0
# If this date already has a row, just update
if current_date in df['date'].values:
logging.info("Already have data for date %s " % current_date)
updated_entry = [current_date]
for target_count in targets:
target_def = targets[target_count]
updated_entry.append(counts[target_count])
if "parent" in target_def:
updated_entry.append(percentages[target_count])
df.loc[df['date'] == current_date] = updated_entry
# If not, create a new row
else:
logging.info("No data for date %s adding to dataframe" % current_date)
new_entry = [current_date]
indices = ["date"]
for target_count in targets:
target_def = targets[target_count]
indices.append(target_count)
new_entry.append(counts[target_count])
if "parent" in target_def:
indices.append(target_count+'%')
new_entry.append(percentages[target_count])
df = df.append(pd.Series(new_entry, index=indices), ignore_index=True)
logging.info("Writing %s" % output_file)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df.sort_values(by=['date'], inplace=True, ascending=True)
df.to_csv(output_file, index=False)
SCAN_LOCK = False
return psql_conn
if __name__ == '__main__':
logger = logging.getLogger('counter')
config = load_json_file(os.path.join(app_dir, "config_default.json"))
if os.path.exists(os.path.join(app_dir, "data/config_custom.json")):
print("...loading configuration from config_custom.json")
config = update_nested_dict(config, load_json_file(os.path.join(app_dir, "data/config_custom.json")))
try:
DEFAULT_COUNT_START = str(config["default_count_start"])
DEFAULT_COUNT_END = str(config["default_count_end"])
print(DEFAULT_COUNT_START, DEFAULT_COUNT_END)
print("default start and end provided")
except KeyError:
print("No default values for start and end")
else:
print("...no custom configuration file found. using default values")
# Initialize logger handlers
with open(os.path.join(app_dir, "config_logging.json"), 'r') as f:
log_config = json.load(f)
main_log_file = os.path.join(config["log_path"], "log_filecounter.txt")
log_config['handlers']['file']['filename'] = main_log_file
if not os.path.exists(config["log_path"]):
os.makedirs(config["log_path"])
if not os.path.isfile(main_log_file):
open(main_log_file, 'a').close()
logging.config.dictConfig(log_config)
thread.start_new_thread(run_regular_update, (True,))
apiIP = os.getenv('COUNTER_API_IP', "0.0.0.0")
apiPort = os.getenv('COUNTER_API_PORT', "5454")
app = create_app()
logger.info("*** API now listening on %s:%s ***" % (apiIP, apiPort))
app.run(host=apiIP, port=apiPort)