/
clean_and_agg_seqs_small.py
72 lines (58 loc) · 2.2 KB
/
clean_and_agg_seqs_small.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import glob
import json
# Root folder whose immediate sub-folders are scanned for positive-class JSON records.
positive_folder = 'D:\\mids\\data\\stage\\positive'
# File name used for the aggregated positive records (one JSON object per line).
positive_end_file = 'parsed_positive.json'
# Root folder whose immediate sub-folders are scanned for negative-class JSON records.
negative_folder = 'D:\\mids\\data\\stage\\negative'
# File name used for the aggregated negative records (one JSON object per line).
negative_end_file = 'parsed_negative.json'
# Folder that receives every aggregated output file written by this script.
ingest = 'D:\\mids\\data\\stage\\ingest_test'
# File name of the combined (positive followed by negative) output under `ingest`.
ingest_end_file = 'parsed.json'
#create endpoints if they don't exist
def touch(fname):
    """Create *fname* if it is missing and set its timestamps to now.

    The file is opened in append mode, so existing content is never
    truncated.  ``os.utime(fname, None)`` then sets both the access and
    modification times to the current time.
    """
    with open(fname, 'a'):
        pass
    os.utime(fname, None)
# Make sure the per-class endpoint file exists in each source folder,
# positive first, then negative.
for _endpoint_dir, _endpoint_name in (
    (positive_folder, positive_end_file),
    (negative_folder, negative_end_file),
):
    touch(_endpoint_dir + '\\' + _endpoint_name)
def _collect_labelled(root, label, max_folders=10, min_bytes=300):
    """Load and label the JSON records found one level below *root*.

    Args:
        root: Folder whose immediate sub-folders contain the ``.json`` files.
        label: Value stored in ``record['files'][0]['label']`` for every
            record that is loaded.
        max_folders: Upper bound on how many sub-folders are scanned.
        min_bytes: Files of this size or smaller are skipped.

    Returns:
        A list with one parsed, labelled record per accepted file.
    """
    # Only real directories count towards the folder limit: the root may also
    # contain plain files (for example the endpoint file the script touches
    # there), and those would otherwise use up one of the scanned slots.
    # NOTE: glob order is filesystem-dependent, so which folders fall inside
    # the limit is too.
    folders = [
        entry for entry in glob.glob(os.path.join(root, '*'))
        if os.path.isdir(entry)
    ]

    records = []
    for folder in folders[:max_folders]:
        for path in glob.glob(os.path.join(folder, '*')):
            if not path.endswith('.json') or os.path.getsize(path) <= min_bytes:
                continue
            # Every file is parsed into its own record before labelling, so
            # no state can leak from a previously processed file.
            with open(path, 'r') as infile:
                record = json.load(infile)
            record['files'][0]['label'] = label
            records.append(record)
    return records


def _write_json_lines(path, records):
    """Write *records* to *path* as JSON Lines (one JSON document per line)."""
    with open(path, 'w') as outfile:
        for record in records:
            json.dump(record, outfile)
            outfile.write('\n')


# Positive set: collect, report the count, write to the ingest folder.
result_positive = _collect_labelled(positive_folder, 'Positive')
print(len(result_positive))
_write_json_lines(os.path.join(ingest, positive_end_file), result_positive)

# Negative set: same pipeline, each record parsed from its own file.
result_negative = _collect_labelled(negative_folder, 'Negative')
print(len(result_negative))
_write_json_lines(os.path.join(ingest, negative_end_file), result_negative)
# Combined data set: every positive record followed by every negative one.
total = [*result_positive, *result_negative]

# Written as JSON Lines (one serialized record per line) rather than as a
# single JSON array.
with open(ingest + '\\' + ingest_end_file, 'w') as combined_out:
    for entry in total:
        combined_out.write(json.dumps(entry) + '\n')

print('Complete.')