"""
This script gathers markdown files from all of Netdata's repos into this repo.
Stages of this ingest script:
Stage_1: Ingest every available markdown file from the default_repos
Stage_2: Create three buckets:
    1. all_markdown_files: all the markdown files in default_repos
    2. markdown_files_with_metadata: all the markdown files that have hidden metadata fields
    3. to_publish: markdown files that must be included in Learn
       (metadata key/value: "learn_status": "Published")
Stage_3:
    1. Move the to_publish markdown files under the DOCS_PREFIX folder based on their metadata
       (the files themselves decide where they live)
    2. Generate the autogenerated pages
Stage_4: Sanitization
    1. Turn the hidden metadata fields into readable front matter for Docusaurus
Stage_5: Convert GitHub links to version-specific links
"""
# Imports
import argparse
import glob
import os
import re
import shutil
import errno
import json
import ast
import git
import autogenerateRedirects as genRedirects
import pandas as pd
import numpy as np
from pathlib import Path
DRY_RUN = False
DEBUG = False
DOCS_PREFIX = "will be added by arguments"
rest_files_dictionary = {}
rest_files_with_metadata_dictionary = {}
to_publish = {}
all_markdown_files = []
UNCORRELATED_LINK_COUNTER = 0
FAIL_ON_NETDATA_BROKEN_LINKS = False
# Temporary until we release (then change the default to /docs)
# version_prefix = "nightly" # We use this as the version prefix in the link strategy
TEMP_FOLDER = "ingest-temp-folder"
default_repos = {
"netdata":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
"go.d.plugin":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
".github":
{
"owner": "netdata",
"branch": "main",
"HEAD": "main",
},
"agent-service-discovery":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
"netdata-grafana-datasource-plugin":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
"helmchart":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
}
}
def clean_and_lower_string(string):
return re.sub(r'(-)+', '-', string.lower().replace(",", "-").replace(" ", "-").replace("//", "/"))
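# Illustrative example of the helper above: commas and spaces become dashes, and runs of dashes
# collapse into one, e.g. clean_and_lower_string("Data Collection, APM") -> "data-collection-apm"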
def populate_integrations(markdownFiles):
"""
    If the file is a symlink, read it directly; if not, look inside the integrations folder.
"""
print("### Populating map from Integration metadata rows ###\n")
metadata_dictionary = {}
ignore_dup = []
# Read the map file, to replace the placeholder for the dynamic part
map_file = pd.read_csv("map.tsv", sep='\t')
collectors_entries = pd.DataFrame()
exporting_entries = pd.DataFrame()
alerting_agent_entries = pd.DataFrame()
alerting_cloud_entries = pd.DataFrame()
readmes_first = []
others_last = []
for file in markdownFiles:
if "README.md" in file:
readmes_first.append(file)
else:
others_last.append(file)
markdownFiles = readmes_first + others_last
for file in markdownFiles:
path = file.split("integrations")[0].replace("README.md", "")
whole_file = Path(file).read_text()
if whole_file not in ignore_dup and "DO NOT EDIT THIS FILE DIRECTLY" in whole_file:
meta = whole_file.split(
"endmeta-->")[0].replace("<!--startmeta", "---") + "---"
metadata_dictionary = read_metadata(meta)
if os.path.islink(file):
ignore_dup.append(whole_file)
                    # If it is a manual symlink (a README symlink in a folder with more than one integration),
                    # the integrations' custom_edit_urls are unique; 1:1 integrations use the README link as
                    # their custom_edit_url.
if not file.replace("ingest-temp-folder/", "").split('/', 1)[1] in metadata_dictionary['custom_edit_url']:
proper_edit_url = file.replace(
"ingest-temp-folder/", "")
proper_edit_url = "https://github.com/netdata/" + \
proper_edit_url.split(
'/', 1)[0] + "/edit/master/" + proper_edit_url.split('/', 1)[1]
metadata_dictionary['custom_edit_url'] = proper_edit_url
# print("path:", file)
# print(metadata_dictionary)
metadf = pd.DataFrame([metadata_dictionary])
if "collectors" in path or "modules" in path:
collectors_entries = pd.concat(
[collectors_entries, metadf])
# print(collectors_entries)
# quit()
elif "exporting" in path:
exporting_entries = pd.concat([exporting_entries, metadf])
# print(exporting_entries)
# here we need a different check, as the path variable gets messed up
elif "cloud-notifications" in file:
# print("in")
alerting_cloud_entries = pd.concat(
[alerting_cloud_entries, metadf])
else:
alerting_agent_entries = pd.concat(
[alerting_agent_entries, metadf])
# print("Collectors\n", collectors_entries, "Agent alerts\n", alerting_agent, "Cloud alerts\n", alerting_cloud, "Exporting", exporting_entries)
replace_index = map_file.loc[map_file['custom_edit_url']
== "collectors_integrations"].index
# print(replace_index[0])
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, collectors_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
replace_index = map_file.loc[map_file['custom_edit_url']
== "agent_notifications_integrations"].index
# print(replace_index[0])
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, alerting_agent_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
replace_index = map_file.loc[map_file['custom_edit_url']
== "cloud_notifications_integrations"].index
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, alerting_cloud_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
replace_index = map_file.loc[map_file['custom_edit_url']
== "exporters_integrations"].index
# print(replace_index[0])
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, exporting_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
map_file.to_csv("ingest/generated_map.tsv", sep='\t', index=False)
# quit()
return map_file
def unsafe_cleanup_folders(folder_to_delete):
"""Cleanup every file in the specified folderToDelete."""
print("Try to clean up the folder: ", folder_to_delete)
try:
shutil.rmtree(folder_to_delete)
print("Done")
except Exception as e:
print("Couldn't delete the folder due to the exception: \n", e)
def produce_gh_view_link_for_repo(repo, file_path):
"""
    This function returns the GitHub view link of a file in a repo, e.g. <owner>/<repo>.
    Limitation: it produces only master/main links, and only for the netdata org.
"""
if repo == ".github":
return f"https://github.com/netdata/{repo}/blob/main/{file_path}"
else:
return f"https://github.com/netdata/{repo}/blob/master/{file_path}"
def produce_gh_edit_link_for_repo(repo, file_path):
"""
    This function returns the GitHub edit link of a file in a repo, e.g. <owner>/<repo>.
    Limitation: it produces only master/main links, and only for the netdata org.
"""
if repo == ".github":
return f"https://github.com/netdata/{repo}/edit/main/{file_path}"
else:
return "https://github.com/netdata/{repo}/edit/master/{file_path}"
def safe_cleanup_learn_folders(folder_to_delete):
"""
    Clean up every file in the specified folder_to_delete that doesn't have the `part_of_learn: True`
    field in its metadata. It also prints how many files were deleted.
"""
deleted_files = []
md_files = fetch_markdown_from_repo(folder_to_delete)
print(
f"Files in the {folder_to_delete} folder #{len(md_files)} which are about to be deleted")
for md in md_files:
metadata = read_metadata(Path(md).read_text().split("-->")[0])
try:
if "part_of_learn" in metadata.keys():
                # Redundant condition to emphasize what we are looking for when we clean up Learn files
if metadata["part_of_learn"] == "True":
pass
else:
deleted_files.append(md)
os.remove(md)
except Exception as e:
print(f"Couldn't delete the {md} file reason: {e}")
print(
f"Cleaned up #{len(deleted_files)} files under {folder_to_delete} folder")
def verify_string_is_dictionary(string_input):
"""
    Verify that a string input can be parsed as a dictionary.
"""
try:
if isinstance(ast.literal_eval(string_input), dict):
return True
else:
return False
except:
return False
def unpack_dictionary_string_to_dictionary(string_input):
return ast.literal_eval(string_input)
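# Illustrative example of the two helpers above:
#   verify_string_is_dictionary("{'key': 'value'}")            -> True
#   unpack_dictionary_string_to_dictionary("{'key': 'value'}") -> {'key': 'value'}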
def copy_doc(src, dest):
"""
Copy a file
"""
# Get the path
try:
shutil.copy(src, dest)
except IOError as e:
# ENOENT(2): file does not exist, raised also on missing dest parent dir
if e.errno != errno.ENOENT:
raise
# try creating parent directories
os.makedirs(os.path.dirname(dest))
shutil.copy(src, dest)
def clone_repo(owner, repo, branch, depth, prefix_folder):
"""
    Clone a repo at a specific depth and place it under the prefix_folder
INPUTS:
https://github.com/{owner}/{repo}:{branch}
as depth we specify the history of the repo (depth=1 fetches only the latest commit in this repo)
"""
try:
output_folder = prefix_folder + repo
# print("DEBUG", outputFolder)
git.Git().clone(
f"https://github.com/{owner}/{repo}.git", output_folder, depth=depth, branch=branch)
return f"Cloned the {branch} branch from {repo} repo (owner: {owner})"
except Exception as e:
return f"Couldn't clone the {branch} branch from {repo} repo (owner: {owner}) \n Exception {e} raised"
def create_mdx_path_from_metadata(metadata):
"""
    Create a path from the document's metadata.
    REQUIRED KEYS in the metadata input:
    [sidebar_label, learn_rel_path]
    In the returned (final) path we collapse "/", "//", "-", "," into a single dash.
"""
final_file = ' '.join((metadata["sidebar_label"]
.replace("'", " ")
.replace(":", " ")
.replace("/", " ")
.replace(")", " ")
.replace(",", " ")
.replace("(", " ")
.replace("`", " ")).split())
if "Data Collection" in metadata['learn_rel_path']\
and metadata['learn_rel_path'].split("/")[-1] != "Data Collection" and 'External-plugins' not in metadata['learn_rel_path']:
last_folder = metadata['learn_rel_path'].split("Data Collection", 1)[1]
last_folder = "data-collection" + last_folder
# print(last_folder)
# exit()
        # If the file is inside the Data Collection category (Docusaurus renders the sidebar
        # category label from the folder name), return an array of two things:
        # [the final path, the proper slug].
        # We use the slug to avoid having %20 (encoded spaces) in the file's link.
return ["{}/{}/{}.mdx".format(DOCS_PREFIX,
metadata["learn_rel_path"]
.split("Data Collection")[0].lower().replace(" ", "-") + last_folder,
final_file.replace(" ", "-")).replace("//", "/"),
"/{}/{}".format(metadata["learn_rel_path"],
final_file.replace(" ", "-")).lower().replace(" ", "-").replace("//", "/")]
else:
return ("{}/{}/{}.mdx".format(DOCS_PREFIX,
metadata["learn_rel_path"],
final_file.replace(" ", "-")).lower().replace(" ", "-").replace("//", "/"))
def fetch_markdown_from_repo(output_folder):
return glob.glob(
output_folder + '/**/*.md*', recursive=True) + glob.glob(output_folder + '/.**/*.md*', recursive=True)
def insert_and_read_hidden_metadata_from_doc(path_to_file, dictionary):
"""
    Takes the path of a file as input.
    Prepends the hidden metadata block built from the map dictionary, then identifies the area with
    the pattern "<!-- ...multiline string -->" and converts it into a dictionary of key:value pairs.
"""
# TODO work here, predict yaml file from path, should be easy, if readme try os.exists for meta yaml, if inside integrations folder, try one out.
# TODO unique in custom edit url might need custom editurl + sidebar_label so it can be reproduced here.
repo, path = path_to_file.replace("ingest-temp-folder/", "").split('/', 1)
if repo == ".github":
key = "https://github.com/netdata/" + repo + "/edit/main" + "/" + path
else:
key = "https://github.com/netdata/" + repo + "/edit/master" + "/" + path
output = ""
for field in dictionary.loc[dictionary['custom_edit_url'] == key]:
try:
val = dictionary.loc[dictionary['custom_edit_url']
== key][field].values[0]
# print((not val == np.nan), val != val, val)
val = str(val)
if (not val == np.nan) and val != "nan":
if field == "learn_rel_path":
if val == "root":
# print("ROOT")
val = "/"
if "Data Collection" in val or "Data Collection" in val:
output += "toc_max_heading_level: 4\n"
if field == "sidebar_position":
output += "{0}: \"{1}\"\n".format(field,
val.replace("\"", ""))
else:
output += "{0}: \"{1}\"\n".format(field,
val.replace("\"", ""))
except Exception as e:
pass
if len(output) > 0:
output = "<!--\n" + output + "-->\n"
whole_file = Path(path_to_file).read_text()
if whole_file.startswith("<!--"):
body = whole_file.split("-->", 1)[1]
else:
body = whole_file
Path(path_to_file).write_text(output + body)
# print(path_to_file, output)
metadata_dictionary = {}
with open(path_to_file, "r+") as fd:
raw_text = "".join(fd.readlines())
pattern = r"((^<!--|^---)\n)((.|\n)*?)(\n(-->|---))"
match_group = re.search(pattern, raw_text)
# print(match_group)
if match_group:
raw_metadata = match_group[3]
list_metadata = raw_metadata.split("\n")
while list_metadata:
line = list_metadata.pop(0)
split_in_keywords = line.split(": ", 1)
key = split_in_keywords[0]
value = split_in_keywords[1]
if verify_string_is_dictionary(value):
value = unpack_dictionary_string_to_dictionary(value)
# If it's a multiline string
while list_metadata and len(list_metadata[0].split(": ", 1)) <= 1:
line = list_metadata.pop(0)
value = value + line.lstrip(' ')
value = value.strip("\"")
metadata_dictionary[key] = value.lstrip('>-')
# print("\n\n")
return metadata_dictionary
else:
return []
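# Roughly, for a file whose custom_edit_url matches a row in the map dataframe, the function above
# prepends a hidden metadata block such as (illustrative values):
#   <!--
#   sidebar_label: "Example collector"
#   learn_rel_path: "Integrations"
#   -->
# and returns the parsed fields as a dictionary.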
def update_metadata_of_file(path_to_file, dictionary):
"""
Taking a path of a file as input
Identify the area with pattern
"<!-- ...multiline string -->"
and converts them to a dictionary
of key:value pairs
"""
output = ""
for field in dictionary:
val = str(dictionary[field]).replace("\"", "")
output += f"{field}: \"{val}\"\n"
if len(output) > 0:
output = "<!--\n" + output + "-->"
whole_file = Path(path_to_file).read_text()
if whole_file.startswith("<!--"):
body = whole_file.split("-->", 1)[1]
else:
body = whole_file
Path(path_to_file).write_text(output+body)
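# Illustrative example: update_metadata_of_file(path, {"sidebar_label": "Example"}) rewrites the
# top of the file so that it starts with:
#   <!--
#   sidebar_label: "Example"
#   -->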
def read_metadata(meta):
metadata_dictionary = {}
pattern = r"((<!--|---)\n)((.|\n)*?)(\n(-->|---))"
match_group = re.search(pattern, meta)
# If metadata is found
if match_group:
raw_metadata = match_group[3]
list_metadata = raw_metadata.split("\n")
# Split the key: value pairs
while list_metadata:
line = list_metadata.pop(0)
split_in_keywords = line.split(": ", 1)
key = split_in_keywords[0]
value = split_in_keywords[1]
if verify_string_is_dictionary(value):
value = unpack_dictionary_string_to_dictionary(
value)
# If it's a multiline string
while list_metadata and len(list_metadata[0].split(": ", 1)) <= 1:
line = list_metadata.pop(0)
value = value + line.lstrip(' ')
value = value.strip("\"")
metadata_dictionary[key] = value.lstrip('>-')
return metadata_dictionary
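# Illustrative example of read_metadata:
#   read_metadata('<!--\ntitle: "Example"\nlearn_status: "Published"\n-->')
#       -> {'title': 'Example', 'learn_status': 'Published'}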
def sanitize_page(path):
"""
Converts the
"<!--" -> "---"
"-->" -> "---"
It converts only the first occurrences of these patterns
Side effect:
    If the document doesn't have purposeful metadata but contains this pattern in its body, this function
    will still replace these patterns.
"""
body = Path(path).read_text()
# Replace the metadata with comments
body = body.replace("<!--", "---", 1)
body = body.replace("-->", "---", 1)
match_group = re.search(r'meta_yaml: "(.*)"', body)
if match_group:
# If the file has a meta_yaml field, then it is an integration, and we need to put the value into custom_edit_url too
body = re.sub(r"meta_yaml:.*\n",
"",
re.sub(r'custom_edit_url:.*',
f"custom_edit_url: \"{match_group[1]}\"",
body))
# The list with the lines that will be written in the file
output = []
# For each line of the file I read
for line in body.splitlines():
        # If the line isn't an analytics pixel, append it to the output list
if not line.startswith("[![analytics]"):
output.append(line + "\n")
output = "".join(output)
# Try to remove excess newlines from the start of the document
output = re.sub(r'---(\n\s*\n)', '---\n\n', output)
# Try to add a newline to the start of a document that has no newline
if not re.match(r'---(\n\s*\n)', output):
# print(path, "not matching")
output = output.replace("---\n", "---\n\n", 2)
# revert first line
output = output.replace("---\n\n", "---\n", 1)
# Open the file for overwriting, we are going to write the output list in the file
Path(path).write_text(output)
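# In effect, sanitize_page turns a hidden metadata block like
#   <!--
#   title: "Example"
#   -->
# into Docusaurus front matter:
#   ---
#   title: "Example"
#   ---
# and drops analytics-pixel lines from the body.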
def add_new_learn_path_key_to_dict(input_dict, docs_prefix, docs_path_learn, temp_folder):
"""
This function takes as an argument our dictionary of the Ingest process and creates a new dictionary with key-value
pairs of type Source file -> Target file (learn_absolute path)
"""
output_dictionary = dict()
for element in input_dict:
repo = input_dict[element]["ingestedRepo"]
file_path = element.replace(temp_folder+"/"+repo+"/", "")
source_link = produce_gh_view_link_for_repo(repo, file_path)
output_dictionary[source_link] = input_dict[element]["learnPath"]\
.split(".mdx")[0]\
.lstrip('"')\
.rstrip('"')\
.replace(docs_prefix, docs_path_learn)
source_link = produce_gh_edit_link_for_repo(repo, file_path)
output_dictionary[source_link] = input_dict[element]["learnPath"]\
.split(".mdx")[0]\
.lstrip('"')\
.rstrip('"')\
.replace(docs_prefix, docs_path_learn)
# Check for pages that are category overview pages, and have filepath like ".../monitor/monitor".
# This way we remove the double dirname in the end, because docusaurus routes the file to .../monitor
if output_dictionary[source_link].split("/")[len(output_dictionary[source_link].split("/"))-1] == \
output_dictionary[source_link].split("/")[len(output_dictionary[source_link].split("/"))-2]:
same_parent_dir = output_dictionary[source_link].split(
"/")[len(output_dictionary[source_link].split("/"))-2]
proper_link = output_dictionary[source_link].split(
same_parent_dir, 1)
output_dictionary[source_link] = proper_link[0] + \
proper_link[1].strip("/")
_temp = output_dictionary[source_link].replace("'", " ").replace(":", " ").replace(")", " ").replace(
",", " ").replace("(", " ").replace("/ +/g", ' ').replace(" ", "%20").replace('/-+/', '-')
# If there is a slug present in the file, then that is the new_learn_path, with a "/docs" added in the front.
try:
input_dict[element].update(
{"new_learn_path": "/docs"+input_dict[element]["metadata"]["slug"]})
except KeyError:
input_dict[element].update({"new_learn_path": _temp})
return input_dict
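# Roughly, for every ingested file the function above records a "new_learn_path" such as
# (hypothetical paths):
#   "https://github.com/netdata/netdata/blob/master/docs/example.md" -> "/docs/category/example"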
def convert_github_links(path_to_file, input_dict):
"""
Input:
path: The path to the markdown file
input_dict: the dictionary with every info about all files
Expected format of links in files:
[*](https://github.com/netdata/netdata/blob/master/*)
or go.d.plugin or any other Netdata repo
"""
whole_file = Path(path_to_file).read_text()
global UNCORRELATED_LINK_COUNTER
# Split the file into its metadata and body, so that this function doesn't touch the metadata fields
metadata = "---" + whole_file.split("---", 2)[1] + "---"
body = whole_file.split("---", 2)[2]
custom_edit_url_arr = re.findall(r'custom_edit_url(.*)', metadata)
# If there are links inside the body
if re.search(r"\]\((.*?)\)", body):
# Find all the links and add them in an array
urls = []
temp = re.findall(r'\[\n|.*?]\((\n|.*?)\)', body)
# For every link, try to not touch the heading that link points to, as it stays the same after the conversion
for link in temp:
urls.append(link.split('#')[0])
for url in urls:
# The URL will get replaced by the value of the replaceString
try:
# The keys inside fileDict are like "ingest-temp-folder/netdata/collectors/charts.d.plugin/ap/README.md"
# so from the link, we need:
# 1. replace the https link prefix up until our organization identifier with the prefix of the temp folder
# 2. try and catch any mishaps in links that instead of "blob" have "edit"
# 3. remove "blob/master/" or "blob/main/"
# 4. Then we have the correct key for the dictionary
dictionary = input_dict[url.replace("https://github.com/netdata", TEMP_FOLDER).replace(
"edit/", "blob/", 1).replace("blob/master/", "").replace("blob/main/", "")]
replace_string = dictionary["new_learn_path"]
# In some cases, a "id: someId" will be in a file, this is to change a file's link in Docusaurus,
# so we need to be careful to honor that
try:
metadata_id = dictionary["metadata"]["id"]
replace_string = replace_string.replace(
replace_string.split(
"/")[len(replace_string.split("/"))-1],
metadata_id
)
except Exception as e:
# There is no "id" metadata in the file, do nothing
pass
body = body.replace("]("+url, "]("+replace_string)
# In the end replace the URL with the replaceString
except Exception as e:
# This is probably a link that can't be translated to a Learn link (e.g. An external file)
if url.startswith("https://github.com/netdata") and re.search(r"\.md", url):
# Try to rescue an integration link
if "integrations" in url and ("collectors" in url or "modules" in url):
# Due to the integrations/cloud_notifications/integrations/.. scenario, we use rsplit to remove the last occurrence of "integrations"
# We want to map links to specific integrations mds, to their parent README, in case the above try-catch failed to find the replacement.
try_url = url.rsplit("integrations", 1)[
0] + "README.md"
# The URL will get replaced by the value of the replaceString
try:
# The keys inside fileDict are like "ingest-temp-folder/netdata/collectors/charts.d.plugin/ap/README.md"
# , so from the link, we need:
# replace the https link prefix until our organization identifier with the prefix of the temp folder
# try and catch any mishaps in links that instead of "blob" have "edit"
# remove "blob/master/" or "blob/main/"
# Then we have the correct key for the dictionary
dictionary = input_dict[try_url.replace("https://github.com/netdata", TEMP_FOLDER).replace(
"edit", "blob").replace("blob/master/", "").replace("blob/main/", "")]
replace_string = dictionary["new_learn_path"]
# In some cases, a "id: someId" will be in a file, this is to change a file's link in Docusaurus,
# so we need to be careful to honor that
try:
metadata_id = dictionary["metadata"]["id"]
replace_string = replace_string.replace(
replace_string.split(
"/")[len(replace_string.split("/"))-1],
metadata_id
)
except Exception as e:
# There is no "id" metadata in the file, do nothing
pass
# In the end replace the URL with the replaceString
body = body.replace("]("+url, "]("+replace_string)
except:
# Increase the counter of the broken links,
# fetch the custom_edit_url variable for printing and print a message
UNCORRELATED_LINK_COUNTER += 1
if len(custom_edit_url_arr[0]) > 1:
custom_edit_url = custom_edit_url_arr[0].replace(
"\"", "").strip(":")
else:
custom_edit_url = "NO custom_edit_url found, please add one"
print(UNCORRELATED_LINK_COUNTER,
"INFO: In File:",
custom_edit_url,
"\n", "URL:", url, "\n")
else:
# Increase the counter of the broken links,
# fetch the custom_edit_url variable for printing and print a message
UNCORRELATED_LINK_COUNTER += 1
if len(custom_edit_url_arr[0]) > 1:
custom_edit_url = custom_edit_url_arr[0].replace(
"\"", "").strip(":")
else:
custom_edit_url = "NO custom_edit_url found, please add one"
print(UNCORRELATED_LINK_COUNTER,
"INFO: In File:",
custom_edit_url,
"\n", "URL:", url, "\n")
# Construct again the whole file
whole_file = metadata + body
# Write everything onto the file again
Path(path_to_file).write_text(whole_file)
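# Roughly, convert_github_links rewrites body links such as (hypothetical example)
#   [ap](https://github.com/netdata/netdata/blob/master/collectors/charts.d.plugin/ap/README.md)
# into their Learn counterparts, e.g. [ap](/docs/...), while leaving the metadata block untouched.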
def automate_sidebar_position(dictionary):
"""
    This function returns a column for the map dataframe that assigns a certain number to every entry.
    There are 4 different levels:
Level 1 -> 100_000 gap between the top categories
Level 2 -> 2_000 gap between the level two categories
Level 3 -> 40 gap between the level three categories
Level 4 -> categories and documents at this level have no gap
"""
print("### Automating sidebar_position ###", '\n')
position_array = []
# counters
counter_one = 0
counter_two = 0
counter_three = 0
counter_four = 0
# Start from the first entry and keep it as the previous
split = dictionary['learn_rel_path'][0].split("/")
try:
previous_first_level = split[0]
previous_second_level = split[1]
previous_third_level = split[2]
except IndexError:
pass
# For every entry, check for every level of the path whether or not it is different.
# If it is, increment that level's counter by the specified amount.
for path, i in zip(dictionary['learn_rel_path'], range(0, len(dictionary))):
if str(path) != "nan":
split = str(path+f"/{i}").split("/")
# Split the current path
try:
current_first_level = split[0]
current_second_level = split[1]
current_third_level = split[2]
except IndexError:
pass
# This works more or less like a Greek abacus
try:
if current_first_level != previous_first_level:
counter_one += 100000
counter_two = 0
counter_three = 0
counter_four = 0
elif current_second_level != previous_second_level:
counter_two += 2000
counter_three = 0
counter_four = 0
elif current_third_level != previous_third_level:
counter_three += 40
counter_four = 0
else:
counter_four += 1
except UnboundLocalError:
pass
try:
previous_first_level = current_first_level
previous_second_level = current_second_level
previous_third_level = current_third_level
except UnboundLocalError:
pass
position_array.append(
counter_one+counter_two+counter_three+counter_four)
else:
            # If for any reason the path is nan, just add a -1; it is very unlikely that this will happen
position_array.append(-1)
return position_array
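# Roughly: each change in the top-level category adds 100_000 to the running position, a change in
# the second-level category adds 2_000, a change in the third level adds 40, and siblings add 1.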
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Ingest docs from multiple repositories')
parser.add_argument(
'--repos',
default=[],
nargs='+',
        help='Choose specific repos you want to ingest; if not set, the default repos are ingested'
)
parser.add_argument(
"--dry-run",
help="Don't save a file with the output.",
action="store_true",
)
parser.add_argument(
"-d", "--debug",
help="Enable debug printing",
action="store_true",
)
parser.add_argument(
"--docs-prefix",
help="Don't save a file with the output.",
dest="DOCS_PREFIX",
default="docs"
)
parser.add_argument(
"-f", "--fail-on-internal-broken-links",
help="Don't proceed with the process if internal broken links are found.",
action="store_true",
)
list_of_repos_in_str = []
# netdata/netdata:branch tkatsoulas/go.d.plugin:mybranch
args = parser.parse_args()
kArgs = args._get_kwargs()
# Create local copies from the parse_args input
DOCS_PREFIX = args.DOCS_PREFIX
for arg in kArgs:
if arg[0] == "repos":
list_of_repos_in_str = arg[1]
if arg[0] == "dry_run":
DRY_RUN = arg[1]
if arg[0] == "debug" or arg[0] == "debug":
if arg[1]:
DEBUG = True
print("RUNNING WITH DEBUG MESSAGES ON")
if arg[0] == "fail_on_internal_broken_links":
FAIL_ON_NETDATA_BROKEN_LINKS = arg[1]
if len(list_of_repos_in_str) > 0:
for repo_str in list_of_repos_in_str:
try:
_temp = repo_str.split("/")
repo_owner, repository, repo_branch = [
_temp[0]] + (_temp[1].split(":"))
default_repos[repository]["owner"] = repo_owner
default_repos[repository]["branch"] = repo_branch
except (TypeError, ValueError):
print(
"You specified a wrong format in at least one of the repos you want to ingest")
parser.print_usage()
exit(-1)
except KeyError:
print(repository)
print("The repo you specified in not in predefined repos")
print(default_repos.keys())
parser.print_usage()
exit(-1)
except Exception as exc:
print("Unknown error in parsing", exc)
# Clean up old clones into a temp dir
unsafe_cleanup_folders(TEMP_FOLDER)
# Clean up old ingested docs
safe_cleanup_learn_folders(DOCS_PREFIX)
print("Creating a temp directory: ", TEMP_FOLDER)
try:
os.mkdir(TEMP_FOLDER)
except FileExistsError:
print("Folder already exists")
# Clone all the predefined repos
for repo_name in default_repos.keys():
print(clone_repo(default_repos[repo_name]["owner"], repo_name,
default_repos[repo_name]["branch"], 1, TEMP_FOLDER + "/"))
# We fetch the markdown files from the repositories
all_markdown_files = fetch_markdown_from_repo(TEMP_FOLDER)
print("Files detected: ", len(all_markdown_files), "\n")
# Fill the mapDict with the metadata the integration mds have (autogenerated metadata)
mapDict = populate_integrations(all_markdown_files)
# set the index to the unique custom_edit_url column
mapDict.set_index('custom_edit_url').T.to_dict('dict')
# Automate the sidebar position
mapDict['sidebar_position'] = automate_sidebar_position(mapDict)
# Make the column type integer
mapDict['sidebar_position'] = mapDict['sidebar_position'].astype(int)
markdown_files_with_metadata = []
for markdown in all_markdown_files:
# print("File: ", markdown)
md_metadata = insert_and_read_hidden_metadata_from_doc(
markdown, mapDict)
# Check to see if the dictionary returned is empty
if len(md_metadata) > 0:
# print("FOUND METADATA", markdown)
# print(metadata)
markdown_files_with_metadata.append(markdown)
if "learn_status" in md_metadata.keys() and md_metadata["learn_status"] == "Published":
try:
# check the type of the response (for more info of what the response can be check
# the return statements of the function itself)
response = create_mdx_path_from_metadata(md_metadata)
if type(response) != str:
# If the response is not a string then it is a two item array, [final path, slug]
md_metadata.update({"slug": str(response[1])})
to_publish[markdown] = {
"metadata": md_metadata,
"learnPath": str(response[0]),
"ingestedRepo": str(markdown.split("/", 2)[1])
}
md_metadata.update(
{"learn_link": "https://learn.netdata.cloud/docs" + md_metadata['slug']})
else:
to_publish[markdown] = {
"metadata": md_metadata,
"learnPath": str(response),
"ingestedRepo": str(markdown.split("/", 2)[1])
}
# replace first ", " and then " ", this needs to be handled in a prettier way, but other updates in this file are on the way.
if md_metadata['learn_rel_path'] != md_metadata['sidebar_label']:
md_metadata.update({"learn_link": "https://learn.netdata.cloud/docs/" + clean_and_lower_string(
md_metadata['learn_rel_path']) + "/" + clean_and_lower_string(md_metadata['sidebar_label'])})
else:
md_metadata.update(
{"learn_link": "https://learn.netdata.cloud/docs/" + clean_and_lower_string(md_metadata['learn_rel_path'])})
update_metadata_of_file(markdown, md_metadata)
except KeyError as exc:
print(
f"File {markdown} doesn't contain key-value", exc)
else:
# We don't need these files
rest_files_with_metadata_dictionary[markdown] = {
"metadata": md_metadata,
"learnPath": str(f"docs/_archive/_{markdown}"),
"ingestedRepo": str(markdown.split("/", 2)[1])
}
# Don't fail on empty markdown
elif not os.stat(markdown).st_size == 0:
rest_files_dictionary[markdown] = {"tmpPath": markdown}
del md_metadata
# FILE MOVING
print("### Moving files ###\n")
# identify published documents
print(f"### Found Learn files: {len(to_publish)}###\n")
for md_file in to_publish:
copy_doc(md_file, to_publish[md_file]["learnPath"])
sanitize_page(to_publish[md_file]["learnPath"])