"""
This script gathers markdown files from all of Netdata's repos into this repo.
Stages of this ingest script:
Stage_1: Ingest every available markdown file from the default_repos
Stage_2: Create three buckets:
    1. all_markdown_files: all the markdown files in default_repos
    2. markdown_files_with_metadata: all the markdown files that have hidden metadata fields
    3. to_publish: markdown files that must be included in Learn
       (metadata key/value: "learn_status": "Published")
Stage_3:
    1. Move the to_publish markdown files under the DOCS_PREFIX folder based on their metadata
       (the files themselves decide where they live)
    2. Generate the autogenerated pages
Stage_4: Sanitization
    1. Turn the hidden metadata fields into readable front matter for Docusaurus
Stage_5: Convert GitHub links to version-specific links
"""
# Imports
import argparse
import glob
import os
import re
import shutil
import errno
import json
import ast
import git
import autogenerateRedirects as genRedirects
import pandas as pd
import numpy as np
from pathlib import Path
DRY_RUN = False
DEBUG = False
DOCS_PREFIX = "will be added by arguments"
rest_files_dictionary = {}
rest_files_with_metadata_dictionary = {}
to_publish = {}
all_markdown_files = []
UNCORRELATED_LINK_COUNTER = 0
FAIL_ON_NETDATA_BROKEN_LINKS = False
# Temporary until we release (then change the default to /docs)
# version_prefix = "nightly" # We use this as the version prefix in the link strategy
TEMP_FOLDER = "ingest-temp-folder"
default_repos = {
"netdata":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
"go.d.plugin":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
".github":
{
"owner": "netdata",
"branch": "main",
"HEAD": "main",
},
"agent-service-discovery":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
"netdata-grafana-datasource-plugin":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
},
"helmchart":
{
"owner": "netdata",
"branch": "master",
"HEAD": "master",
}
}
def clean_and_lower_string(string):
return re.sub(r'(-)+', '-', string.lower().replace(",", "-").replace(" ", "-").replace("//", "/"))
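# Illustrative example of the helper above: commas and spaces become dashes, and runs of dashes
# collapse into one, e.g. clean_and_lower_string("Data Collection, APM") -> "data-collection-apm"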
def populate_integrations(markdownFiles):
"""
    If the file is a symlink, read it directly; if not, look inside the integrations folder.
"""
print("### Populating map from Integration metadata rows ###\n")
metadata_dictionary = {}
ignore_dup = []
# Read the map file, to replace the placeholder for the dynamic part
map_file = pd.read_csv("map.tsv", sep='\t')
collectors_entries = pd.DataFrame()
exporting_entries = pd.DataFrame()
alerting_agent_entries = pd.DataFrame()
alerting_cloud_entries = pd.DataFrame()
readmes_first = []
others_last = []
for file in markdownFiles:
if "README.md" in file:
readmes_first.append(file)
else:
others_last.append(file)
markdownFiles = readmes_first + others_last
for file in markdownFiles:
path = file.split("integrations")[0].replace("README.md", "")
whole_file = Path(file).read_text()
if whole_file not in ignore_dup and "DO NOT EDIT THIS FILE DIRECTLY" in whole_file:
meta = whole_file.split(
"endmeta-->")[0].replace("<!--startmeta", "---") + "---"
metadata_dictionary = read_metadata(meta)
if os.path.islink(file):
ignore_dup.append(whole_file)
                    # If it is a manual symlink (a README symlink in a folder with more than one integration),
                    # the integrations' custom_edit_urls are unique; 1:1 integrations use the README link as
                    # their custom_edit_url.
if not file.replace("ingest-temp-folder/", "").split('/', 1)[1] in metadata_dictionary['custom_edit_url']:
proper_edit_url = file.replace(
"ingest-temp-folder/", "")
proper_edit_url = "https://github.com/netdata/" + \
proper_edit_url.split(
'/', 1)[0] + "/edit/master/" + proper_edit_url.split('/', 1)[1]
metadata_dictionary['custom_edit_url'] = proper_edit_url
# print("path:", file)
# print(metadata_dictionary)
metadf = pd.DataFrame([metadata_dictionary])
if "collectors" in path or "modules" in path:
collectors_entries = pd.concat(
[collectors_entries, metadf])
# print(collectors_entries)
# quit()
elif "exporting" in path:
exporting_entries = pd.concat([exporting_entries, metadf])
# print(exporting_entries)
# here we need a different check, as the path variable gets messed up
elif "cloud-notifications" in file:
# print("in")
alerting_cloud_entries = pd.concat(
[alerting_cloud_entries, metadf])
else:
alerting_agent_entries = pd.concat(
[alerting_agent_entries, metadf])
# print("Collectors\n", collectors_entries, "Agent alerts\n", alerting_agent, "Cloud alerts\n", alerting_cloud, "Exporting", exporting_entries)
replace_index = map_file.loc[map_file['custom_edit_url']
== "collectors_integrations"].index
# print(replace_index[0])
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, collectors_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
replace_index = map_file.loc[map_file['custom_edit_url']
== "agent_notifications_integrations"].index
# print(replace_index[0])
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, alerting_agent_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
replace_index = map_file.loc[map_file['custom_edit_url']
== "cloud_notifications_integrations"].index
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, alerting_cloud_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
replace_index = map_file.loc[map_file['custom_edit_url']
== "exporters_integrations"].index
# print(replace_index[0])
upper = map_file.iloc[:replace_index[0]]
lower = map_file.iloc[replace_index[0]+1:]
map_file = pd.concat([upper, exporting_entries.sort_values(
by=['sidebar_label'], key=lambda col: col.str.lower()), lower], ignore_index=True)
map_file.to_csv("ingest/generated_map.tsv", sep='\t', index=False)
# quit()
return map_file
def unsafe_cleanup_folders(folder_to_delete):
"""Cleanup every file in the specified folderToDelete."""
print("Try to clean up the folder: ", folder_to_delete)
try:
shutil.rmtree(folder_to_delete)
print("Done")
except Exception as e:
print("Couldn't delete the folder due to the exception: \n", e)
def produce_gh_view_link_for_repo(repo, file_path):
"""
    This function returns the GitHub view link of a file in a repo, e.g. <owner>/<repo>.
    Limitation: it produces only master/main links, and only for the netdata org.
"""
if repo == ".github":
return f"https://github.com/netdata/{repo}/blob/main/{file_path}"
else:
return f"https://github.com/netdata/{repo}/blob/master/{file_path}"
def produce_gh_edit_link_for_repo(repo, file_path):
"""
    This function returns the GitHub edit link of a file in a repo, e.g. <owner>/<repo>.
    Limitation: it produces only master/main links, and only for the netdata org.
"""
if repo == ".github":
return f"https://github.com/netdata/{repo}/edit/main/{file_path}"
else:
return "https://github.com/netdata/{repo}/edit/master/{file_path}"
def safe_cleanup_learn_folders(folder_to_delete):
"""
    Clean up every file in the specified folder_to_delete that doesn't have the `part_of_learn: True`
    field in its metadata. It also prints how many files were deleted.
"""
deleted_files = []
md_files = fetch_markdown_from_repo(folder_to_delete)
print(
f"Files in the {folder_to_delete} folder #{len(md_files)} which are about to be deleted")
for md in md_files:
metadata = read_metadata(Path(md).read_text().split("-->")[0])
try:
if "part_of_learn" in metadata.keys():
                # Redundant condition to emphasize what we are looking for when we clean up Learn files
if metadata["part_of_learn"] == "True":
pass
else:
deleted_files.append(md)
os.remove(md)
except Exception as e:
print(f"Couldn't delete the {md} file reason: {e}")
print(
f"Cleaned up #{len(deleted_files)} files under {folder_to_delete} folder")
def verify_string_is_dictionary(string_input):
"""
    Verify that a string input can be parsed as a dictionary.
"""
try:
if isinstance(ast.literal_eval(string_input), dict):
return True
else:
return False
except:
return False
def unpack_dictionary_string_to_dictionary(string_input):
return ast.literal_eval(string_input)
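# Illustrative example of the two helpers above:
#   verify_string_is_dictionary("{'key': 'value'}")            -> True
#   unpack_dictionary_string_to_dictionary("{'key': 'value'}") -> {'key': 'value'}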
def copy_doc(src, dest):
"""
Copy a file
"""
# Get the path
try:
shutil.copy(src, dest)
except IOError as e:
# ENOENT(2): file does not exist, raised also on missing dest parent dir
if e.errno != errno.ENOENT:
raise
# try creating parent directories
os.makedirs(os.path.dirname(dest))
shutil.copy(src, dest)
def clone_repo(owner, repo, branch, depth, prefix_folder):
"""
    Clone a repo at a specific depth and place it under the prefix_folder
INPUTS:
https://github.com/{owner}/{repo}:{branch}
as depth we specify the history of the repo (depth=1 fetches only the latest commit in this repo)
"""
try:
output_folder = prefix_folder + repo
# print("DEBUG", outputFolder)
git.Git().clone(
f"https://github.com/{owner}/{repo}.git", output_folder, depth=depth, branch=branch)
return f"Cloned the {branch} branch from {repo} repo (owner: {owner})"
except Exception as e:
return f"Couldn't clone the {branch} branch from {repo} repo (owner: {owner}) \n Exception {e} raised"
def create_mdx_path_from_metadata(metadata):
"""
    Create a path from the document's metadata.
    REQUIRED KEYS in the metadata input:
    [sidebar_label, learn_rel_path]
    In the returned (final) path we collapse "/", "//", "-", "," into a single dash.
"""
final_file = ' '.join((metadata["sidebar_label"]
.replace("'", " ")
.replace(":", " ")
.replace("/", " ")
.replace(")", " ")
.replace(",", " ")
.replace("(", " ")
.replace("`", " ")).split())
if "Data Collection" in metadata['learn_rel_path']\
and metadata['learn_rel_path'].split("/")[-1] != "Data Collection" and 'External-plugins' not in metadata['learn_rel_path']:
last_folder = metadata['learn_rel_path'].split("Data Collection", 1)[1]
last_folder = "data-collection" + last_folder
# print(last_folder)
# exit()
        # If the file is inside the Data Collection category (Docusaurus renders the sidebar
        # category label from the folder name), return an array of two things:
        # [the final path, the proper slug].
        # We use the slug to avoid having %20 (encoded spaces) in the file's link.
return ["{}/{}/{}.mdx".format(DOCS_PREFIX,
metadata["learn_rel_path"]
.split("Data Collection")[0].lower().replace(" ", "-") + last_folder,
final_file.replace(" ", "-")).replace("//", "/"),
"/{}/{}".format(metadata["learn_rel_path"],
final_file.replace(" ", "-")).lower().replace(" ", "-").replace("//", "/")]
else:
return ("{}/{}/{}.mdx".format(DOCS_PREFIX,
metadata["learn_rel_path"],
final_file.replace(" ", "-")).lower().replace(" ", "-").replace("//", "/"))
def fetch_markdown_from_repo(output_folder):
return glob.glob(
output_folder + '/**/*.md*', recursive=True) + glob.glob(output_folder + '/.**/*.md*', recursive=True)
def insert_and_read_hidden_metadata_from_doc(path_to_file, dictionary):
"""
    Takes the path of a file as input.
    Prepends the hidden metadata block built from the map dictionary, then identifies the area with
    the pattern "<!-- ...multiline string -->" and converts it into a dictionary of key:value pairs.
"""
# TODO work here, predict yaml file from path, should be easy, if readme try os.exists for meta yaml, if inside integrations folder, try one out.
# TODO unique in custom edit url might need custom editurl + sidebar_label so it can be reproduced here.
repo, path = path_to_file.replace("ingest-temp-folder/", "").split('/', 1)
if repo == ".github":
key = "https://github.com/netdata/" + repo + "/edit/main" + "/" + path
else:
key = "https://github.com/netdata/" + repo + "/edit/master" + "/" + path
output = ""
for field in dictionary.loc[dictionary['custom_edit_url'] == key]:
try:
val = dictionary.loc[dictionary['custom_edit_url']
== key][field].values[0]
# print((not val == np.nan), val != val, val)
val = str(val)
if (not val == np.nan) and val != "nan":
if field == "learn_rel_path":
if val == "root":
# print("ROOT")
val = "/"
if "Data Collection" in val or "Data Collection" in val:
output += "toc_max_heading_level: 4\n"
if field == "sidebar_position":
output += "{0}: \"{1}\"\n".format(field,
val.replace("\"", ""))
else:
output += "{0}: \"{1}\"\n".format(field,
val.replace("\"", ""))
except Exception as e:
pass
if len(output) > 0:
output = "<!--\n" + output + "-->\n"
whole_file = Path(path_to_file).read_text()
if whole_file.startswith("<!--"):
body = whole_file.split("-->", 1)[1]
else:
body = whole_file
Path(path_to_file).write_text(output + body)
# print(path_to_file, output)
metadata_dictionary = {}
with open(path_to_file, "r+") as fd:
raw_text = "".join(fd.readlines())
pattern = r"((^<!--|^---)\n)((.|\n)*?)(\n(-->|---))"
match_group = re.search(pattern, raw_text)
# print(match_group)
if match_group:
raw_metadata = match_group[3]
list_metadata = raw_metadata.split("\n")
while list_metadata:
line = list_metadata.pop(0)
split_in_keywords = line.split(": ", 1)
key = split_in_keywords[0]
value = split_in_keywords[1]
if verify_string_is_dictionary(value):
value = unpack_dictionary_string_to_dictionary(value)
# If it's a multiline string
while list_metadata and len(list_metadata[0].split(": ", 1)) <= 1:
line = list_metadata.pop(0)
value = value + line.lstrip(' ')
value = value.strip("\"")
metadata_dictionary[key] = value.lstrip('>-')
# print("\n\n")
return metadata_dictionary
else:
return []
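# Roughly, for a file whose custom_edit_url matches a row in the map dataframe, the function above
# prepends a hidden metadata block such as (illustrative values):
#   <!--
#   sidebar_label: "Example collector"
#   learn_rel_path: "Integrations"
#   -->
# and returns the parsed fields as a dictionary.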
def update_metadata_of_file(path_to_file, dictionary):
"""
Taking a path of a file as input
Identify the area with pattern
"<!-- ...multiline string -->"
and converts them to a dictionary
of key:value pairs
"""
output = ""
for field in dictionary:
val = str(dictionary[field]).replace("\"", "")
output += f"{field}: \"{val}\"\n"
if len(output) > 0:
output = "<!--\n" + output + "-->"
whole_file = Path(path_to_file).read_text()
if whole_file.startswith("<!--"):
body = whole_file.split("-->", 1)[1]
else:
body = whole_file
Path(path_to_file).write_text(output+body)
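# Illustrative example: update_metadata_of_file(path, {"sidebar_label": "Example"}) rewrites the
# top of the file so that it starts with:
#   <!--
#   sidebar_label: "Example"
#   -->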
def read_metadata(meta):
metadata_dictionary = {}
pattern = r"((<!--|---)\n)((.|\n)*?)(\n(-->|---))"
match_group = re.search(pattern, meta)
# If metadata is found
if match_group:
raw_metadata = match_group[3]
list_metadata = raw_metadata.split("\n")
# Split the key: value pairs
while list_metadata:
line = list_metadata.pop(0)
split_in_keywords = line.split(": ", 1)
key = split_in_keywords[0]
value = split_in_keywords[1]
if verify_string_is_dictionary(value):
value = unpack_dictionary_string_to_dictionary(
value)
# If it's a multiline string
while list_metadata and len(list_metadata[0].split(": ", 1)) <= 1:
line = list_metadata.pop(0)
value = value + line.lstrip(' ')
value = value.strip("\"")
metadata_dictionary[key] = value.lstrip('>-')
return metadata_dictionary
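# Illustrative example of read_metadata:
#   read_metadata('<!--\ntitle: "Example"\nlearn_status: "Published"\n-->')
#       -> {'title': 'Example', 'learn_status': 'Published'}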
def sanitize_page(path):
"""
Converts the
"<!--" -> "---"
"-->" -> "---"
It converts only the first occurrences of these patterns
Side effect:
    If the document doesn't have purposeful metadata but contains this pattern in its body, this function
    will still replace these patterns.
"""
body = Path(path).read_text()
# Replace the metadata with comments
body = body.replace("<!--", "---", 1)
body = body.replace("-->", "---", 1)
match_group = re.search(r'meta_yaml: "(.*)"', body)
if match_group:
# If the file has a meta_yaml field, then it is an integration, and we need to put the value into custom_edit_url too
body = re.sub(r"meta_yaml:.*\n",
"",
re.sub(r'custom_edit_url:.*',
f"custom_edit_url: \"{match_group[1]}\"",
body))
# The list with the lines that will be written in the file
output = []
# For each line of the file I read
for line in body.splitlines():
        # If the line isn't an analytics pixel, append it to the output list
if not line.startswith("[![analytics]"):
output.append(line + "\n")
output = "".join(output)
# Try to remove excess newlines from the start of the document
output = re.sub(r'---(\n\s*\n)', '---\n\n', output)
# Try to add a newline to the start of a document that has no newline
if not re.match(r'---(\n\s*\n)', output):
# print(path, "not matching")
output = output.replace("---\n", "---\n\n", 2)
# revert first line
output = output.replace("---\n\n", "---\n", 1)
# Open the file for overwriting, we are going to write the output list in the file
Path(path).write_text(output)
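# In effect, sanitize_page turns a hidden metadata block like
#   <!--
#   title: "Example"
#   -->
# into Docusaurus front matter:
#   ---
#   title: "Example"
#   ---
# and drops analytics-pixel lines from the body.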
def add_new_learn_path_key_to_dict(input_dict, docs_prefix, docs_path_learn, temp_folder):
"""
This function takes as an argument our dictionary of the Ingest process and creates a new dictionary with key-value
pairs of type Source file -> Target file (learn_absolute path)
"""
output_dictionary = dict()
for element in input_dict:
repo = input_dict[element]["ingestedRepo"]
file_path = element.replace(temp_folder+"/"+repo+"/", "")
source_link = produce_gh_view_link_for_repo(repo, file_path)
output_dictionary[source_link] = input_dict[element]["learnPath"]\
.split(".mdx")[0]\
.lstrip('"')\
.rstrip('"')\
.replace(docs_prefix, docs_path_learn)
source_link = produce_gh_edit_link_for_repo(repo, file_path)
output_dictionary[source_link] = input_dict[element]["learnPath"]\
.split(".mdx")[0]\
.lstrip('"')\
.rstrip('"')\
.replace(docs_prefix, docs_path_learn)
# Check for pages that are category overview pages, and have filepath like ".../monitor/monitor".
# This way we remove the double dirname in the end, because docusaurus routes the file to .../monitor
if output_dictionary[source_link].split("/")[len(output_dictionary[source_link].split("/"))-1] == \
output_dictionary[source_link].split("/")[len(output_dictionary[source_link].split("/"))-2]:
same_parent_dir = output_dictionary[source_link].split(
"/")[len(output_dictionary[source_link].split("/"))-2]
proper_link = output_dictionary[source_link].split(
same_parent_dir, 1)
output_dictionary[source_link] = proper_link[0] + \
proper_link[1].strip("/")
_temp = output_dictionary[source_link].replace("'", " ").replace(":", " ").replace(")", " ").replace(
",", " ").replace("(", " ").replace("/ +/g", ' ').replace(" ", "%20").replace('/-+/', '-')
# If there is a slug present in the file, then that is the new_learn_path, with a "/docs" added in the front.
try:
input_dict[element].update(
{"new_learn_path": "/docs"+input_dict[element]["metadata"]["slug"]})
except KeyError:
input_dict[element].update({"new_learn_path": _temp})
return input_dict
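# Roughly, for every ingested file the function above records a "new_learn_path" such as
# (hypothetical paths):
#   "https://github.com/netdata/netdata/blob/master/docs/example.md" -> "/docs/category/example"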
def convert_github_links(path_to_file, input_dict):
"""
Input:
path: The path to the markdown file
input_dict: the dictionary with every info about all files
Expected format of links in files:
[*](https://github.com/netdata/netdata/blob/master/*)
or go.d.plugin or any other Netdata repo
"""
whole_file = Path(path_to_file).read_text()
global UNCORRELATED_LINK_COUNTER
# Split the file into its metadata and body, so that this function doesn't touch the metadata fields
metadata = "---" + whole_file.split("---", 2)[1] + "---"
body = whole_file.split("---", 2)[2]
custom_edit_url_arr = re.findall(r'custom_edit_url(.*)', metadata)
# If there are links inside the body
if re.search(r"\]\((.*?)\)", body):
# Find all the links and add them in an array
urls = []
temp = re.findall(r'\[\n|.*?]\((\n|.*?)\)', body)
# For every link, try to not touch the heading that link points to, as it stays the same after the conversion
for link in temp:
urls.append(link.split('#')[0])
for url in urls:
# The URL will get replaced by the value of the replaceString
try:
# The keys inside fileDict are like "ingest-temp-folder/netdata/collectors/charts.d.plugin/ap/README.md"
# so from the link, we need:
# 1. replace the https link prefix up until our organization identifier with the prefix of the temp folder
# 2. try and catch any mishaps in links that instead of "blob" have "edit"
# 3. remove "blob/master/" or "blob/main/"
# 4. Then we have the correct key for the dictionary
dictionary = input_dict[url.replace("https://github.com/netdata", TEMP_FOLDER).replace(
"edit/", "blob/", 1).replace("blob/master/", "").replace("blob/main/", "")]
replace_string = dictionary["new_learn_path"]
# In some cases, a "id: someId" will be in a file, this is to change a file's link in Docusaurus,
# so we need to be careful to honor that
try:
metadata_id = dictionary["metadata"]["id"]
replace_string = replace_string.replace(
replace_string.split(
"/")[len(replace_string.split("/"))-1],
metadata_id
)
except Exception as e:
# There is no "id" metadata in the file, do nothing
pass
body = body.replace("]("+url, "]("+replace_string)
# In the end replace the URL with the replaceString
except Exception as e:
# This is probably a link that can't be translated to a Learn link (e.g. An external file)
if url.startswith("https://github.com/netdata") and re.search(r"\.md", url):
# Try to rescue an integration link
if "integrations" in url and ("collectors" in url or "modules" in url):
# Due to the integrations/cloud_notifications/integrations/.. scenario, we use rsplit to remove the last occurrence of "integrations"
# We want to map links to specific integrations mds, to their parent README, in case the above try-catch failed to find the replacement.
try_url = url.rsplit("integrations", 1)[
0] + "README.md"
# The URL will get replaced by the value of the replaceString
try:
# The keys inside fileDict are like "ingest-temp-folder/netdata/collectors/charts.d.plugin/ap/README.md"
# , so from the link, we need:
# replace the https link prefix until our organization identifier with the prefix of the temp folder
# try and catch any mishaps in links that instead of "blob" have "edit"
# remove "blob/master/" or "blob/main/"
# Then we have the correct key for the dictionary
dictionary = input_dict[try_url.replace("https://github.com/netdata", TEMP_FOLDER).replace(
"edit", "blob").replace("blob/master/", "").replace("blob/main/", "")]
replace_string = dictionary["new_learn_path"]
# In some cases, a "id: someId" will be in a file, this is to change a file's link in Docusaurus,
# so we need to be careful to honor that
try:
metadata_id = dictionary["metadata"]["id"]
replace_string = replace_string.replace(
replace_string.split(
"/")[len(replace_string.split("/"))-1],
metadata_id
)
except Exception as e:
# There is no "id" metadata in the file, do nothing
pass
# In the end replace the URL with the replaceString
body = body.replace("]("+url, "]("+replace_string)
except:
# Increase the counter of the broken links,
# fetch the custom_edit_url variable for printing and print a message
UNCORRELATED_LINK_COUNTER += 1
if len(custom_edit_url_arr[0]) > 1:
custom_edit_url = custom_edit_url_arr[0].replace(
"\"", "").strip(":")
else:
custom_edit_url = "NO custom_edit_url found, please add one"
print(UNCORRELATED_LINK_COUNTER,
"INFO: In File:",
custom_edit_url,
"\n", "URL:", url, "\n")
else:
# Increase the counter of the broken links,
# fetch the custom_edit_url variable for printing and print a message
UNCORRELATED_LINK_COUNTER += 1
if len(custom_edit_url_arr[0]) > 1:
custom_edit_url = custom_edit_url_arr[0].replace(
"\"", "").strip(":")
else:
custom_edit_url = "NO custom_edit_url found, please add one"
print(UNCORRELATED_LINK_COUNTER,
"INFO: In File:",
custom_edit_url,
"\n", "URL:", url, "\n")
# Construct again the whole file
whole_file = metadata + body
# Write everything onto the file again
Path(path_to_file).write_text(whole_file)
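# Roughly, convert_github_links rewrites body links such as (hypothetical example)
#   [ap](https://github.com/netdata/netdata/blob/master/collectors/charts.d.plugin/ap/README.md)
# into their Learn counterparts, e.g. [ap](/docs/...), while leaving the metadata block untouched.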
def automate_sidebar_position(dictionary):
"""
    This function returns a column for the map dataframe that assigns a certain number to every entry.
    There are 4 different levels:
Level 1 -> 100_000 gap between the top categories
Level 2 -> 2_000 gap between the level two categories
Level 3 -> 40 gap between the level three categories
Level 4 -> categories and documents at this level have no gap
"""
print("### Automating sidebar_position ###", '\n')
position_array = []
# counters
counter_one = 0
counter_two = 0
counter_three = 0
counter_four = 0
# Start from the first entry and keep it as the previous
split = dictionary['learn_rel_path'][0].split("/")
try:
previous_first_level = split[0]
previous_second_level = split[1]
previous_third_level = split[2]
except IndexError:
pass
# For every entry, check for every level of the path whether or not it is different.
# If it is, increment that level's counter by the specified amount.
for path, i in zip(dictionary['learn_rel_path'], range(0, len(dictionary))):
if str(path) != "nan":
split = str(path+f"/{i}").split("/")
# Split the current path
try:
current_first_level = split[0]
current_second_level = split[1]
current_third_level = split[2]
except IndexError:
pass
# This works more or less like a Greek abacus
try:
if current_first_level != previous_first_level:
counter_one += 100000
counter_two = 0
counter_three = 0
counter_four = 0
elif current_second_level != previous_second_level:
counter_two += 2000
counter_three = 0
counter_four = 0
elif current_third_level != previous_third_level:
counter_three += 40
counter_four = 0
else:
counter_four += 1
except UnboundLocalError:
pass
try:
previous_first_level = current_first_level
previous_second_level = current_second_level
previous_third_level = current_third_level
except UnboundLocalError:
pass
position_array.append(
counter_one+counter_two+counter_three+counter_four)
else:
            # If for any reason the path is nan, just add a -1; it is very unlikely that this will happen
position_array.append(-1)
return position_array
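# Roughly: each change in the top-level category adds 100_000 to the running position, a change in
# the second-level category adds 2_000, a change in the third level adds 40, and siblings add 1.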
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Ingest docs from multiple repositories')
parser.add_argument(
'--repos',
default=[],
nargs='+',
        help='Choose specific repos you want to ingest; if not set, the default repos are ingested'
)
parser.add_argument(
"--dry-run",
help="Don't save a file with the output.",
action="store_true",
)
parser.add_argument(
"-d", "--debug",
help="Enable debug printing",
action="store_true",
)
parser.add_argument(
"--docs-prefix",
help="Don't save a file with the output.",
dest="DOCS_PREFIX",
default="docs"
)
parser.add_argument(
"-f", "--fail-on-internal-broken-links",
help="Don't proceed with the process if internal broken links are found.",
action="store_true",
)
list_of_repos_in_str = []
# netdata/netdata:branch tkatsoulas/go.d.plugin:mybranch
args = parser.parse_args()
kArgs = args._get_kwargs()
# Create local copies from the parse_args input
DOCS_PREFIX = args.DOCS_PREFIX
for arg in kArgs:
if arg[0] == "repos":
list_of_repos_in_str = arg[1]
if arg[0] == "dry_run":
DRY_RUN = arg[1]
if arg[0] == "debug" or arg[0] == "debug":
if arg[1]:
DEBUG = True
print("RUNNING WITH DEBUG MESSAGES ON")
if arg[0] == "fail_on_internal_broken_links":
FAIL_ON_NETDATA_BROKEN_LINKS = arg[1]
if len(list_of_repos_in_str) > 0:
for repo_str in list_of_repos_in_str:
try:
_temp = repo_str.split("/")
repo_owner, repository, repo_branch = [
_temp[0]] + (_temp[1].split(":"))
default_repos[repository]["owner"] = repo_owner
default_repos[repository]["branch"] = repo_branch
except (TypeError, ValueError):
print(
"You specified a wrong format in at least one of the repos you want to ingest")
parser.print_usage()
exit(-1)
except KeyError:
print(repository)
print("The repo you specified in not in predefined repos")
print(default_repos.keys())
parser.print_usage()
exit(-1)
except Exception as exc:
print("Unknown error in parsing", exc)
# Clean up old clones into a temp dir
unsafe_cleanup_folders(TEMP_FOLDER)
# Clean up old ingested docs
safe_cleanup_learn_folders(DOCS_PREFIX)
print("Creating a temp directory: ", TEMP_FOLDER)
try:
os.mkdir(TEMP_FOLDER)
except FileExistsError:
print("Folder already exists")
# Clone all the predefined repos
for repo_name in default_repos.keys():
print(clone_repo(default_repos[repo_name]["owner"], repo_name,
default_repos[repo_name]["branch"], 1, TEMP_FOLDER + "/"))
# We fetch the markdown files from the repositories
all_markdown_files = fetch_markdown_from_repo(TEMP_FOLDER)
print("Files detected: ", len(all_markdown_files), "\n")
# Fill the mapDict with the metadata the integration mds have (autogenerated metadata)
mapDict = populate_integrations(all_markdown_files)
# set the index to the unique custom_edit_url column
mapDict.set_index('custom_edit_url').T.to_dict('dict')
# Automate the sidebar position
mapDict['sidebar_position'] = automate_sidebar_position(mapDict)
# Make the column type integer
mapDict['sidebar_position'] = mapDict['sidebar_position'].astype(int)
markdown_files_with_metadata = []
for markdown in all_markdown_files:
# print("File: ", markdown)
md_metadata = insert_and_read_hidden_metadata_from_doc(
markdown, mapDict)
# Check to see if the dictionary returned is empty
if len(md_metadata) > 0:
# print("FOUND METADATA", markdown)
# print(metadata)
markdown_files_with_metadata.append(markdown)
if "learn_status" in md_metadata.keys() and md_metadata["learn_status"] == "Published":
try:
# check the type of the response (for more info of what the response can be check
# the return statements of the function itself)
response = create_mdx_path_from_metadata(md_metadata)
if type(response) != str:
# If the response is not a string then it is a two item array, [final path, slug]
md_metadata.update({"slug": str(response[1])})
to_publish[markdown] = {
"metadata": md_metadata,
"learnPath": str(response[0]),
"ingestedRepo": str(markdown.split("/", 2)[1])
}
md_metadata.update(
{"learn_link": "https://learn.netdata.cloud/docs" + md_metadata['slug']})
else:
to_publish[markdown] = {
"metadata": md_metadata,
"learnPath": str(response),
"ingestedRepo": str(markdown.split("/", 2)[1])
}
# replace first ", " and then " ", this needs to be handled in a prettier way, but other updates in this file are on the way.
if md_metadata['learn_rel_path'] != md_metadata['sidebar_label']:
md_metadata.update({"learn_link": "https://learn.netdata.cloud/docs/" + clean_and_lower_string(
md_metadata['learn_rel_path']) + "/" + clean_and_lower_string(md_metadata['sidebar_label'])})
else:
md_metadata.update(
{"learn_link": "https://learn.netdata.cloud/docs/" + clean_and_lower_string(md_metadata['learn_rel_path'])})
update_metadata_of_file(markdown, md_metadata)
except KeyError as exc:
print(
f"File {markdown} doesn't contain key-value", exc)
else:
# We don't need these files
rest_files_with_metadata_dictionary[markdown] = {
"metadata": md_metadata,
"learnPath": str(f"docs/_archive/_{markdown}"),
"ingestedRepo": str(markdown.split("/", 2)[1])
}
# Don't fail on empty markdown
elif not os.stat(markdown).st_size == 0:
rest_files_dictionary[markdown] = {"tmpPath": markdown}
del md_metadata
# FILE MOVING
print("### Moving files ###\n")
# identify published documents
print(f"### Found Learn files: {len(to_publish)}###\n")
for md_file in to_publish:
copy_doc(md_file, to_publish[md_file]["learnPath"])
sanitize_page(to_publish[md_file]["learnPath"])