In [14]:
import re, os, json, pandas as pd
from algoliasearch.search_client import SearchClient

def list_files(filepath, filetype):
    """Recursively collect files below ``filepath`` whose name ends with ``filetype``.

    The suffix comparison is case-insensitive on both sides, so ``".md"``
    also selects ``README.MD``.

    Args:
        filepath: Directory to walk (every sub-directory is visited).
        filetype: File-name suffix to select, e.g. ``".md"``.

    Returns:
        A list of paths (directory joined with file name) in ``os.walk`` order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(filepath)
        for name in names
        if name.lower().endswith(filetype.lower())
    ]


def search_item(word, header):
    """Return the value of the front-matter key ``word`` found in ``header``.

    The first line of the form ``<word>: <value>`` wins. Whitespace around the
    value (including a trailing ``\\r`` from CRLF files) is trimmed, then
    surrounding double quotes and then single quotes are stripped.

    Args:
        word: Front-matter key to look up, e.g. ``"title"``. Treated as a
            literal string, not as a regular expression.
        header: Iterable of front-matter lines (strings).

    Returns:
        The unquoted value as a string.

    Raises:
        IndexError: If no line in ``header`` defines ``word``.
    """
    # Escape the key and require the ':' separator right after it, so that
    # "title" does not also match a line such as "titles: ..." and the value
    # is taken from after the colon rather than from a fixed character offset.
    pattern = re.compile(re.escape(word) + r"\s*:(.*)")
    for line in header:
        found = pattern.match(line)
        if found:
            value = found.group(1).strip()
            return value.strip('"').strip("'")
    # Same exception type the original indexing of an empty list produced.
    raise IndexError("front-matter key {!r} not found".format(word))

def structure_markdown(df, path):
    """Turn the lines of one markdown file into a flat record (dict).

    Args:
        df: List of the file's lines. Line 0 is skipped without being
            checked (it is assumed to be the opening ``---``); the lines up
            to the next ``'---'`` are treated as the front-matter header and
            everything after that delimiter as the body.
        path: Value stored under ``"objectID"`` in the returned record.

    Returns:
        A dict with the keys ``objectID``, ``title``, ``description``,
        ``keywords`` (looked up in the header via ``search_item``), ``code``
        (contents of ``{{% codeblock %}}`` sections), ``headers`` (text of
        lines containing a level 2-6 markdown heading) and ``content`` (the
        remaining body lines joined by spaces, with code blocks removed).

    Raises:
        ValueError: If there is no ``'---'`` line after line 0.
        Any exception raised by ``search_item`` for a missing header key.
    """
    # split markdown in header and body
    max_index = df[1:].index('---') + 1
    header = df[1:max_index]
    body_list = df[max_index + 1:]
    body = " ".join(body_list)

    # define regular expression patterns (raw strings: "\s" in a normal string
    # literal is an invalid escape sequence and triggers a warning)
    code_block = re.compile(r"{{% codeblock %}}(.+?){{% /codeblock %}}")
    headers = re.compile(r"#{2,6}\s(.+)")

    # separate headers from the remaining lines in a single pass, so every
    # line is matched against the heading pattern only once
    header_titles = []
    text_lines = []
    for line in body_list:
        found = headers.search(line)
        if found:
            header_titles.append(found.group(1))
        else:
            text_lines.append(line)

    return {
        "objectID": path,
        "title": search_item("title", header),
        "description": search_item("description", header),
        "keywords": search_item("keywords", header),
        # code blocks are located in the space-joined body, so a block may
        # span several original lines
        "code": code_block.findall(body),
        "headers": header_titles,
        "content": code_block.sub("", " ".join(text_lines))
    }

def export_data(file_paths):
    """Parse every file in ``file_paths`` into a record via ``structure_markdown``.

    Files whose content cannot be structured (``structure_markdown`` raises)
    are skipped and their path is printed, so the export is best-effort.

    Args:
        file_paths: Iterable of paths to markdown files.

    Returns:
        A list with one record (dict) per successfully parsed file, in input
        order.
    """
    json_data = []

    for path in file_paths:
        # The context manager closes the handle once the text is read; the
        # explicit encoding keeps the result independent of the platform's
        # default locale.
        with open(path, 'r', encoding='utf-8') as f:
            df = f.read().split('\n')
        try:
            json_data.append(structure_markdown(df, path))
        except Exception:
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit still stop the run.
            print(path) # skipped files
    return json_data

    
# generate list of all markdown files
# list_files walks the current working directory recursively; export_data
# turns each file into a record and prints the paths it had to skip.
file_paths = list_files(".", ".md")
json_data = export_data(file_paths)

# The API key is read from the environment instead of being stored in the
# notebook; this raises KeyError when ALGOLIA_KEY is not set.
algolia_key = os.environ['ALGOLIA_KEY'] 
# NOTE(review): '02IYLG4AP9' is presumably the Algolia application ID and
# 'actors_index_db' the target index name — confirm this is the intended index.
client = SearchClient.create('02IYLG4AP9', algolia_key)
index = client.init_index('actors_index_db')

# Upload the parsed records; each record's "objectID" is the file path
# (set in structure_markdown).
index.save_objects(json_data)

./disclaimer.md
./examples/_index.md
./tutorials/_indexs.md
./tutorials/educational-support/_index.md
./tutorials/more-tutorials/_index.md
./tutorials/more-tutorials/contribute-to-tilburg-science-hub/building-block-shell.md
./tutorials/more-tutorials/contribute-to-tilburg-science-hub/tutorial-shell.md
./tutorials/more-tutorials/airbnb-workflow/conclusion.md
./tutorials/more-tutorials/airbnb-workflow/exporting_data.md
./tutorials/more-tutorials/airbnb-workflow/transforming_data.md
./tutorials/more-tutorials/airbnb-workflow/loading_data.md
./tutorials/more-tutorials/airbnb-workflow/automating_workflows.md
./tutorials/project-management/_index.md
./tutorials/reproducible-research/_index.md
./tutorials/reproducible-research/practicing-pipeline-automation-make/pipeline-make.md
./tutorials/reproducible-research/practicing-pipeline-automation-make/clone.md
./tutorials/reproducible-research/practicing-pipeline-automation-make/finish.md
./tutorials/reproducible-research/practicing-pipeline-auto

<algoliasearch.responses.IndexingResponse at 0x7fd2751ad6d0>

In [12]:
json_data

[{'file': './examples/reproducible-workflow-airbnb.md',
  'title': 'A Reproducible Research Workflow with AirBnB Data',
  'description': 'A platform-independent, reproducible research workflow with AirBnB data, using Stata, Python and R.',
  'keywords': 'tisem, airbnb, template, workflow, example',
  'code': ['  [js-link](code.js)   ```js // some js code var name = "Arvind Singh";  if (name == "arvind") { \tconsole.log("testing out coding blck"); } ```  ```bash # some bash code # make-files.txt touch test/john.txt touch test/mike.txt touch test/jenna.txt ```  '],
  'headers': ['Overview',
   'How to run it',
   'Dependencies',
   'Run it',
   'Directory structure'],
  'content': '  Using publicly available data from AirBnB (available via [Kaggle.com](https://www.kaggle.com/airbnb/boston)), we illustrate how a reproducible workflow may look like in practice.  {{% cta-primary-center "Check out the GitHub Repository" "https://github.com/tilburgsciencehub/airbnb-workflow" %}}  We\'ve craft

[{'file': './examples/reproducible-workflow-airbnb.md',
  'title': 'A Reproducible Research Workflow with AirBnB Data',
  'description': 'A platform-independent, reproducible research workflow with AirBnB data, using Stata, Python and R.',
  'keywords': 'tisem, airbnb, template, workflow, example',
  'code': ['  [js-link](code.js)   ```js // some js code var name = "Arvind Singh";  if (name == "arvind") { \tconsole.log("testing out coding blck"); } ```  ```bash # some bash code # make-files.txt touch test/john.txt touch test/mike.txt touch test/jenna.txt ```  '],
  'headers': ['Overview',
   'How to run it',
   'Dependencies',
   'Run it',
   'Directory structure'],
  'content': '  Using publicly available data from AirBnB (available via [Kaggle.com](https://www.kaggle.com/airbnb/boston)), we illustrate how a reproducible workflow may look like in practice.  {{% cta-primary-center "Check out the GitHub Repository" "https://github.com/tilburgsciencehub/airbnb-workflow" %}}  We\'ve craft

In [15]:




# Scratch cell: pushes two hand-written test records to the index.
# NOTE(review): `actors` is not used anywhere in this cell — TODO confirm it can be dropped.
actors = [{"name": "Robin Williams", "rating": 1200}]
#


# NOTE(review): both records carry the same objectID; presumably the second
# replaces the first in the index, leaving a single record — confirm.
index.save_objects([
    {'name': 'Robin Williams', 'rating': '1200', 'objectID': '6844965001'},
    {'name': 'Roy Williams', 'rating': '1200', 'objectID': '6844965001'},
])

<algoliasearch.responses.IndexingResponse at 0x7fcd8b8b8430>

In [19]:
# Clean-up: delete the record stored under objectID "6844965001", the ID used
# by the hand-written test records saved earlier in this notebook.
index.delete_objects(["6844965001"])


<algoliasearch.responses.IndexingResponse at 0x7fcd8b8a5c70>