In [4]:
import os
import traceback
import yaml
from pathlib import Path
import pandas as pd

# Number of paths matched by the serverless YAML glob in process_project
# (counted before the file is opened or parsed).
total_count = 0
# Tally of matched files per provider, keyed by the value parse_provider
# returns for each successfully loaded config (which may be None).
provider_count = {}

def parse_provider(config):
    """Return the provider name declared in a parsed config, if any.

    Looks up ``config['provider']`` and, when that entry is a dict that
    contains a ``'name'`` key, returns the value stored under it. In every
    other case (no ``'provider'`` key, a non-dict provider entry, or a
    provider dict without ``'name'``) the result is ``None``.
    """
    if 'provider' in config:
        section = config['provider']
        if isinstance(section, dict) and 'name' in section:
            return section['name']

    return None



def process_project(project_path):
    """Tally the provider name of every serverless YAML file in a project.

    Recursively searches ``project_path`` for paths matching
    ``*serverless*.y*ml``. Every match increments the module-level
    ``total_count``. Each match that loads as a YAML mapping additionally
    increments ``provider_count`` under the value returned by
    ``parse_provider`` (which may be ``None``).

    Matches that fail with a YAML constructor, scanner, composer or parser
    error are skipped silently. Any other failure, including being unable
    to open or read the matched path, prints a traceback and the scan moves
    on to the next match.

    Args:
        project_path: Root directory of the project to scan.
    """
    # Only total_count is rebound here; provider_count is mutated in place.
    global total_count

    for path in Path(project_path).rglob('*serverless*.y*ml'):
        total_count += 1

        try:
            # The open() lives inside the try so that a match which cannot be
            # read (a directory matching the glob, a dangling symlink, missing
            # permissions) is reported and skipped instead of aborting the
            # whole scan. Binary mode leaves encoding detection to PyYAML
            # rather than depending on the locale's default text encoding.
            with open(path, 'rb') as file:
                config = yaml.safe_load(file)

            if not isinstance(config, dict):
                continue

            provider = parse_provider(config)
            # An unhashable provider value raises TypeError here, which is
            # reported by the generic handler below.
            provider_count[provider] = provider_count.get(provider, 0) + 1
        except (
            yaml.constructor.ConstructorError,
            yaml.scanner.ScannerError,
            yaml.composer.ComposerError,
            yaml.parser.ParserError,
        ):
            # Malformed or unsupported YAML is expected in the dataset.
            continue
        except Exception:
            # Best-effort scan: report anything unexpected and keep going.
            print(traceback.format_exc())
            continue


if __name__ == '__main__':
    # First-level directories under the dataset root are treated as users,
    # and the directories directly inside each of those as projects.
    users = [entry.path for entry in os.scandir('/data/serverless-dataset') if entry.is_dir()]

    projects = []
    for user in users:
        projects.extend([entry.path for entry in os.scandir(user) if entry.is_dir()])

    print('{} Projects found.'.format(len(projects)))

    permissions = {}

    # Each call updates the module-level total_count / provider_count.
    for project in projects:
        process_project(project)

    print('{} candidate ymls found.'.format(total_count))
    print(provider_count)

1064 candidate ymls found.
{'aws': 1062, 'None': 1, 'cloud_provider': 1}
