# Data Extraction

## Typescript/Javascript dependencies

In [None]:
import subprocess

subprocess.run(
    ['git', 'clone', 'https://github.com/supabase/supabase.git'], 
    cwd='../../../', 
    stdout=subprocess.DEVNULL, 
    stderr=subprocess.DEVNULL
)

In [28]:
import subprocess
import shutil
import os

subprocess.run(
    ['skott', '--displayMode=json'], 
    cwd='../../../supabase', 
    stdout=subprocess.DEVNULL, 
    stderr=subprocess.DEVNULL
)

shutil.copy('../../../supabase/skott.json', './data/skott-supabase.json')

'./data/skott-supabase.json'

In [None]:
# import json
# import copy

# data_path = './data/skott.json'
# # data_path = './data/skott-apps.json'

# with open(data_path, 'r') as f:
#     json_data = json.load(f)

# temp_json_data = copy.deepcopy(json_data)
# for key in temp_json_data:
#     if key.startswith('examples/') or key.startswith('docker') or key.startswith('tests') or key.startswith('playwright-tests') or 'config.' in key:
#         del json_data[key]

# data = {
#     'nodes': [],
#     'links': [],
# }

# for key in json_data:
#     data['nodes'].append({
#         'id': key
#     })

#     for adjacent in json_data[key]['adjacentTo']:
#         data['links'].append({
#             'source': key,
#             'target': adjacent,
#             'waeight': 1,
#         })

# print(f"nodes: {len(data['nodes'])}")
# print(f"links: {len(data['links'])}")


# # output_file = "./data.json"

# # with open(output_file, 'w') as f:
# #     json.dump(data, f)

# # print(f"Data saved as JSON file: {output_file}")

## Test Coverage

In [None]:
import os

def find_directory_paths(top_location, directory_name, excludes=[]):
    directory_paths = set()

    for dirpath, dirnames, filenames in os.walk(top_location):
        for dirname in dirnames:
            if any(exclude in dirpath for exclude in excludes):
                continue

            if directory_name in dirname:
                directory_paths.add(dirpath)

    return directory_paths    


def find_files_in_folders(top_location, filename, excludes=[]):
    file_paths = []

    for dirpath, dirnames, filenames in os.walk(top_location):
        if any(exclude in dirpath for exclude in excludes):
            continue

        if any(filename in f for f in filenames):
            file_paths.append(dirpath)

    return file_paths    

In [None]:
test_folders = find_directory_paths('/workspaces/supabase', 'test', excludes=['node_modules', 'images', 'examples'])
print(test_folders)
print(len(test_folders))

jest_test_files = find_files_in_folders('/workspaces/supabase', 'jest', excludes=['node_modules'])
print(jest_test_files)
print(len(jest_test_files))

In [None]:
import subprocess
import shutil
import os

def run_and_copy_jest_coverage_summary(jest_paths, store_path):
    for jest_path in jest_paths:
        try:
            subprocess.run(
                ['jest', '--coverage', '--coverageReporters=json-summary'], 
                cwd=jest_path, 
                stdout=subprocess.DEVNULL, 
                stderr=subprocess.DEVNULL
            )

            if not os.path.exists(store_path):
                os.makedirs(store_path)

            shutil.copy(f'{jest_path}/coverage/coverage-summary.json', f'{store_path}{"-".join(jest_path.split("/")[2:])}-coverage-summary.json')
        except Exception as e:
            print(e)

run_and_copy_jest_coverage_summary(jest_test_files, './data/code-test-coverage/')

## Code Churn

In [1]:
from pydriller import Repository

def get_all_commits(repository_file_location):
    return list(Repository(repository_file_location).traverse_commits())

Consider the example commits below

```
commit (new file)
    old_path = null
    new_path = path

commit (path changed)
    old_path = path
    new_path = another_path

commit (standard)
    old_path = path
    new_path = path

commit (old existing path)
    old_path = None
    new_path = alredy_seen_path
```

Because of the last case, we have to process the commits in order, and delete an entry if the path changes

In [2]:
import datetime

def file_change_count_list(commits, contains_file_endings=[]):
    file_change_count = {}

    change_count_key = 'change-count'
    last_modified_key = 'last-modified'

    for commit in commits:
        for modification in commit.modified_files:
            if not modification.filename.endswith(tuple(contains_file_endings)):
                continue

            if modification.old_path in file_change_count and not modification.new_path in file_change_count:
                data_to_move = file_change_count[modification.old_path]
                del file_change_count[modification.old_path]

                file_change_count[modification.new_path] = {
                    change_count_key: data_to_move[change_count_key] + 1,
                    last_modified_key: commit.committer_date.strftime('%m/%d/%Y, %H:%M:%S'),
                }
            elif not modification.new_path in file_change_count:
                file_change_count[modification.new_path] = {
                    change_count_key: 1,
                    last_modified_key: commit.committer_date.strftime('%m/%d/%Y, %H:%M:%S')
                }
            else:
                file_change_count[modification.new_path][change_count_key] += 1
                file_change_count[modification.new_path][last_modified_key] = commit.committer_date.strftime('%m/%d/%Y, %H:%M:%S')

    return file_change_count

In [3]:
all_commits = get_all_commits('../../../supabase')
print(len(all_commits))

27836


In [4]:
# Takes ~13 minuets

# file_changes = file_change_count_list(all_commits[:250], contains_file_endings=['ts', 'tsx', 'js', 'jsx'])
file_changes = file_change_count_list(all_commits, contains_file_endings=['ts', 'tsx', 'js', 'jsx'])
print(len(file_changes))

5658


In [5]:
import json

# sorted_file_changes = {k: v for k, v in sorted(file_changes.items(), key=lambda item: item[1], reverse=True)}

with open('./data/code-churn-v3.json', 'w') as file:
    json.dump(file_changes, file)