In [2]:
import json
import os
from pathlib import Path
from typing import Any, Dict, List, Tuple


In [3]:
def filter_corrupt_files(paths: List[Path]) -> Tuple[Dict[str, Any], List[str]]:
    """
    Parse every file in ``paths`` as JSON and set aside the ones that fail.

    Args:
        paths: Files to read. Each one is opened as UTF-8 text and handed
            to ``json.load``.

    Returns:
        A ``(data, corrupt_files)`` pair:
        ``data`` maps the name of each successfully parsed file
        (``path.name``) to its decoded JSON content;
        ``corrupt_files`` lists the names of the files whose content is not
        valid JSON or cannot be decoded as UTF-8.

    Note:
        Results are keyed by file name only, so two paths that share a name
        overwrite each other in ``data``. Other I/O errors (missing file,
        directory, permissions) are not caught and propagate to the caller.
    """
    data = {}
    corrupt_files = []
    for path in paths:
        try:
            # Read as UTF-8 explicitly instead of relying on the platform's
            # default encoding, which differs between machines.
            with open(path, "r", encoding="utf-8") as f:
                data[path.name] = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Undecodable bytes (e.g. a binary or truncated file) are just
            # another form of corruption; report them rather than aborting
            # the whole scan.
            corrupt_files.append(path.name)
    return data, corrupt_files


In [4]:
# Load the JSON result files stored under ../data/zhihan/, keeping the
# parsed entries (data_z) apart from the names of unparseable files (corrupt_z).

paths = "../data/zhihan/"

data_z, corrupt_z = filter_corrupt_files(
    [Path(paths, entry) for entry in os.listdir(paths)]
)



In [5]:
# Names of the files under ../data/zhihan/ that failed to parse as JSON
corrupt_z

[]

In [6]:
# Number of files under ../data/zhihan/ that were parsed successfully
len(data_z)

1759

# Dorran's data


This data set represents the list of Python projects (or projects in the Python ecosystem) that were downloaded in 2023. It was selected with the following query:

```sql
SELECT DISTINCT projects.name, REPLACE(projects.url, 'Source, ', '') as trimmed_url, counts.annual_downloads, meta.keywords
-- Get source URL(s) for project
FROM (
  SELECT name, url
  FROM `bigquery-public-data.pypi.distribution_metadata` 
  CROSS JOIN UNNEST(project_urls) AS url
  WHERE url LIKE 'Source, %'
) as projects
-- Add in download count from 2023
LEFT JOIN (
  SELECT project, COUNT(project) as annual_downloads
  FROM `bigquery-public-data.pypi.file_downloads` 
  WHERE TIMESTAMP_TRUNC(timestamp, DAY) >= TIMESTAMP("2023-01-01") AND TIMESTAMP_TRUNC(timestamp, DAY) < TIMESTAMP("2024-01-01") 
  GROUP BY project
) AS counts
ON projects.name = counts.project
-- Add in keywords from most recent upload
LEFT JOIN (
  SELECT DISTINCT name, keywords
  FROM (
    SELECT name, keywords, ROW_NUMBER() OVER (PARTITION BY name ORDER BY upload_time DESC) as rn
    FROM `bigquery-public-data.pypi.distribution_metadata`
  ) WHERE rn = 1
) meta
ON projects.name = meta.name
```

In [7]:
# Load the JSON result files stored under ../data/dorran/, keeping the
# parsed entries (data_d) apart from the names of unparseable files (corrupt_d).

paths = "../data/dorran/"

data_d, corrupt_d = filter_corrupt_files(
    [Path(paths, entry) for entry in os.listdir(paths)]
)


In [8]:
# Number of files under ../data/dorran/ that failed to parse as JSON
len(corrupt_d)

139

In [9]:
# Number of files under ../data/dorran/ that were parsed successfully
len(data_d)

911

In [10]:
# Example JSON entries: pretty-print the first five parsed results in data_z

for sample in list(data_z.values())[:5]:
    print(json.dumps(sample, indent=4))

{
    "project_name": "SSUsearch",
    "code_dirs": {
        "scripts": {
            "py": 29,
            "ipynb": 0,
            "total": 29
        },
        "notebooks-pc-linux": {
            "py": 0,
            "ipynb": 12,
            "total": 12
        },
        "notebooks": {
            "py": 0,
            "ipynb": 12,
            "total": 12
        }
    },
    "deps_file": [],
    "imports": [],
    "fawltydeps_version": "0.13.1"
}
{
    "project_name": "markerGeneProfile",
    "code_dirs": null,
    "deps_file": [],
    "imports": [],
    "fawltydeps_version": "0.13.1"
}
{
    "project_name": "evangelist",
    "code_dirs": {
        "dna_puller": {
            "py": 4,
            "ipynb": 0,
            "total": 4
        },
        "gnuplot_generator": {
            "py": 2,
            "ipynb": 0,
            "total": 2
        }
    },
    "deps_file": [
        {
            "source_type": "DepsSource",
            "path": "requirements.txt",
            "pars

In [11]:
# Example JSON entries: pretty-print the first five parsed results in data_d
for sample in list(data_d.values())[:5]:
    print(json.dumps(sample, indent=4))

{
    "project_name": "plyara",
    "code_dirs": {
        "/": {
            "py": 9,
            "ipynb": 0,
            "total": 9
        }
    },
    "deps_file": [
        {
            "source_type": "DepsSource",
            "path": "/tmp/tmp3say7qnf/checkout/requirements-dev.txt",
            "parser_choice": "requirements.txt",
            "deps_count": 5,
        },
        {
            "source_type": "DepsSource",
            "path": "/tmp/tmp3say7qnf/checkout/setup.cfg",
            "parser_choice": "setup.cfg",
            "deps_count": 0,
        },
        {
            "source_type": "DepsSource",
            "path": "/tmp/tmp3say7qnf/checkout/setup.py",
            "parser_choice": "setup.py",
            "deps_count": 1,
        }
    ],
    "imports": [],
    "fawltydeps_version": "0.13.1"
}
{
    "project_name": "demoji",
    "code_dirs": {
        "/": {
            "py": 6,
            "ipynb": 0,
            "total": 6
        }
    },
    "deps_file": [
      

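**TODO:** Change the URL condition in Dorran's query to `(url LIKE 'Source, %' OR url LIKE 'Repository, %')`. Note that the query's `REPLACE(projects.url, 'Source, ', '')` only strips the `'Source, '` prefix, so the `'Repository, '` prefix will need to be trimmed as well.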